src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995, 1997, 1998, 2002 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
 170   DST_BYTES zero means that the source area and destination area are
 171   overlapped, which means that we can produce encoded text until it
 172   reaches the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
 188 /* ONE_MORE_BYTE and TWO_MORE_BYTES safely fetch one and two bytes,
 189    respectively, from the source text.  If the source is too short,
 190    they set coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump
 191    to `label_end_of_loop' without consuming anything.  The caller
 192    should set `coding', `src' and `src_end' in advance.  Being called
 193    from `decode_coding_XXX', they assume the source text is unibyte.  */
 194
 195 #define ONE_MORE_BYTE(c1)                                       \
 196   do {                                                          \
 197     if (src < src_end)                                          \
 198       c1 = *src++;                                              \
 199     else                                                        \
 200       {                                                         \
 201         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 202         goto label_end_of_loop;                                 \
 203       }                                                         \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src_end - src < 2)                                      \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = src[0], c2 = src[1];                                   \
 214     src += 2;                                                   \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 219    form if MULTIBYTEP is nonzero: a byte LEADING_CODE_8_BIT_CONTROL is
        then fetched together with the byte following it, and C1 is set to
        that following byte minus 0x20.  If the leading code is the last
        byte of the source, the source is treated as insufficient (`src' is
        put back on the leading code and C1 is left holding it) instead of
        reading the byte at SRC_END.  */
 220
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
           {                                                         \
             if (src >= src_end)                                     \
               {                                                     \
                 src--;                                              \
                 coding->result = CODING_FINISH_INSUFFICIENT_SRC;    \
                 goto label_end_of_loop;                             \
               }                                                     \
 230         c1 = *src++ - 0x20;                                     \
           }                                                         \
 231   } while (0)
 232
 233 /* Set C to the next character of the source text pointed to by `src'
 234    and advance `src' past it.  If no bytes remain in the source, set
 235    coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
 236    `label_end_of_loop'.  The caller should set the variables `coding',
 237    `src', `src_end', and `translation_table' in advance; when
 238    `translation_table' is non-nil, C is translated through it with
 239    translate_char.  This macro is used in encoding routines
 240    `encode_coding_XXX', thus it assumes that the source text is in
 241    multibyte form except for 8-bit characters.  8-bit characters are
        in multibyte form if coding->src_multibyte is nonzero, else they
        are represented by a single byte.  The expansion declares the
        block-local variables `len' and `bytes', so C must not be an
        expression that uses those names.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;                                      \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  If there's not
 264    enough space at `dst' (the limit is `dst_end', or `src' when
 265    DST_BYTES is zero), set coding->result to
 266    CODING_FINISH_INSUFFICIENT_DST and jump to `label_end_of_loop'.
 267
 268    C is written to `dst', and coding->produced_char incremented, when
 269    we are not composing or when the composition method is
 270    COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE.  When we are
 271    composing with any method other than COMPOSITION_RELATIVE, C is
        also recorded in coding->cmp_data->data as a composition component
        (for COMPOSITION_WITH_ALTCHARS it goes only there, not to `dst'),
        and coding->composition_rule_follows is set to nonzero unless the
        method is COMPOSITION_WITH_ALTCHARS.

        This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {                                                          \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {                                                          \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)                                           \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348
 349 #else  /* not emacs */
 350
 351 #include "mulelib.h"
 352
 353 #endif /* not emacs */
 354
     /* Lisp symbols used in this file.  Qcoding_system names the symbol
        property from which coding_safe_chars fetches a coding system's
        spec, and Qsafe_chars is the key it looks up in that spec's
        property list.  NOTE(review): the other symbols are not referenced
        in this part of the file; presumably each Qfoo_bar is interned as
        `foo-bar' at initialization -- confirm.  */
 355 Lisp_Object Qcoding_system, Qeol_type;
 356 Lisp_Object Qbuffer_file_coding_system;
 357 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 358 Lisp_Object Qno_conversion, Qundecided;
 359 Lisp_Object Qcoding_system_history;
 360 Lisp_Object Qsafe_chars;
 361 Lisp_Object Qvalid_codes;
 362
     /* Qinsert_file_contents and Qwrite_region are only declared here;
        they are defined in another file.  */
 363 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 364 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 365 Lisp_Object Qstart_process, Qopen_network_stream;
 366 Lisp_Object Qtarget_idx;
 367
 368 Lisp_Object Vselect_safe_coding_system_function;
 369
     /* NOTE(review): the purpose of this flag is not evident from this
        part of the file -- document it where it is set and read.  */
 370 int coding_system_require_warning;
 371
 372 /* Mnemonic strings for each format of end-of-line.  */
 373 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 374 /* Mnemonic string to indicate that the format of end-of-line is not
 375    yet decided.  */
 376 Lisp_Object eol_mnemonic_undecided;
 377
 378 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 379    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 380 int system_eol_type;
 381
 382 #ifdef emacs
 383
 384 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 385
 386 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 387
 388 /* The coding systems emacs-mule and raw-text convert only the
 389    end-of-line format.  */
 390 Lisp_Object Qemacs_mule, Qraw_text;
 391
 392 /* Coding systems are passed between Emacs Lisp programs and C internal
 393    routines via the following three variables.  */
 394 /* Coding-system for reading files and receiving data from process.  */
 395 Lisp_Object Vcoding_system_for_read;
 396 /* Coding-system for writing files and sending data to process.  */
 397 Lisp_Object Vcoding_system_for_write;
 398 /* Coding-system actually used in the latest I/O.  */
 399 Lisp_Object Vlast_coding_system_used;
 400
 401 /* A vector of length 256 which contains information about special
 402    Latin codes (especially for dealing with Microsoft codes).  */
 403 Lisp_Object Vlatin_extra_code_table;
 404
 405 /* Flag to inhibit code conversion of end-of-line format.  */
 406 int inhibit_eol_conversion;
 407
 408 /* Flag to inhibit ISO2022 escape sequence detection.  */
 409 int inhibit_iso_escape_detection;
 410
 411 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 412 int inherit_process_coding_system;
 413
 414 /* Coding system to be used to encode text for terminal display.  */
 415 struct coding_system terminal_coding;
 416
 417 /* Coding system to be used to encode text for terminal display when
 418    terminal coding system is nil.  */
 419 struct coding_system safe_terminal_coding;
 420
 421 /* Coding system of what is sent from terminal keyboard.  */
 422 struct coding_system keyboard_coding;
 423
 424 /* Default coding system to be used to write a file.  */
 425 struct coding_system default_buffer_file_coding;
 426
     /* NOTE(review): judging by the names, alists selecting coding systems
        for file, process and network I/O; their element format is not
        visible in this part of the file -- confirm.  */
 427 Lisp_Object Vfile_coding_system_alist;
 428 Lisp_Object Vprocess_coding_system_alist;
 429 Lisp_Object Vnetwork_coding_system_alist;
 430
 431 Lisp_Object Vlocale_coding_system;
 432
 433 #endif /* emacs */
 434
 435 Lisp_Object Qcoding_category, Qcoding_category_index;
 436
 437 /* List of symbols `coding-category-xxx' ordered by priority.  */
 438 Lisp_Object Vcoding_category_list;
 439
 440 /* Table of coding categories (Lisp symbols).  */
 441 Lisp_Object Vcoding_category_table;
 442
 443 /* Table of the symbol name of each coding category.  NOTE(review):
        presumably the entries must stay in the order of the category
        indices that CODING_CATEGORY_IDX_MAX bounds -- confirm against
        their definition, which is outside this file.  */
 444 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 445   "coding-category-emacs-mule",
 446   "coding-category-sjis",
 447   "coding-category-iso-7",
 448   "coding-category-iso-7-tight",
 449   "coding-category-iso-8-1",
 450   "coding-category-iso-8-2",
 451   "coding-category-iso-7-else",
 452   "coding-category-iso-8-else",
 453   "coding-category-ccl",
 454   "coding-category-big5",
 455   "coding-category-utf-8",
 456   "coding-category-utf-16-be",
 457   "coding-category-utf-16-le",
 458   "coding-category-raw-text",
 459   "coding-category-binary"
 460 };
 461
 462 /* Table of pointers to the coding system corresponding to each coding
 463    category.  */
 464 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 465
 466 /* Table of coding category masks.  The Nth element is the mask of the
 467    coding category whose priority is Nth.  */
 468 static
 469 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 470
 471 /* Flag to tell if we look up translation table on character code
 472    conversion.  */
 473 Lisp_Object Venable_character_translation;
 474 /* Standard translation table to look up on decoding (reading).  */
 475 Lisp_Object Vstandard_translation_table_for_decode;
 476 /* Standard translation table to look up on encoding (writing).  */
 477 Lisp_Object Vstandard_translation_table_for_encode;
 478
 479 Lisp_Object Qtranslation_table;
 480 Lisp_Object Qtranslation_table_id;
 481 Lisp_Object Qtranslation_table_for_decode;
 482 Lisp_Object Qtranslation_table_for_encode;
 483
 484 /* Alist of charsets vs revision number.  */
 485 Lisp_Object Vcharset_revision_alist;
 486
 487 /* Default coding systems used for process I/O.  */
 488 Lisp_Object Vdefault_process_coding_system;
 489
 490 /* Char table for translating Quail and self-inserting input.  */
 491 Lisp_Object Vtranslation_table_for_input;
 492
 493 /* Global flag to tell that we can't call post-read-conversion and
 494    pre-write-conversion functions.  Usually the value is zero, but it
 495    is set to 1 temporarily while such functions are running.  This is
 496    to avoid infinitely recursive calls.  */
 497 static int inhibit_pre_post_conversion;
 498
 499 /* Char-table containing safe coding systems of each character.  */
 500 Lisp_Object Vchar_coding_system_table;
 501 Lisp_Object Qchar_coding_system;
 502
 503 /* Return the `safe-chars' property of coding system CODING, or Qt if
 504    that property is not a char-table (CODING_SAFE_CHAR_P treats Qt as
        "every character is safe").  The property is looked up in the
        property list held in element 3 of the coding spec vector, which
        is the Qcoding_system property of CODING->symbol.  Don't check
        validity of CODING.  */
 505
 506 Lisp_Object
 507 coding_safe_chars (coding)
 508      struct coding_system *coding;
 509 {
 510   Lisp_Object coding_spec, plist, safe_chars;
 511
 512   coding_spec = Fget (coding->symbol, Qcoding_system);
 513   plist = XVECTOR (coding_spec)->contents[3];
 514   safe_chars = Fplist_get (plist, Qsafe_chars);
 515   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 516 }
 517
 518 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 519   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 520
 521 \f
 522 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 523
 524 /* Emacs' internal format for representation of multiple character
 525    sets is a kind of multi-byte encoding, i.e. characters are
 526    represented by variable-length sequences of one-byte codes.
 527
 528    ASCII characters and control characters (e.g. `tab', `newline') are
 529    represented by one-byte sequences which are their ASCII codes, in
 530    the range 0x00 through 0x7F.
 531
 532    8-bit characters of the range 0x80..0x9F are represented by
 533    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 534    code + 0x20).
 535
 536    8-bit characters of the range 0xA0..0xFF are represented by
 537    one-byte sequences which are their 8-bit code.
 538
 539    The other characters are represented by a sequence of `base
 540    leading-code', optional `extended leading-code', and one or two
 541    `position-code's.  The length of the sequence is determined by the
 542    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 543    whereas extended leading-code and position-code take the range 0xA0
 544    through 0xFF.  See `charset.h' for more details about leading-code
 545    and position-code.
 546
 547    --- CODE RANGE of Emacs' internal format ---
 548    character set        range
 549    -------------        -----
 550    ascii                0x00..0x7F
 551    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 552    eight-bit-graphic    0xA0..0xFF
 553    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 554    ---------------------------------------------
 555
 556    As this is the internal character representation, the format is
 557    usually not used externally (i.e. in a file or in a data sent to a
 558    process).  But, it is possible to have a text externally in this
 559    format (i.e. by encoding by the coding system `emacs-mule').
 560
 561    In that case, a sequence of one-byte codes has a slightly different
 562    form.
 563
 564    Firstly, all characters in eight-bit-control are represented by
 565    one-byte sequences which are their 8-bit code.
 566
 567    Next, character composition data are represented by the byte
 568    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 569    where,
 570         METHOD is 0xF0 plus one of composition method (enum
 571         composition_method),
 572
 573         BYTES is 0xA0 plus the byte length of these composition data,
 574
 575         CHARS is 0xA0 plus the number of characters composed by these
 576         data,
 577
 578         COMPONENTs are characters of multibyte form or composition
 579         rules encoded by two-byte of ASCII codes.
 580
 581    In addition, for backward compatibility, the following formats are
 582    also recognized as composition data on decoding.
 583
 584    0x80 MSEQ ...
 585    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 586
 587    Here,
 588         MSEQ is a multibyte form but in this special format:
 589           ASCII: 0xA0 ASCII_CODE+0x80,
 590           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 591         RULE is a one byte code of the range 0xA0..0xF0 that
 592         represents a composition rule.
 593   */
 594
     /* Table of 256 code classes.  NOTE(review): presumably indexed by
        byte value to classify each byte of emacs-mule text; it is neither
        filled nor read in this part of the file -- confirm.  */
 595 enum emacs_code_class_type emacs_code_class[256];
 596
 597 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 598    Check if the text between SRC and SRC_END is encoded in Emacs'
 599    internal format.  If it is, return CODING_CATEGORY_MASK_EMACS_MULE,
        else return 0.

        The text is rejected (0 is returned) as soon as it contains
        ISO_CODE_ESC, ISO_CODE_SI or ISO_CODE_SO, or a code in the range
        0x81..0x9F for which UNIBYTE_STR_AS_MULTIBYTE_P fails.  Reaching
        the end of the source without such a finding accepts the text.  */
 600
 601 static int
 602 detect_coding_emacs_mule (src, src_end, multibytep)
 603       unsigned char *src, *src_end;
 604       int multibytep;
 605 {
 606   unsigned char c;
 607   int composing = 0;
 608   /* Dummy so that ONE_MORE_BYTE_CHECK_MULTIBYTE has a `coding' whose
          `result' member it can store into.  */
 609   struct coding_system dummy_coding;
 610   struct coding_system *coding = &dummy_coding;
 611
 612   while (1)
 613     {
          /* Jumps to label_end_of_loop when the source is exhausted.  */
 614       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 615
          /* Inside an old-style composition sequence, undo the special
             encoding of a component before classifying C below.  */
 616       if (composing)
 617         {
 618           if (c < 0xA0)
                /* Not a component byte: the sequence has ended.  */
 619             composing = 0;
 620           else if (c == 0xA0)
 621             {
                  /* 0xA0 introduces an ASCII component, stored as
                     ASCII_CODE+0x80.  */
 622               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 623               c &= 0x7F;
 624             }
 625           else
                /* Other components are stored with 0x20 added to their
                   leading code.  */
 626             c -= 0x20;
 627         }
 628
 629       if (c < 0x20)
 630         {
 631           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 632             return 0;
 633         }
 634       else if (c >= 0x80 && c < 0xA0)
 635         {
 636           if (c == 0x80)
 637             /* Old leading code for a composite character.  */
 638             composing = 1;
 639           else
 640             {
                  /* Validate the sequence starting at the byte just before
                     SRC, then continue after it.  BYTES is set by
                     UNIBYTE_STR_AS_MULTIBYTE_P -- presumably to the
                     length of that sequence.  */
 641               unsigned char *src_base = src - 1;
 642               int bytes;
 643
 644               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 645                                                bytes))
 646                 return 0;
 647               src = src_base + bytes;
 648             }
 649         }
 650     }
 651  label_end_of_loop:
 652   return CODING_CATEGORY_MASK_EMACS_MULE;
 653 }
 654
 655
 656 /* Record the starting position START and METHOD of one composition.
        A four-element record is appended to CODING->cmp_data->data and its
        index is remembered in CODING->cmp_data_start:
          [0] -1 for now (CODING_ADD_COMPOSITION_END stores the record
              length here),
          [1] CODING->cmp_data->char_offset + START,
          [2] left unset here (CODING_ADD_COMPOSITION_END stores the end
              position here),
          [3] METHOD, converted to int.
        The expansion declares the block-local variables `cmp_data' and
        `data', so START and METHOD must not be expressions that use
        variables of those names.  */
 657
 658 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 659   do {                                                          \
 660     struct composition_data *cmp_data = coding->cmp_data;       \
 661     int *data = cmp_data->data + cmp_data->used;                \
 662     coding->cmp_data_start = cmp_data->used;                    \
 663     data[0] = -1;                                               \
 664     data[1] = cmp_data->char_offset + start;                    \
 665     data[3] = (int) method;                                     \
 666     cmp_data->used += 4;                                        \
 667   } while (0)
 668
 669 /* Record the ending position END of the current composition.  In the
        record that begins at index CODING->cmp_data_start of
        CODING->cmp_data->data, element 0 is set to the number of elements
        recorded since that index and element 2 is set to
        CODING->cmp_data->char_offset + END.  The expansion declares the
        block-local variables `cmp_data' and `data', so END must not be an
        expression that uses variables of those names.  */
 670
 671 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 672   do {                                                          \
 673     struct composition_data *cmp_data = coding->cmp_data;       \
 674     int *data = cmp_data->data + coding->cmp_data_start;        \
 675     data[0] = cmp_data->used - coding->cmp_data_start;          \
 676     data[2] = cmp_data->char_offset + end;                      \
 677   } while (0)
 678
 679 /* Record one COMPONENT (alternate character or composition rule) by
        appending it to CODING->cmp_data->data and incrementing
        CODING->cmp_data->used.  No bound check is made here.  CODING is
        evaluated twice.  */
 680
 681 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
 682   (coding->cmp_data->data[coding->cmp_data->used++] = component)
 683
 684
 685 /* Fetch one byte from the data pointed to by SRC and advance SRC.  If
 686    SRC has already reached SRC_END, yield -1 and leave SRC untouched.  */
 687
 688 #define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
 689
 690
 691 /* Decode a character represented as a component of composition
 692    sequence of Emacs 20 style at SRC.  Set C to that character, store
 693    its multibyte form sequence at P, and set P to the end of that
 694    sequence.  If no valid character is found, set C to -1.
        An ASCII component is encoded as 0xA0 followed by ASCII_CODE+0x80,
        so 0x80 is subtracted from the second byte to recover the ASCII
        code; a second byte below 0xA0 is invalid.  Any other component
        has 0x20 added to its leading code.  Bytes are fetched with
        SAFE_ONE_MORE_BYTE, so the caller must provide `src' and
        `src_end'.  */
 695
 696 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 697   do {                                                          \
 698     int bytes;                                                  \
 699                                                                 \
 700     c = SAFE_ONE_MORE_BYTE ();                                  \
 701     if (c < 0)                                                  \
 702       break;                                                    \
 703     if (CHAR_HEAD_P (c))                                        \
 704       c = -1;                                                   \
 705     else if (c == 0xA0)                                         \
 706       {                                                         \
 707         c = SAFE_ONE_MORE_BYTE ();                              \
 708         if (c < 0xA0)                                           \
 709           c = -1;                                               \
 710         else                                                    \
 711           {                                                     \
 712             c -= 0x80;                                          \
 713             *p++ = c;                                           \
 714           }                                                     \
 715       }                                                         \
 716     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 717       {                                                         \
 718         unsigned char *p0 = p;                                  \
 719                                                                 \
 720         c -= 0x20;                                              \
 721         *p++ = c;                                               \
 722         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 723         while (--bytes)                                         \
 724           {                                                     \
 725             c = SAFE_ONE_MORE_BYTE ();                          \
 726             if (c < 0)                                          \
 727               break;                                            \
 728             *p++ = c;                                           \
 729           }                                                     \
 730         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))     \
 731           c = STRING_CHAR (p0, bytes);                          \
 732         else                                                    \
 733           c = -1;                                               \
 734       }                                                         \
 735     else                                                        \
 736       c = -1;                                                   \
 737   } while (0)
 738
 739
 740 /* Decode a composition rule represented as a component of composition
 741    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 742    valid rule is found, set C to -1.

        The rule byte is read with SAFE_ONE_MORE_BYTE.  After 0xA0 is
        subtracted it must lie in the range 0..80; it is then split into
        GREF (value / 9) and NREF (value % 9), which are combined by
        COMPOSITION_ENCODE_RULE.  GREF and NREF are not macro parameters:
        they are assigned as variables of the expansion context.  */
 743
 744 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 745   do {                                                  \
 746     c = SAFE_ONE_MORE_BYTE ();                          \
 747     c -= 0xA0;                                          \
         /* A negative byte also ends up negative here.  */  \
 748     if (c < 0 || c >= 81)                               \
 749       c = -1;                                           \
 750     else                                                \
 751       {                                                 \
 752         gref = c / 9, nref = c % 9;                     \
 753         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 754       }                                                 \
 755   } while (0)
 756
 757
 758 /* Decode composition sequence encoded by `emacs-mule' at the source
 759    pointed by SRC.  SRC_END is the end of source.  Store information
 760    of the composition in CODING->cmp_data.
 761
 762    For backward compatibility, decode also a composition sequence of
 763    Emacs 20 style.  In that case, the composition sequence contains
 764    characters that should be extracted into a buffer or string.  Store
 765    those characters at *DESTINATION in multibyte form.
 766
 767    If we encounter an invalid byte sequence, return 0.
 768    If we encounter an insufficient source or destination, or
 769    insufficient space in CODING->cmp_data, return -1.
 770    Otherwise, return consumed bytes in the source.

        When DST_BYTES is zero, the limit used for the destination is the
        source position instead of DST_END.  CODING->result is set to
        CODING_FINISH_INSUFFICIENT_CMP when CODING->cmp_data has no room
        for one more bunch of composition data.
 771
 772 */
 773 static INLINE int
 774 decode_composition_emacs_mule (coding, src, src_end,
 775                                destination, dst_end, dst_bytes)
 776      struct coding_system *coding;
 777      unsigned char *src, *src_end, **destination, *dst_end;
 778      int dst_bytes;
 779 {
 780   unsigned char *dst = *destination;
 781   int method, data_len, nchars;
       /* SRC_BASE stays on the first byte of the sequence; SRC moves on.  */
 782   unsigned char *src_base = src++;
 783   /* Store components of composition.  */
 784   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 785   int ncomponent;
 786   /* Store multibyte form of characters to be composed.  This is for
 787      Emacs 20 style composition sequence.  */
 788   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 789   unsigned char *bufp = buf;
 790   int c, i, gref, nref;
 791
 792   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 793       >= COMPOSITION_DATA_SIZE)
 794     {
 795       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 796       return -1;
 797     }
 798
 799   ONE_MORE_BYTE (c);
 800   if (c - 0xF0 >= COMPOSITION_RELATIVE
 801            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 802     {
           /* Current format: METHOD byte, total length byte, character
              count byte, then the components.  */
 803       int with_rule;
 804
 805       method = c - 0xF0;
 806       with_rule = (method == COMPOSITION_WITH_RULE
 807                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 808       ONE_MORE_BYTE (c);
           /* DATA_LEN is counted from SRC_BASE, so it includes the bytes
              already read.  */
 809       data_len = c - 0xA0;
 810       if (data_len < 4
 811           || src_base + data_len > src_end)
 812         return 0;
 813       ONE_MORE_BYTE (c);
 814       nchars = c - 0xA0;
           /* The number of composed characters must be positive.  Test
              NCHARS, not the raw byte C: checking C would accept every
              nonzero byte, including those giving NCHARS <= 0.  */
 815       if (nchars < 1)
 816         return 0;
 817       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 818         {
 819           /* If it is longer than this, it can't be valid.  */
 820           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 821             return 0;
 822
               /* With a rule-based method, odd-numbered components are
                  rules stored as two bytes: GREF + 32 and NREF + 32.  */
 823           if (ncomponent % 2 && with_rule)
 824             {
 825               ONE_MORE_BYTE (gref);
 826               gref -= 32;
 827               ONE_MORE_BYTE (nref);
 828               nref -= 32;
 829               c = COMPOSITION_ENCODE_RULE (gref, nref);
 830             }
 831           else
 832             {
                   /* A character component; bytes that are not accepted
                      as a multibyte sequence are taken one at a time.  */
 833               int bytes;
 834               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 835                 c = STRING_CHAR (src, bytes);
 836               else
 837                 c = *src, bytes = 1;
 838               src += bytes;
 839             }
 840           component[ncomponent] = c;
 841         }
 842     }
 843   else
 844     {
 845       /* This may be an old Emacs 20 style format.  See the comment at
 846          the section 2 of this file.  */
 847       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
           /* The sequence may continue in a later block of source.  */
 848       if (src == src_end
 849           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 850         goto label_end_of_loop;
 851
           /* Limit decoding to the sequence found above and restart
              just after the first byte.  */
 852       src_end = src;
 853       src = src_base + 1;
 854       if (c < 0xC0)
 855         {
 856           method = COMPOSITION_RELATIVE;
 857           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 858             {
 859               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 860               if (c < 0)
 861                 break;
 862               component[ncomponent++] = c;
 863             }
               /* A composition needs at least two characters.  */
 864           if (ncomponent < 2)
 865             return 0;
 866           nchars = ncomponent;
 867         }
 868       else if (c == 0xFF)
 869         {
 870           method = COMPOSITION_WITH_RULE;
               /* Skip the 0xFF byte.  */
 871           src++;
 872           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 873           if (c < 0)
 874             return 0;
 875           component[0] = c;
               /* The remaining components alternate: rule, character.  */
 876           for (ncomponent = 1;
 877                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 878             {
 879               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 880               if (c < 0)
 881                 break;
 882               component[ncomponent++] = c;
 883               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 884               if (c < 0)
 885                 break;
 886               component[ncomponent++] = c;
 887             }
 888           if (ncomponent < 3)
 889             return 0;
 890           nchars = (ncomponent + 1) / 2;
 891         }
 892       else
 893         return 0;
 894     }
 895
       /* Record the composition only if the characters collected in BUF
          (Emacs 20 style only) fit in the destination.  */
 896   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 897     {
 898       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 899       for (i = 0; i < ncomponent; i++)
 900         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 901       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 902       if (buf < bufp)
 903         {
 904           unsigned char *p = buf;
 905           EMIT_BYTES (p, bufp);
 906           *destination += bufp - buf;
 907           coding->produced_char += nchars;
 908         }
 909       return (src - src_base);
 910     }
 911  label_end_of_loop:
 912   return -1;
 913 }
 914
 915 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Copy SRC_BYTES bytes at SOURCE to DESTINATION character by
        character, converting end-of-line sequences according to
        CODING->eol_type and passing sequences that start with 0x80 to
        decode_composition_emacs_mule when CODING->cmp_data is non-null.
        When DST_BYTES is zero, the destination limit is the source
        position instead of DESTINATION + DST_BYTES.

        On return, CODING->consumed, CODING->consumed_char,
        CODING->produced and CODING->produced_char are updated.
        CODING->result is set here to CODING_FINISH_INCONSISTENT_EOL or
        CODING_FINISH_INSUFFICIENT_DST when decoding stops for one of
        those reasons.  */
 916
 917 static void
 918 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 919      struct coding_system *coding;
 920      unsigned char *source, *destination;
 921      int src_bytes, dst_bytes;
 922 {
 923   unsigned char *src = source;
 924   unsigned char *src_end = source + src_bytes;
 925   unsigned char *dst = destination;
 926   unsigned char *dst_end = destination + dst_bytes;
 927   /* SRC_BASE remembers the start position in source in each loop.
 928      The loop will be exited when there's not enough source code, or
 929      when there's not enough destination area to produce a
 930      character.  */
 931   unsigned char *src_base;
 932
 933   coding->produced_char = 0;
 934   while ((src_base = src) < src_end)
 935     {
 936       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 937       int bytes;
 938
           /* NOTE(review): the CR and LF branches below store to DST
              without the space check done at the bottom of the loop --
              confirm that callers guarantee room.  */
 939       if (*src == '\r')
 940         {
 941           int c = *src++;
 942
               /* CR becomes LF for CODING_EOL_CR.  For CODING_EOL_CRLF
                  a following LF replaces it; any other following byte
                  is pushed back and the CR is kept.  */
 943           if (coding->eol_type == CODING_EOL_CR)
 944             c = '\n';
 945           else if (coding->eol_type == CODING_EOL_CRLF)
 946             {
 947               ONE_MORE_BYTE (c);
 948               if (c != '\n')
 949                 {
 950                   src--;
 951                   c = '\r';
 952                 }
 953             }
 954           *dst++ = c;
 955           coding->produced_char++;
 956           continue;
 957         }
 958       else if (*src == '\n')
 959         {
               /* A bare LF is inconsistent with a CR or CRLF EOL type;
                  stop if the caller asked for that to be reported.  */
 960           if ((coding->eol_type == CODING_EOL_CR
 961                || coding->eol_type == CODING_EOL_CRLF)
 962               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 963             {
 964               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 965               goto label_end_of_loop;
 966             }
 967           *dst++ = *src++;
 968           coding->produced_char++;
 969           continue;
 970         }
 971       else if (*src == 0x80 && coding->cmp_data)
 972         {
 973           /* Start of composition data.  */
 974           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
 975                                                          &dst, dst_end,
 976                                                          dst_bytes);
               /* Negative: stop decoding.  Positive: that many source
                  bytes were handled.  Zero: not a valid composition, so
                  the 0x80 byte itself is converted by CHAR_STRING.  */
 977           if (consumed < 0)
 978             goto label_end_of_loop;
 979           else if (consumed > 0)
 980             {
 981               src += consumed;
 982               continue;
 983             }
 984           bytes = CHAR_STRING (*src, tmp);
 985           p = tmp;
 986           src++;
 987         }
 988       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 989         {
               /* Accepted as is: copy BYTES bytes straight from SRC.  */
 990           p = src;
 991           src += bytes;
 992         }
 993       else
 994         {
               /* Otherwise convert the single byte by CHAR_STRING.  */
 995           bytes = CHAR_STRING (*src, tmp);
 996           p = tmp;
 997           src++;
 998         }
           /* The comparison is >=, so a character that would exactly
              fill the destination is also refused.  */
 999       if (dst + bytes >= (dst_bytes ? dst_end : src))
1000         {
1001           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1002           break;
1003         }
1004       while (bytes--) *dst++ = *p++;
1005       coding->produced_char++;
1006     }
1007  label_end_of_loop:
       /* Count consumption up to SRC_BASE, i.e. the start of the last
          loop iteration, not up to SRC.  */
1008   coding->consumed = coding->consumed_char = src_base - source;
1009   coding->produced = dst - destination;
1010 }
1011
1012
1013 /* Encode composition data stored at DATA into a special byte sequence
1014    starting by 0x80.  Update CODING->cmp_data_start and maybe
1015    CODING->cmp_data for the next call.

        The sequence built in BUF is:
          0x80, 0xF0 + METHOD (DATA[3]), 0xA0 + total byte length of the
          sequence, 0xA0 + number of composed chars (DATA[2] - DATA[1]),
        followed by the components DATA[4] .. DATA[DATA[0] - 1].  For the
        rule-based methods the components alternate between a character
        and a rule, the rule being written as the two bytes 0x20 + GREF
        and 0x20 + NREF; otherwise every component is a character.
        Characters are written by CHAR_STRING.

        The macro uses the variables `dst', `dst_end', `dst_bytes' and
        `src' and the label `label_end_of_loop' of the expansion
        context.  */
1016
1017 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1018   do {                                                                  \
         /* NOTE(review): BUF has a fixed size and the writes below are    \
            not bounds-checked here -- confirm the maximum entry length    \
            keeps the sequence within it.  */                              \
1019     unsigned char buf[1024], *p0 = buf, *p;                             \
1020     int len = data[0];                                                  \
1021     int i;                                                              \
1022                                                                         \
1023     buf[0] = 0x80;                                                      \
1024     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1025     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1026     p = buf + 4;                                                        \
1027     if (data[3] == COMPOSITION_WITH_RULE                                \
1028         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1029       {                                                                 \
1030         p += CHAR_STRING (data[4], p);                                  \
1031         for (i = 5; i < len; i += 2)                                    \
1032           {                                                             \
1033             int gref, nref;                                             \
1034              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1035             *p++ = 0x20 + gref;                                         \
1036             *p++ = 0x20 + nref;                                         \
1037             p += CHAR_STRING (data[i + 1], p);                          \
1038           }                                                             \
1039       }                                                                 \
1040     else                                                                \
1041       {                                                                 \
1042         for (i = 4; i < len; i++)                                       \
1043           p += CHAR_STRING (data[i], p);                                \
1044       }                                                                 \
         /* Filled in last because the length is only known now.  */        \
1045     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1046                                                                         \
         /* Room for 4 bytes more than the sequence itself is required;    \
            the reason for the extra 4 is not visible here.  */            \
1047     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1048       {                                                                 \
1049         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1050         goto label_end_of_loop;                                         \
1051       }                                                                 \
1052     while (p0 < p)                                                      \
1053       *dst++ = *p0++;                                                   \
         /* Step over this entry; move to the next chunk of composition    \
            data when the current one is used up and another exists.  */   \
1054     coding->cmp_data_start += data[0];                                  \
1055     if (coding->cmp_data_start == coding->cmp_data->used                \
1056         && coding->cmp_data->next)                                      \
1057       {                                                                 \
1058         coding->cmp_data = coding->cmp_data->next;                      \
1059         coding->cmp_data_start = 0;                                     \
1060       }                                                                 \
1061   } while (0)
1062
1063
1064 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1065                             unsigned char *, int, int));
     /* Declared here because encode_coding_emacs_mule, just below, calls
        encode_eol when there is no composition data to encode.  */
1066
1067 static void
1068 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1069      struct coding_system *coding;
1070      unsigned char *source, *destination;
1071      int src_bytes, dst_bytes;
1072 {
       /* Encode SRC_BYTES bytes at SOURCE into DESTINATION.  Without
          composition data the work is left to encode_eol.  Otherwise
          characters are copied one at a time, a composition sequence is
          emitted in front of each character that starts a composition,
          and LF is converted according to CODING->eol_type.  On exit
          CODING->consumed, CODING->produced and CODING->produced_char
          are set; CODING->consumed_char is advanced per character.  */
1073   unsigned char *src = source;
1074   unsigned char *src_end = source + src_bytes;
1075   unsigned char *dst = destination;
1076   unsigned char *dst_end = destination + dst_bytes;
1077   unsigned char *src_base;
1078   int c;
1079   int char_offset;
1080   int *data;
1081
       /* Not referenced again in the visible code; presumably read by
          ONE_MORE_CHAR -- TODO confirm.  */
1082   Lisp_Object translation_table;
1083
1084   translation_table = Qnil;
1085
1086   /* Optimization for the case that there's no composition.  */
1087   if (!coding->cmp_data || coding->cmp_data->used == 0)
1088     {
1089       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1090       return;
1091     }
1092
1093   char_offset = coding->cmp_data->char_offset;
1094   data = coding->cmp_data->data + coding->cmp_data_start;
       /* The loop has no exit test of its own.  It is left through
          label_end_of_loop, to which ENCODE_COMPOSITION_EMACS_MULE jumps
          and, presumably, ONE_MORE_CHAR and the EMIT_* macros as well --
          TODO confirm.  */
1095   while (1)
1096     {
1097       src_base = src;
1098
1099       /* If SRC starts a composition, encode the information about the
1100          composition in advance.  */
1101       if (coding->cmp_data_start < coding->cmp_data->used
1102           && char_offset + coding->consumed_char == data[1])
1103         {
1104           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have switched CODING->cmp_data, so reload
                  both values.  */
1105           char_offset = coding->cmp_data->char_offset;
1106           data = coding->cmp_data->data + coding->cmp_data_start;
1107         }
1108
1109       ONE_MORE_CHAR (c);
           /* LF is written as CR LF or CR for those EOL types.  */
1110       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1111                         || coding->eol_type == CODING_EOL_CR))
1112         {
1113           if (coding->eol_type == CODING_EOL_CRLF)
1114             EMIT_TWO_BYTES ('\r', c);
1115           else
1116             EMIT_ONE_BYTE ('\r');
1117         }
1118       else if (SINGLE_BYTE_CHAR_P (c))
1119         EMIT_ONE_BYTE (c);
1120       else
             /* Copy the source bytes of the character just read.  */
1121         EMIT_BYTES (src_base, src);
1122       coding->consumed_char++;
1123     }
1124  label_end_of_loop:
       /* Count consumption up to SRC_BASE, the start of the last loop
          iteration, not up to SRC.  */
1125   coding->consumed = src_base - source;
1126   coding->produced = coding->produced_char = dst - destination;
1127   return;
1128 }
1129
1130 \f
1131 /*** 3. ISO2022 handlers ***/
1132
1133 /* The following note describes the coding system ISO2022 briefly.
1134    Since the intention of this note is to help understand the
1135    functions in this file, some parts are NOT ACCURATE or are OVERLY
1136    SIMPLIFIED.  For thorough understanding, please refer to the
1137    original document of ISO2022.  This is equivalent to the standard
1138    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1139
1140    ISO2022 provides many mechanisms to encode several character sets
1141    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1142    is encoded using bytes less than 128.  This may make the encoded
1143    text a little bit longer, but the text passes more easily through
1144    several types of gateway, some of which strip off the MSB (Most
1145    Significant Bit).
1146
1147    There are two kinds of character sets: control character sets and
1148    graphic character sets.  The former contain control characters such
1149    as `newline' and `escape' to provide control functions (control
1150    functions are also provided by escape sequences).  The latter
1151    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1152    two control character sets and many graphic character sets.
1153
1154    Graphic character sets are classified into one of the following
1155    four classes, according to the number of bytes (DIMENSION) and
1156    number of characters in one dimension (CHARS) of the set:
1157    - DIMENSION1_CHARS94
1158    - DIMENSION1_CHARS96
1159    - DIMENSION2_CHARS94
1160    - DIMENSION2_CHARS96
1161
1162    In addition, each character set is assigned an identification tag,
1163    unique for each set, called the "final character" (denoted as <F>
1164    hereafter).  The <F> of each character set is decided by ECMA(*)
1165    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1166    (0x30..0x3F are for private use only).
1167
1168    Note (*): ECMA = European Computer Manufacturers Association
1169
1170    Here are examples of graphic character sets [NAME(<F>)]:
1171         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1172         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1173         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1174         o DIMENSION2_CHARS96 -- none for the moment
1175
1176    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1177         C0 [0x00..0x1F] -- control character plane 0
1178         GL [0x20..0x7F] -- graphic character plane 0
1179         C1 [0x80..0x9F] -- control character plane 1
1180         GR [0xA0..0xFF] -- graphic character plane 1
1181
1182    A control character set is directly designated and invoked to C0 or
1183    C1 by an escape sequence.  The most common case is that:
1184    - ISO646's  control character set is designated/invoked to C0, and
1185    - ISO6429's control character set is designated/invoked to C1,
1186    and usually these designations/invocations are omitted in encoded
1187    text.  In a 7-bit environment, only C0 can be used, and a control
1188    character for C1 is encoded by an appropriate escape sequence to
1189    fit into the environment.  All control characters for C1 are
1190    defined to have corresponding escape sequences.
1191
1192    A graphic character set is at first designated to one of four
1193    graphic registers (G0 through G3), then these graphic registers are
1194    invoked to GL or GR.  These designations and invocations can be
1195    done independently.  The most common case is that G0 is invoked to
1196    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1197    these invocations and designations are omitted in encoded text.
1198    In a 7-bit environment, only GL can be used.
1199
1200    When a graphic character set of CHARS94 is invoked to GL, codes
1201    0x20 and 0x7F of the GL area work as control characters SPACE and
1202    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1203    be used.
1204
1205    There are two ways of invocation: locking-shift and single-shift.
1206    With locking-shift, the invocation lasts until the next different
1207    invocation, whereas with single-shift, the invocation affects the
1208    following character only and doesn't affect the locking-shift
1209    state.  Invocations are done by the following control characters or
1210    escape sequences:
1211
1212    ----------------------------------------------------------------------
1213    abbrev  function                  cntrl escape seq   description
1214    ----------------------------------------------------------------------
1215    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1216    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1217    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1218    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1219    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1220    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1221    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1222    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1223    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1224    ----------------------------------------------------------------------
1225    (*) These are not used by any known coding system.
1226
1227    Control characters for these functions are defined by macros
1228    ISO_CODE_XXX in `coding.h'.
1229
1230    Designations are done by the following escape sequences:
1231    ----------------------------------------------------------------------
1232    escape sequence      description
1233    ----------------------------------------------------------------------
1234    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1235    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1236    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1237    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1238    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1239    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1240    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1241    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1242    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1243    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1244    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1245    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1246    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1247    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1248    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1249    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1250    ----------------------------------------------------------------------
1251
1252    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1253    of dimension 1, chars 94, and final character <F>, etc...
1254
1255    Note (*): Although these designations are not allowed in ISO2022,
1256    Emacs accepts them on decoding, and produces them on encoding
1257    CHARS96 character sets in a coding system which is characterized as
1258    7-bit environment, non-locking-shift, and non-single-shift.
1259
1260    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1261    '(' can be omitted.  We refer to this as "short-form" hereafter.
1262
1263    Now you may notice that there are a lot of ways of encoding the
1264    same multilingual text in ISO2022.  Actually, there exist many
1265    coding systems such as Compound Text (used in X11's inter client
1266    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1267    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1268    localized platforms), and all of these are variants of ISO2022.
1269
1270    In addition to the above, Emacs handles two more kinds of escape
1271    sequences: ISO6429's direction specification and Emacs' private
1272    sequence for specifying character composition.
1273
1274    ISO6429's direction specification takes the following form:
1275         o CSI ']'      -- end of the current direction
1276         o CSI '0' ']'  -- end of the current direction
1277         o CSI '1' ']'  -- start of left-to-right text
1278         o CSI '2' ']'  -- start of right-to-left text
1279    The control character CSI (0x9B: control sequence introducer) is
1280    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1281
1282    Character composition specification takes the following form:
1283         o ESC '0' -- start relative composition
1284         o ESC '1' -- end composition
1285         o ESC '2' -- start rule-base composition (*)
1286         o ESC '3' -- start relative composition with alternate chars  (**)
1287         o ESC '4' -- start rule-base composition with alternate chars  (**)
1288   Since these are not standard escape sequences of any ISO standard,
1289   the use of them with these meanings is restricted to Emacs only.
1290
1291   (*) This form is used only in Emacs 20.5 and older versions,
1292   but the newer versions can safely decode it.
1293   (**) This form is used only in Emacs 21.1 and newer versions,
1294   and the older versions can't decode it.
1295
1296   Here's a list of example usages of these composition escape
1297   sequences (categorized by `enum composition_method').
1298
1299   COMPOSITION_RELATIVE:
1300         ESC 0 CHAR [ CHAR ] ESC 1
1301   COMPOSITION_WITH_RULE:
1302         ESC 2 CHAR [ RULE CHAR ] ESC 1
1303   COMPOSITION_WITH_ALTCHARS:
1304         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1305   COMPOSITION_WITH_RULE_ALTCHARS:
1306         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1307
1308 enum iso_code_class_type iso_code_class[256];
     /* One entry for each of the 256 possible byte values; presumably
        the ISO2022 code class of that byte -- confirm where the table
        is filled in.  */
1309
1310 #define CHARSET_OK(idx, charset, c)                                     \
       /* True when coding_system_table[IDX] is set, CHARSET is            \
          CHARSET_ASCII or C passes CODING_SAFE_CHAR_P for the safe        \
          chars of that coding system, and the requested designation       \
          of that coding system for CHARSET is not                         \
          CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.  As a side effect      \
          (skipped for CHARSET_ASCII) the variable `safe_chars' of the     \
          expansion context is assigned.  */                               \
1311   (coding_system_table[idx]                                             \
1312    && (charset == CHARSET_ASCII                                         \
1313        || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
1314            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1315    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1316                                               charset)                  \
1317        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1318
1319 #define SHIFT_OUT_OK(idx) \
       /* True when the initial designation with index 1 (presumably \
          graphic register G1 -- TODO confirm) of \
          coding_system_table[IDX] is non-negative.  */ \
1320   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1321
1322 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1323    Check if a text is encoded in ISO2022.  If it is, return an
1324    integer in which the appropriate flag bits, any of:
1325         CODING_CATEGORY_MASK_ISO_7
1326         CODING_CATEGORY_MASK_ISO_7_TIGHT
1327         CODING_CATEGORY_MASK_ISO_8_1
1328         CODING_CATEGORY_MASK_ISO_8_2
1329         CODING_CATEGORY_MASK_ISO_7_ELSE
1330         CODING_CATEGORY_MASK_ISO_8_ELSE
1331    are set.  If a code which should never appear in ISO2022 is found,
1332    returns 0.  */
1333
1334 static int
1335 detect_coding_iso2022 (src, src_end, multibytep)
1336      unsigned char *src, *src_end;
1337      int multibytep;
1338 {
1339   int mask = CODING_CATEGORY_MASK_ISO;
1340   int mask_found = 0;
1341   int reg[4], shift_out = 0, single_shifting = 0;
1342   int c, c1, charset;
1343   /* Dummy for ONE_MORE_BYTE.  */
1344   struct coding_system dummy_coding;
1345   struct coding_system *coding = &dummy_coding;
1346   Lisp_Object safe_chars;
1347
1348   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1349   while (mask && src < src_end)
1350     {
1351       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1352     retry:
1353       switch (c)
1354         {
1355         case ISO_CODE_ESC:
1356           if (inhibit_iso_escape_detection)
1357             break;
1358           single_shifting = 0;
1359           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1360           if (c >= '(' && c <= '/')
1361             {
1362               /* Designation sequence for a charset of dimension 1.  */
1363               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1364               if (c1 < ' ' || c1 >= 0x80
1365                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1366                 /* Invalid designation sequence.  Just ignore.  */
1367                 break;
1368               reg[(c - '(') % 4] = charset;
1369             }
1370           else if (c == '$')
1371             {
1372               /* Designation sequence for a charset of dimension 2.  */
1373               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1374               if (c >= '@' && c <= 'B')
1375                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1376                 reg[0] = charset = iso_charset_table[1][0][c];
1377               else if (c >= '(' && c <= '/')
1378                 {
1379                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1380                   if (c1 < ' ' || c1 >= 0x80
1381                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1382                     /* Invalid designation sequence.  Just ignore.  */
1383                     break;
1384                   reg[(c - '(') % 4] = charset;
1385                 }
1386               else
1387                 /* Invalid designation sequence.  Just ignore.  */
1388                 break;
1389             }
1390           else if (c == 'N' || c == 'O')
1391             {
1392               /* ESC <Fe> for SS2 or SS3.  */
1393               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1394               break;
1395             }
1396           else if (c >= '0' && c <= '4')
1397             {
1398               /* ESC <Fp> for start/end composition.  */
1399               mask_found |= CODING_CATEGORY_MASK_ISO;
1400               break;
1401             }
1402           else
1403             /* Invalid escape sequence.  Just ignore.  */
1404             break;
1405
1406           /* We found a valid designation sequence for CHARSET.  */
1407           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1408           c = MAKE_CHAR (charset, 0, 0);
1409           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1410             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1411           else
1412             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1413           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1414             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1415           else
1416             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1417           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1418             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1419           else
1420             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1421           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1422             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1423           else
1424             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1425           break;
1426
1427         case ISO_CODE_SO:
1428           if (inhibit_iso_escape_detection)
1429             break;
1430           single_shifting = 0;
1431           if (shift_out == 0
1432               && (reg[1] >= 0
1433                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1434                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1435             {
1436               /* Locking shift out.  */
1437               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1438               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1439             }
1440           break;
1441
1442         case ISO_CODE_SI:
1443           if (inhibit_iso_escape_detection)
1444             break;
1445           single_shifting = 0;
1446           if (shift_out == 1)
1447             {
1448               /* Locking shift in.  */
1449               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1450               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1451             }
1452           break;
1453
1454         case ISO_CODE_CSI:
1455           single_shifting = 0;
1456         case ISO_CODE_SS2:
1457         case ISO_CODE_SS3:
1458           {
1459             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1460
1461             if (inhibit_iso_escape_detection)
1462               break;
1463             if (c != ISO_CODE_CSI)
1464               {
1465                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1466                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1467                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1468                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1469                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1470                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1471                 single_shifting = 1;
1472               }
1473             if (VECTORP (Vlatin_extra_code_table)
1474                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1475               {
1476                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1477                     & CODING_FLAG_ISO_LATIN_EXTRA)
1478                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1479                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1480                     & CODING_FLAG_ISO_LATIN_EXTRA)
1481                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1482               }
1483             mask &= newmask;
1484             mask_found |= newmask;
1485           }
1486           break;
1487
1488         default:
1489           if (c < 0x80)
1490             {
1491               single_shifting = 0;
1492               break;
1493             }
1494           else if (c < 0xA0)
1495             {
1496               single_shifting = 0;
1497               if (VECTORP (Vlatin_extra_code_table)
1498                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1499                 {
1500                   int newmask = 0;
1501
1502                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1503                       & CODING_FLAG_ISO_LATIN_EXTRA)
1504                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1505                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1506                       & CODING_FLAG_ISO_LATIN_EXTRA)
1507                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1508                   mask &= newmask;
1509                   mask_found |= newmask;
1510                 }
1511               else
1512                 return 0;
1513             }
1514           else
1515             {
1516               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1517                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1518               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1519               /* Check the length of succeeding codes of the range
1520                  0xA0..0FF.  If the byte length is odd, we exclude
1521                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1522                  when we are not single shifting.  */
1523               if (!single_shifting
1524                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1525                 {
1526                   int i = 1;
1527
1528                   c = -1;
1529                   while (src < src_end)
1530                     {
1531                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1532                       if (c < 0xA0)
1533                         break;
1534                       i++;
1535                     }
1536
1537                   if (i & 1 && src < src_end)
1538                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1539                   else
1540                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1541                   if (c >= 0)
1542                     /* This means that we have read one extra byte.  */
1543                     goto retry;
1544                 }
1545             }
1546           break;
1547         }
1548     }
1549  label_end_of_loop:
1550   return (mask & mask_found);
1551 }
1552
/* Build the character whose charset is CHARSET, whose 1st position
   code is C1 and whose 2nd position code is C2, and yield its
   character code.  When the variable `translation_table' visible at
   the expansion site is non-nil, the result is what translate_char
   returns for that table; otherwise it is MAKE_CHAR's result.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (!NILP (translation_table)                                    \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1562
/* Set designation state into CODING: record that the charset looked
   up from DIMENSION, CHARS and FINAL_CHAR is designated to graphic
   register REG.

   The expansion uses the variables `coding' and `safe_chars' and the
   label `label_invalid_code' of the enclosing function.  Control
   jumps to `label_invalid_code' when
     - FINAL_CHAR is below '0' or not below 128,
     - the looked-up charset is negative, or the register requested
       for it in CODING is not REG and its representative character
       is not accepted by CODING_SAFE_CHAR_P, or
     - ASCII is designated to register 0 while the last invalid
       designation was also to register 0 (the sequence is then kept
       as is so that it is written back to a file).

   REG and FINAL_CHAR are expanded more than once; they are
   parenthesized so that arguments such as `c1 - 0x28' keep their
   meaning in every context, but they must not have side effects.
   The locals `charset' and `c' declared here shadow any outer
   variables of the same name, so arguments must not mention them.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)                \
  do {                                                                       \
    int charset, c;                                                          \
                                                                             \
    if ((final_char) < '0' || (final_char) >= 128)                           \
      goto label_invalid_code;                                               \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                    \
                                 make_number (chars),                        \
                                 make_number (final_char));                  \
    c = MAKE_CHAR (charset, 0, 0);                                           \
    if (charset >= 0                                                         \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || CODING_SAFE_CHAR_P (safe_chars, c)))                          \
      {                                                                      \
        if (coding->spec.iso2022.last_invalid_designation_register == 0      \
            && (reg) == 0                                                    \
            && charset == CHARSET_ASCII)                                     \
          {                                                                  \
            /* We should insert this designation sequence as is so           \
               that it is surely written back to a file.  */                 \
            coding->spec.iso2022.last_invalid_designation_register = -1;     \
            goto label_invalid_code;                                         \
          }                                                                  \
        coding->spec.iso2022.last_invalid_designation_register = -1;         \
        /* While the direction flag is set in coding->mode, prefer the       \
           reverse charset when one is defined.  */                          \
        if ((coding->mode & CODING_MODE_DIRECTION)                           \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                       \
          charset = CHARSET_REVERSE_CHARSET (charset);                       \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;               \
      }                                                                      \
    else                                                                     \
      {                                                                      \
        /* Remember which register the rejected designation aimed at.  */    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);      \
        goto label_invalid_code;                                             \
      }                                                                      \
  } while (0)
1599
1600 /* Allocate a memory block for storing information about compositions.
1601    The block is chained to the already allocated blocks.  */
1602
1603 void
1604 coding_allocate_composition_data (coding, char_offset)
1605      struct coding_system *coding;
1606      int char_offset;
1607 {
1608   struct composition_data *cmp_data
1609     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1610
1611   cmp_data->char_offset = char_offset;
1612   cmp_data->used = 0;
1613   cmp_data->prev = coding->cmp_data;
1614   cmp_data->next = NULL;
1615   if (coding->cmp_data)
1616     coding->cmp_data->next = cmp_data;
1617   coding->cmp_data = cmp_data;
1618   coding->cmp_data_start = 0;
1619 }
1620
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4;
   C1 is the byte that followed ESC.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   The expansion uses `coding', `dst' and the label
   `label_end_of_loop' of the enclosing function.  Three situations
   are distinguished:
     - composition is disabled: ESC and the low seven bits of C1 are
       stored at DST and counted as two produced characters;
     - a composition is already in progress: an altchar method is
       switched to relative composition, other methods are left alone;
     - otherwise a new composition is started, unless the current
       composition data block is missing or nearly full, in which
       case the loop is left with CODING_FINISH_INSUFFICIENT_CMP.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        dst[0] = ISO_CODE_ESC;                                             \
        dst[1] = (c1) & 0x7f;                                              \
        dst += 2;                                                          \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (COMPOSING_P (coding))                                         \
      {                                                                    \
        /* A composition is already being handled.  For the two            \
           altchar methods, the codes following this escape sequence       \
           are the actual characters stored in a buffer.  */               \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* This is surely the start of a composition.  Make sure that      \
           coding->cmp_data has room for the information about it.  If     \
           it has not, leave the decoding loop so that the caller can      \
           allocate one more block and restart the loop; allocating        \
           here directly may cause buffer/string relocation.  */           \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        if ((c1) == '0')                                                   \
          coding->composing = COMPOSITION_RELATIVE;                        \
        else if ((c1) == '2')                                              \
          coding->composing = COMPOSITION_WITH_RULE;                       \
        else if ((c1) == '3')                                              \
          coding->composing = COMPOSITION_WITH_ALTCHARS;                   \
        else                                                               \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;              \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
  } while (0)
1673
/* Handle composition end sequence ESC 1; C1 is the byte after ESC.
   While a composition is in progress, its end is recorded at the
   current produced_char position and the composing state returns to
   COMPOSITION_NO.  Otherwise the two bytes ESC and C1 are stored at
   DST and counted as two produced characters.  Uses `coding' and
   `dst' of the enclosing function.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1690
/* Decode a composition rule that starts with the byte C1 (reading one
   more byte into `c2' for the new format) and append the encoded rule
   to coding->cmp_data as a composition component.

   C1 is modified in place: 32 is subtracted from it.  Values below 81
   are the old single-byte format, values from 81 to 92 introduce the
   new two-byte format, and anything else stores the rule 0.  Uses
   `coding' and `c2' of the enclosing function; ONE_MORE_BYTE may
   leave the decoding loop when the source is exhausted.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int encoded_rule = 0;                                               \
    (c1) -= 32;                                                         \
    if ((c1) >= 81)                                                     \
      {                                                                 \
        if ((c1) < 93)          /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            encoded_rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);  \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref_code = (c1) / 9;                                       \
        int nref_code = (c1) % 9;                                       \
        /* In the old format the value 4 stands for 10.  */             \
        if (gref_code == 4)                                             \
          gref_code = 10;                                               \
        if (nref_code == 4)                                             \
          nref_code = 10;                                               \
        encoded_rule = COMPOSITION_ENCODE_RULE (gref_code, nref_code);  \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded_rule);            \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1715
1716
1717 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1718
1719 static void
1720 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1721      struct coding_system *coding;
1722      unsigned char *source, *destination;
1723      int src_bytes, dst_bytes;
1724 {
1725   unsigned char *src = source;
1726   unsigned char *src_end = source + src_bytes;
1727   unsigned char *dst = destination;
1728   unsigned char *dst_end = destination + dst_bytes;
1729   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1730   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1731   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1732   /* SRC_BASE remembers the start position in source in each loop.
1733      The loop will be exited when there's not enough source code
1734      (within macro ONE_MORE_BYTE), or when there's not enough
1735      destination area to produce a character (within macro
1736      EMIT_CHAR).  */
1737   unsigned char *src_base;
1738   int c, charset;
1739   Lisp_Object translation_table;
1740   Lisp_Object safe_chars;
1741
1742   safe_chars = coding_safe_chars (coding);
1743
1744   if (NILP (Venable_character_translation))
1745     translation_table = Qnil;
1746   else
1747     {
1748       translation_table = coding->translation_table_for_decode;
1749       if (NILP (translation_table))
1750         translation_table = Vstandard_translation_table_for_decode;
1751     }
1752
1753   coding->result = CODING_FINISH_NORMAL;
1754
1755   while (1)
1756     {
1757       int c1, c2;
1758
1759       src_base = src;
1760       ONE_MORE_BYTE (c1);
1761
1762       /* We produce no character or one character.  */
1763       switch (iso_code_class [c1])
1764         {
1765         case ISO_0x20_or_0x7F:
1766           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1767             {
1768               DECODE_COMPOSITION_RULE (c1);
1769               continue;
1770             }
1771           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1772             {
1773               /* This is SPACE or DEL.  */
1774               charset = CHARSET_ASCII;
1775               break;
1776             }
1777           /* This is a graphic character, we fall down ...  */
1778
1779         case ISO_graphic_plane_0:
1780           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1781             {
1782               DECODE_COMPOSITION_RULE (c1);
1783               continue;
1784             }
1785           charset = charset0;
1786           break;
1787
1788         case ISO_0xA0_or_0xFF:
1789           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1790               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1791             goto label_invalid_code;
1792           /* This is a graphic character, we fall down ... */
1793
1794         case ISO_graphic_plane_1:
1795           if (charset1 < 0)
1796             goto label_invalid_code;
1797           charset = charset1;
1798           break;
1799
1800         case ISO_control_0:
1801           if (COMPOSING_P (coding))
1802             DECODE_COMPOSITION_END ('1');
1803
1804           /* All ISO2022 control characters in this class have the
1805              same representation in Emacs internal format.  */
1806           if (c1 == '\n'
1807               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1808               && (coding->eol_type == CODING_EOL_CR
1809                   || coding->eol_type == CODING_EOL_CRLF))
1810             {
1811               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1812               goto label_end_of_loop;
1813             }
1814           charset = CHARSET_ASCII;
1815           break;
1816
1817         case ISO_control_1:
1818           if (COMPOSING_P (coding))
1819             DECODE_COMPOSITION_END ('1');
1820           goto label_invalid_code;
1821
1822         case ISO_carriage_return:
1823           if (COMPOSING_P (coding))
1824             DECODE_COMPOSITION_END ('1');
1825
1826           if (coding->eol_type == CODING_EOL_CR)
1827             c1 = '\n';
1828           else if (coding->eol_type == CODING_EOL_CRLF)
1829             {
1830               ONE_MORE_BYTE (c1);
1831               if (c1 != ISO_CODE_LF)
1832                 {
1833                   src--;
1834                   c1 = '\r';
1835                 }
1836             }
1837           charset = CHARSET_ASCII;
1838           break;
1839
1840         case ISO_shift_out:
1841           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1842               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1843             goto label_invalid_code;
1844           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1845           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1846           continue;
1847
1848         case ISO_shift_in:
1849           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1850             goto label_invalid_code;
1851           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1852           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1853           continue;
1854
1855         case ISO_single_shift_2_7:
1856         case ISO_single_shift_2:
1857           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1858             goto label_invalid_code;
1859           /* SS2 is handled as an escape sequence of ESC 'N' */
1860           c1 = 'N';
1861           goto label_escape_sequence;
1862
1863         case ISO_single_shift_3:
1864           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1865             goto label_invalid_code;
1866           /* SS2 is handled as an escape sequence of ESC 'O' */
1867           c1 = 'O';
1868           goto label_escape_sequence;
1869
1870         case ISO_control_sequence_introducer:
1871           /* CSI is handled as an escape sequence of ESC '[' ...  */
1872           c1 = '[';
1873           goto label_escape_sequence;
1874
1875         case ISO_escape:
1876           ONE_MORE_BYTE (c1);
1877         label_escape_sequence:
1878           /* Escape sequences handled by Emacs are invocation,
1879              designation, direction specification, and character
1880              composition specification.  */
1881           switch (c1)
1882             {
1883             case '&':           /* revision of following character set */
1884               ONE_MORE_BYTE (c1);
1885               if (!(c1 >= '@' && c1 <= '~'))
1886                 goto label_invalid_code;
1887               ONE_MORE_BYTE (c1);
1888               if (c1 != ISO_CODE_ESC)
1889                 goto label_invalid_code;
1890               ONE_MORE_BYTE (c1);
1891               goto label_escape_sequence;
1892
1893             case '$':           /* designation of 2-byte character set */
1894               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1895                 goto label_invalid_code;
1896               ONE_MORE_BYTE (c1);
1897               if (c1 >= '@' && c1 <= 'B')
1898                 {       /* designation of JISX0208.1978, GB2312.1980,
1899                            or JISX0208.1980 */
1900                   DECODE_DESIGNATION (0, 2, 94, c1);
1901                 }
1902               else if (c1 >= 0x28 && c1 <= 0x2B)
1903                 {       /* designation of DIMENSION2_CHARS94 character set */
1904                   ONE_MORE_BYTE (c2);
1905                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1906                 }
1907               else if (c1 >= 0x2C && c1 <= 0x2F)
1908                 {       /* designation of DIMENSION2_CHARS96 character set */
1909                   ONE_MORE_BYTE (c2);
1910                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1911                 }
1912               else
1913                 goto label_invalid_code;
1914               /* We must update these variables now.  */
1915               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1916               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1917               continue;
1918
1919             case 'n':           /* invocation of locking-shift-2 */
1920               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1921                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1922                 goto label_invalid_code;
1923               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1924               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1925               continue;
1926
1927             case 'o':           /* invocation of locking-shift-3 */
1928               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1929                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1930                 goto label_invalid_code;
1931               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1932               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1933               continue;
1934
1935             case 'N':           /* invocation of single-shift-2 */
1936               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1937                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1938                 goto label_invalid_code;
1939               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1940               ONE_MORE_BYTE (c1);
1941               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1942                 goto label_invalid_code;
1943               break;
1944
1945             case 'O':           /* invocation of single-shift-3 */
1946               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1947                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1948                 goto label_invalid_code;
1949               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1950               ONE_MORE_BYTE (c1);
1951               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1952                 goto label_invalid_code;
1953               break;
1954
1955             case '0': case '2': case '3': case '4': /* start composition */
1956               DECODE_COMPOSITION_START (c1);
1957               continue;
1958
1959             case '1':           /* end composition */
1960               DECODE_COMPOSITION_END (c1);
1961               continue;
1962
1963             case '[':           /* specification of direction */
1964               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1965                 goto label_invalid_code;
1966               /* For the moment, nested direction is not supported.
1967                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1968                  left-to-right, and nonzero means right-to-left.  */
1969               ONE_MORE_BYTE (c1);
1970               switch (c1)
1971                 {
1972                 case ']':       /* end of the current direction */
1973                   coding->mode &= ~CODING_MODE_DIRECTION;
1974
1975                 case '0':       /* end of the current direction */
1976                 case '1':       /* start of left-to-right direction */
1977                   ONE_MORE_BYTE (c1);
1978                   if (c1 == ']')
1979                     coding->mode &= ~CODING_MODE_DIRECTION;
1980                   else
1981                     goto label_invalid_code;
1982                   break;
1983
1984                 case '2':       /* start of right-to-left direction */
1985                   ONE_MORE_BYTE (c1);
1986                   if (c1 == ']')
1987                     coding->mode |= CODING_MODE_DIRECTION;
1988                   else
1989                     goto label_invalid_code;
1990                   break;
1991
1992                 default:
1993                   goto label_invalid_code;
1994                 }
1995               continue;
1996
1997             default:
1998               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1999                 goto label_invalid_code;
2000               if (c1 >= 0x28 && c1 <= 0x2B)
2001                 {       /* designation of DIMENSION1_CHARS94 character set */
2002                   ONE_MORE_BYTE (c2);
2003                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2004                 }
2005               else if (c1 >= 0x2C && c1 <= 0x2F)
2006                 {       /* designation of DIMENSION1_CHARS96 character set */
2007                   ONE_MORE_BYTE (c2);
2008                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2009                 }
2010               else
2011                 goto label_invalid_code;
2012               /* We must update these variables now.  */
2013               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2014               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2015               continue;
2016             }
2017         }
2018
2019       /* Now we know CHARSET and 1st position code C1 of a character.
2020          Produce a multibyte sequence for that character while getting
2021          2nd position code C2 if necessary.  */
2022       if (CHARSET_DIMENSION (charset) == 2)
2023         {
2024           ONE_MORE_BYTE (c2);
2025           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2026             /* C2 is not in a valid range.  */
2027             goto label_invalid_code;
2028         }
2029       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2030       EMIT_CHAR (c);
2031       continue;
2032
2033     label_invalid_code:
2034       coding->errors++;
2035       if (COMPOSING_P (coding))
2036         DECODE_COMPOSITION_END ('1');
2037       src = src_base;
2038       c = *src++;
2039       EMIT_CHAR (c);
2040     }
2041
2042  label_end_of_loop:
2043   coding->consumed = coding->consumed_char = src_base - source;
2044   coding->produced = dst - destination;
2045   return;
2046 }
2047
2048
2049 /* ISO2022 encoding stuff.  */
2050
2051 /*
2052    It is not enough to say just "ISO2022" on encoding, we have to
2053    specify more details.  In Emacs, each ISO2022 coding system
2054    variant has the following specifications:
2055         1. Initial designation to G0 through G3.
2056         2. Allows short-form designation?
2057         3. ASCII should be designated to G0 before control characters?
2058         4. ASCII should be designated to G0 at end of line?
2059         5. 7-bit environment or 8-bit environment?
2060         6. Use locking-shift?
2061         7. Use Single-shift?
2062    And the following two are only for Japanese:
2063         8. Use ASCII in place of JISX0201-1976-Roman?
2064         9. Use JISX0208-1983 in place of JISX0208-1978?
2065    These specifications are encoded in `coding->flags' as flag bits
2066    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2067    details.
2068 */
2069
2070 /* Produce codes (an escape sequence) for designating CHARSET to graphic
2071    register REG at DST, advance DST, and record CHARSET as the current
2072    designation of REG in CODING.  For a multi-byte 94-char CHARSET going to
2073    register 0 with <final-char> '@', 'A', or 'B', use short form if allowed.  */
2074
2075 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2076   do {                                                                  \
2077     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2078     char *intermediate_char_94 = "()*+";  /* indexed by REG */          \
2079     char *intermediate_char_96 = ",-./";  /* indexed by REG */          \
2080     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2081                                                                         \
2082     if (revision < 255)  /* prefix with ESC & <'@' + revision> */       \
2083       {                                                                 \
2084         *dst++ = ISO_CODE_ESC;                                          \
2085         *dst++ = '&';                                                   \
2086         *dst++ = '@' + revision;                                        \
2087       }                                                                 \
2088     *dst++ = ISO_CODE_ESC;                                              \
2089     if (CHARSET_DIMENSION (charset) == 1)                               \
2090       {                                                                 \
2091         if (CHARSET_CHARS (charset) == 94)                              \
2092           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2093         else                                                            \
2094           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2095       }                                                                 \
2096     else                                                                \
2097       {                                                                 \
2098         *dst++ = '$';  /* emitted whenever the dimension is not 1 */    \
2099         if (CHARSET_CHARS (charset) == 94)                              \
2100           {  /* Short form omits the intermediate character.  */        \
2101             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
2102                 || reg != 0                                             \
2103                 || final_char < '@' || final_char > 'B')                \
2104               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2105           }                                                             \
2106         else                                                            \
2107           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2108       }                                                                 \
2109     *dst++ = final_char;                                                \
2110     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
2111   } while (0)
2112
2113 /* The following two macros produce codes (control character or escape
2114    sequence) for ISO2022 single-shift functions (single-shift-2 and
2115    single-shift-3), and set the single-shifting flag of `coding'.  */
2116
2117 #define ENCODE_SINGLE_SHIFT_2                           \
2118   do {                                                  \
2119     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2120       *dst++ = ISO_CODE_ESC, *dst++ = 'N';  /* ESC N */ \
2121     else                                                \
2122       *dst++ = ISO_CODE_SS2;                            \
2123     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2124   } while (0)
2125
2126 #define ENCODE_SINGLE_SHIFT_3                           \
2127   do {                                                  \
2128     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2129       *dst++ = ISO_CODE_ESC, *dst++ = 'O';  /* ESC O */ \
2130     else                                                \
2131       *dst++ = ISO_CODE_SS3;                            \
2132     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2133   } while (0)
2134
2135 /* The following four macros produce codes (control character or escape
2136    sequence) for ISO2022 locking-shift functions (shift-in, shift-out,
2137    locking-shift-2, -3) and record the register now invoked to plane 0.  */
2138
2139 #define ENCODE_SHIFT_IN                         \
2140   do {                                          \
2141     *dst++ = ISO_CODE_SI;                       \
2142     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2143   } while (0)
2144
2145 #define ENCODE_SHIFT_OUT                        \
2146   do {                                          \
2147     *dst++ = ISO_CODE_SO;                       \
2148     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2149   } while (0)
2150
2151 #define ENCODE_LOCKING_SHIFT_2                  \
2152   do {                                          \
2153     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
2154     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
2155   } while (0)
2156
2157 #define ENCODE_LOCKING_SHIFT_3                  \
2158   do {                                          \
2159     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
2160     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
2161   } while (0)
2162
2163 /* Produce codes for a DIMENSION1 character whose character set is
2164    CHARSET and whose position-code is C1, using `coding' and `dst' of the
2165    caller.  Designation and invocation codes are produced first if needed.  */
2166
2167 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2168   do {                                                                  \
2169     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2170       {  /* Single-shifting flag is set: emit C1 and clear the flag.  */ \
2171         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2172           *dst++ = c1 & 0x7F;                                           \
2173         else                                                            \
2174           *dst++ = c1 | 0x80;                                           \
2175         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2176         break;                                                          \
2177       }                                                                 \
2178     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2179       {  /* CHARSET is on plane 0: emit with the high bit cleared.  */  \
2180         *dst++ = c1 & 0x7F;                                             \
2181         break;                                                          \
2182       }                                                                 \
2183     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2184       {  /* CHARSET is on plane 1: emit with the high bit set.  */      \
2185         *dst++ = c1 | 0x80;                                             \
2186         break;                                                          \
2187       }                                                                 \
2188     else                                                                \
2189       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2190          must invoke it, or, at first, designate it to some graphic     \
2191          register.  Then repeat the loop to actually produce the        \
2192          character.  */                                                 \
2193       dst = encode_invocation_designation (charset, coding, dst);       \
2194   } while (1)
2195
2196 /* Produce codes for a DIMENSION2 character whose character set is
2197    CHARSET and whose position-codes are C1 and C2, using `coding' and `dst'
2198    of the caller.  Designation and invocation codes come first if needed.  */
2199
2200 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2201   do {                                                                  \
2202     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2203       {  /* Single-shifting flag is set: emit both and clear it.  */    \
2204         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2205           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
2206         else                                                            \
2207           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
2208         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2209         break;                                                          \
2210       }                                                                 \
2211     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2212       {  /* CHARSET is on plane 0: emit with the high bits cleared.  */ \
2213         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
2214         break;                                                          \
2215       }                                                                 \
2216     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2217       {  /* CHARSET is on plane 1: emit with the high bits set.  */     \
2218         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
2219         break;                                                          \
2220       }                                                                 \
2221     else                                                                \
2222       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2223          must invoke it, or, at first, designate it to some graphic     \
2224          register.  Then repeat the loop to actually produce the        \
2225          character.  */                                                 \
2226       dst = encode_invocation_designation (charset, coding, dst);       \
2227   } while (1)
2228
2229 #define ENCODE_ISO_CHARACTER(c)  /* Produce codes for character C.  */ \
2230   do {                                                          \
2231     int charset, c1, c2;                                        \
2232                                                                 \
2233     SPLIT_CHAR (c, charset, c1, c2);                            \
2234     if (CHARSET_DEFINED_P (charset))                            \
2235       {                                                         \
2236         if (CHARSET_DIMENSION (charset) == 1)                   \
2237           {  /* ASCII becomes JISX0201-Roman if flagged.  */    \
2238             if (charset == CHARSET_ASCII                        \
2239                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
2240               charset = charset_latin_jisx0201;                 \
2241             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
2242           }                                                     \
2243         else                                                    \
2244           {  /* JISX0208 becomes JISX0208-1978 if flagged.  */  \
2245             if (charset == charset_jisx0208                     \
2246                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
2247               charset = charset_jisx0208_1978;                  \
2248             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
2249           }                                                     \
2250       }                                                         \
2251     else                                                        \
2252       {  /* Undefined charset: emit C1 (and C2 if >= 0) raw.  */ \
2253         *dst++ = c1;                                            \
2254         if (c2 >= 0)                                            \
2255           *dst++ = c2;                                          \
2256       }                                                         \
2257   } while (0)
2258
2259
2260 /* Instead of encoding character C, produce one or two `?'s (two when the width of C's charset exceeds 1).  */
2261
2262 #define ENCODE_UNSAFE_CHARACTER(c)                                      \
2263   do {                                                                  \
2264     ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);       \
2265     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                           \
2266       ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
2267   } while (0)
2268
2269
2270 /* Produce, at the place pointed to by DST, whatever designation and
2271    invocation codes are needed to use CHARSET.  The element `spec.iso2022'
2272    of *CODING is updated accordingly.  Return the new DST.  */
2273
2274 unsigned char *
2275 encode_invocation_designation (charset, coding, dst)
2276      int charset;
2277      struct coding_system *coding;
2278      unsigned char *dst;
2279 {
2280   int reg;                      /* graphic register number */
2281
2282   /* First, look for a register CHARSET is already designated to.  */
2283   for (reg = 0; reg < 4; reg++)
2284     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2285       break;
2286
2287   if (reg >= 4)
2288     {
2289       /* CHARSET is not yet designated to any graphic registers.  */
2290       /* Use the register requested for CHARSET, if there is one.  */
2291       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2292       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2293         /* Since CHARSET requests no special designation, designate it
2294            to graphic register 0.  */
2295         reg = 0;
2296
2297       ENCODE_DESIGNATION (charset, reg, coding);
2298     }
2299
2300   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2301       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2302     {
2303       /* Since the graphic register REG is not invoked to any graphic
2304          planes, invoke it to graphic plane 0 (or single-shift it).  */
2305       switch (reg)
2306         {
2307         case 0:                 /* graphic register 0 */
2308           ENCODE_SHIFT_IN;
2309           break;
2310
2311         case 1:                 /* graphic register 1 */
2312           ENCODE_SHIFT_OUT;
2313           break;
2314
2315         case 2:                 /* graphic register 2: single or locking shift */
2316           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2317             ENCODE_SINGLE_SHIFT_2;
2318           else
2319             ENCODE_LOCKING_SHIFT_2;
2320           break;
2321
2322         case 3:                 /* graphic register 3: single or locking shift */
2323           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2324             ENCODE_SINGLE_SHIFT_3;
2325           else
2326             ENCODE_LOCKING_SHIFT_3;
2327           break;
2328         }
2329     }
2330
2331   return dst;
2332 }
2333
2334 /* Produce 2-byte codes for encoded composition rule RULE: RULE is split into GREF and NREF, emitted as 32 + 81 + GREF and 32 + NREF.  */
2335
2336 #define ENCODE_COMPOSITION_RULE(rule)           \
2337   do {                                          \
2338     int gref, nref;                             \
2339     COMPOSITION_DECODE_RULE (rule, gref, nref); \
2340     *dst++ = 32 + 81 + gref;                    \
2341     *dst++ = 32 + nref;                         \
2342   } while (0)
2343
2344 /* Produce codes for indicating the start of a composition sequence
2345    (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
2346    which specify information about the composition; DATA[3] becomes
2347    `coding->composing'.  See coding.h for the format of DATA.  */
2348
2349 #define ENCODE_COMPOSITION_START(coding, data)                          \
2350   do {                                                                  \
2351     coding->composing = data[3];                                        \
2352     *dst++ = ISO_CODE_ESC;                                              \
2353     if (coding->composing == COMPOSITION_RELATIVE)                      \
2354       *dst++ = '0';                                                     \
2355     else                                                                \
2356       {  /* Components follow: start reading them at DATA[4].  */       \
2357         *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
2358                   ? '3' : '4');                                         \
2359         coding->cmp_data_index = coding->cmp_data_start + 4;            \
2360         coding->composition_rule_follows = 0;                           \
2361       }                                                                 \
2362   } while (0)
2363
2364 /* Produce codes (ESC 1) for indicating the end of the current composition, and advance past its DATA (DATA[0] elements).  */
2365
2366 #define ENCODE_COMPOSITION_END(coding, data)                    \
2367   do {                                                          \
2368     *dst++ = ISO_CODE_ESC;                                      \
2369     *dst++ = '1';                                               \
2370     coding->cmp_data_start += data[0];                          \
2371     coding->composing = COMPOSITION_NO;                         \
2372     if (coding->cmp_data_start == coding->cmp_data->used        \
2373         && coding->cmp_data->next)                              \
2374       {  /* This chunk is used up: go on to the next one.  */   \
2375         coding->cmp_data = coding->cmp_data->next;              \
2376         coding->cmp_data_start = 0;                             \
2377       }                                                         \
2378   } while (0)
2379
2380 /* Produce composition start sequence ESC 0.  Here, this sequence
2381    doesn't mean the start of a new composition but means that we have
2382    just produced components (alternate chars and composition rules) of
2383    the composition and the actual text follows in SRC.  CODING is
         switched to COMPOSITION_RELATIVE state.  */
2384
2385 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2386   do {                                          \
2387     *dst++ = ISO_CODE_ESC;                      \
2388     *dst++ = '0';                               \
2389     coding->composing = COMPOSITION_RELATIVE;   \
2390   } while (0)
2391
2392 /* The following three macros produce codes for indicating direction
2393    of text.  They use `coding' and `dst' of the enclosing scope.  */
2394 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2395   do {  /* ESC [ in a 7-bit environment, else CSI.  */  \
2396     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2397       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
2398     else                                                \
2399       *dst++ = ISO_CODE_CSI;                            \
2400   } while (0)
2401
2402 #define ENCODE_DIRECTION_R2L    \
2403   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2'; *dst++ = ']'; } while (0)
2404
2405 #define ENCODE_DIRECTION_L2R    \
2406   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0'; *dst++ = ']'; } while (0)
2407
2408 /* Produce codes for designation and invocation to reset the graphic
2409    planes and registers to initial state: shift-in, then re-designate every register that differs from its initial charset.  */
2410 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2411   do {                                                                      \
2412     int reg;                                                                \
2413     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2414       ENCODE_SHIFT_IN;                                                      \
2415     for (reg = 0; reg < 4; reg++)  /* negative initial value: skipped */    \
2416       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
2417           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
2418               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
2419         ENCODE_DESIGNATION                                                  \
2420           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2421   } while (0)
2422
2423 /* Produce designation sequences of charsets in the line started from
2424    SRC to a place pointed by DST, and return updated DST.  Only charsets
2425    that request a specific graphic register are considered.
2426    If the current block ends before any end-of-line, we may fail to
2427    find all the necessary designations.  */
2428
2429 static unsigned char *
2430 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2431      struct coding_system *coding;
2432      Lisp_Object translation_table;
2433      unsigned char *src, *src_end, *dst;
2434 {
2435   int charset, c, found = 0, reg;
2436   /* Table of charsets to be designated to each graphic register; -1 means none.  */
2437   int r[4];
2438
2439   for (reg = 0; reg < 4; reg++)
2440     r[reg] = -1;
2441
2442   while (found < 4)             /* stop once all four registers are claimed */
2443     {
2444       ONE_MORE_CHAR (c);
2445       if (c == '\n')
2446         break;
2447
2448       charset = CHAR_CHARSET (c);
2449       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2450       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2451         {                       /* the first charset seen for REG wins */
2452           found++;
2453           r[reg] = charset;
2454         }
2455     }
2456
2457  label_end_of_loop:             /* NOTE(review): presumably also the jump target of ONE_MORE_CHAR -- confirm */
2458   if (found)
2459     {
2460       for (reg = 0; reg < 4; reg++)
2461         if (r[reg] >= 0
2462             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2463           ENCODE_DESIGNATION (r[reg], reg, coding);
2464     }
2465
2466   return dst;
2467 }
2468
2469 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2470
2471 static void
2472 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2473      struct coding_system *coding;
2474      unsigned char *source, *destination;
2475      int src_bytes, dst_bytes;
2476 {
2477   unsigned char *src = source;
2478   unsigned char *src_end = source + src_bytes;
2479   unsigned char *dst = destination;
2480   unsigned char *dst_end = destination + dst_bytes;
2481   /* Since at most 20 bytes are produced by each iteration, we subtract
2482      19 from DST_END so that overflow needs to be checked only at the
2483      head of the loop.  */
2484   unsigned char *adjusted_dst_end = dst_end - 19;
2485   /* SRC_BASE remembers the start position in source in each loop.
2486      The loop will be exited when there's not enough source text to
2487      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2488      there's not enough destination area to produce encoded codes
2489      (within macro EMIT_BYTES).  */
2490   unsigned char *src_base;
2491   int c;
2492   Lisp_Object translation_table;
2493   Lisp_Object safe_chars;
2494
2495   safe_chars = coding_safe_chars (coding);
2496
2497   if (NILP (Venable_character_translation))
2498     translation_table = Qnil;
2499   else
2500     {
2501       translation_table = coding->translation_table_for_encode;
2502       if (NILP (translation_table))
2503         translation_table = Vstandard_translation_table_for_encode;
2504     }
2505
2506   coding->consumed_char = 0;
2507   coding->errors = 0;
2508   while (1)
2509     {
2510       src_base = src;
2511       /* When DST_BYTES is zero the limit is SRC - 19 instead.  NOTE(review): presumably in-place encoding -- confirm.  */
2512       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2513         {
2514           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2515           break;
2516         }
2517
2518       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2519           && CODING_SPEC_ISO_BOL (coding))
2520         {
2521           /* We have to produce designation sequences if any now.  */
2522           dst = encode_designation_at_bol (coding, translation_table,
2523                                            src, src_end, dst);
2524           CODING_SPEC_ISO_BOL (coding) = 0;
2525         }
2526
2527       /* Check composition start and end.  */
2528       if (coding->composing != COMPOSITION_DISABLED
2529           && coding->cmp_data_start < coding->cmp_data->used)
2530         {
2531           struct composition_data *cmp_data = coding->cmp_data;
2532           int *data = cmp_data->data + coding->cmp_data_start;
2533           int this_pos = cmp_data->char_offset + coding->consumed_char;
2534
2535           if (coding->composing == COMPOSITION_RELATIVE)
2536             {
2537               if (this_pos == data[2])
2538                 {
2539                   ENCODE_COMPOSITION_END (coding, data);
2540                   cmp_data = coding->cmp_data;
2541                   data = cmp_data->data + coding->cmp_data_start;
2542                 }
2543             }
2544           else if (COMPOSING_P (coding))
2545             {
2546               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS  */
2547               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2548                 /* We have consumed components of the composition.
2549                    What follows in SRC is the composition's base
2550                    text.  */
2551                 ENCODE_COMPOSITION_FAKE_START (coding);
2552               else
2553                 {
2554                   int c = cmp_data->data[coding->cmp_data_index++]; /* next component; shadows the outer C */
2555                   if (coding->composition_rule_follows)
2556                     {
2557                       ENCODE_COMPOSITION_RULE (c);
2558                       coding->composition_rule_follows = 0;
2559                     }
2560                   else
2561                     {
2562                       if (coding->flags & CODING_FLAG_ISO_SAFE
2563                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2564                         ENCODE_UNSAFE_CHARACTER (c);
2565                       else
2566                         ENCODE_ISO_CHARACTER (c);
2567                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2568                         coding->composition_rule_follows = 1;
2569                     }
2570                   continue;     /* no source character was consumed */
2571                 }
2572             }
2573           if (!COMPOSING_P (coding))
2574             {
2575               if (this_pos == data[1])
2576                 {
2577                   ENCODE_COMPOSITION_START (coding, data);
2578                   continue;
2579                 }
2580             }
2581         }
2582
2583       ONE_MORE_CHAR (c);
2584
2585       /* Now encode the character C.  */
2586       if (c < 0x20 || c == 0x7F)
2587         {
2588           if (c == '\r')
2589             {
2590               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2591                 {
2592                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2593                     ENCODE_RESET_PLANE_AND_REGISTER;
2594                   *dst++ = c;
2595                   continue;
2596                 }
2597               /* fall down to treat '\r' as '\n' ...  */
2598               c = '\n';
2599             }
2600           if (c == '\n')
2601             {
2602               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2603                 ENCODE_RESET_PLANE_AND_REGISTER;
2604               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2605                 bcopy (coding->spec.iso2022.initial_designation,
2606                        coding->spec.iso2022.current_designation,
2607                        sizeof coding->spec.iso2022.initial_designation);
2608               if (coding->eol_type == CODING_EOL_LF
2609                   || coding->eol_type == CODING_EOL_UNDECIDED)
2610                 *dst++ = ISO_CODE_LF;
2611               else if (coding->eol_type == CODING_EOL_CRLF)
2612                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2613               else
2614                 *dst++ = ISO_CODE_CR;
2615               CODING_SPEC_ISO_BOL (coding) = 1;
2616             }
2617           else
2618             {
2619               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2620                 ENCODE_RESET_PLANE_AND_REGISTER;
2621               *dst++ = c;
2622             }
2623         }
2624       else if (ASCII_BYTE_P (c))
2625         ENCODE_ISO_CHARACTER (c);
2626       else if (SINGLE_BYTE_CHAR_P (c))
2627         {                       /* emitted as is, and counted as an error */
2628           *dst++ = c;
2629           coding->errors++;
2630         }
2631       else if (coding->flags & CODING_FLAG_ISO_SAFE
2632                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2633         ENCODE_UNSAFE_CHARACTER (c);
2634       else
2635         ENCODE_ISO_CHARACTER (c);
2636
2637       coding->consumed_char++;
2638     }
2639
2640  label_end_of_loop:
2641   coding->consumed = src_base - source; /* measured from the start of the last iteration */
2642   coding->produced = coding->produced_char = dst - destination;
2643 }
2644
2645 \f
2646 /*** 4. SJIS and BIG5 handlers ***/
2647
2648 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2649    quite widely.  So, for the moment, Emacs supports them in the bare
2650    C code.  But, in the future, they may be supported only by CCL.  */
2651
2652 /* SJIS is a coding system encoding three character sets: ASCII, right
2653    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2654    as is.  A character of charset katakana-jisx0201 is encoded by
2655    "position-code + 0x80".  A character of charset japanese-jisx0208
2656    is encoded in two bytes, but the two position-codes are divided and
2657    shifted so that they fit in the ranges below.
2658
2659    --- CODE RANGE of SJIS ---
2660    (character set)      (range)
2661    ASCII                0x00 .. 0x7F
2662    KATAKANA-JISX0201    0xA1 .. 0xDF
2663    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2664             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2665    -------------------------------
2666
2667 */
2668
2669 /* BIG5 is a coding system encoding two character sets: ASCII and
2670    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2671    character set and is encoded in two bytes.
2672
2673    --- CODE RANGE of BIG5 ---
2674    (character set)      (range)
2675    ASCII                0x00 .. 0x7F
2676    Big5 (1st byte)      0xA1 .. 0xFE
2677         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2678    --------------------------
2679
2680    Since Big5 contains more characters than a single Emacs charset
2681    can hold (at most 96x96), it can't be handled as one
2682    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2683    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2684    contains frequently used characters and the latter contains less
2685    frequently used characters.  */
2686
2687 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2688    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2689    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2690    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2691
2692 /* Number of Big5 characters which have the same code in 1st byte (94 + 63 = 157).  */
2693 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2694
2695 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2696   do {  /* TEMP is the linear index of (B1, B2) from 0xA1 0x40.  */     \
2697     unsigned int temp                                                   \
2698       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
2699     if (b1 < 0xC9)                                                      \
2700       charset = charset_big5_1;                                         \
2701     else                                                                \
2702       {  /* Rows from 0xC9 on belong to the second charset.  */         \
2703         charset = charset_big5_2;                                       \
2704         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
2705       }                                                                 \
2706     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
2707     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
2708   } while (0)
2709
2710 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2711   do {  /* Inverse of DECODE_BIG5.  */                                  \
2712     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
2713     if (charset == charset_big5_2)                                      \
2714       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2715     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
2716     b2 = temp % BIG5_SAME_ROW;                                          \
2717     b2 += b2 < 0x3F ? 0x40 : 0x62;  /* low range 0x40.., high 0xA1.. */ \
2718   } while (0)
2719
2720 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2721    Check if a text is encoded in SJIS.  If it is, return
2722    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2723
2724 static int
2725 detect_coding_sjis (src, src_end, multibytep)
2726      unsigned char *src, *src_end;
2727      int multibytep;
2728 {
2729   int c;
2730   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE (presumably it refers to `coding' -- confirm).  */
2731   struct coding_system dummy_coding;
2732   struct coding_system *coding = &dummy_coding;
2733
2734   while (1)
2735     {
2736       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2737       if (c < 0x80)             /* ASCII range */
2738         continue;
2739       if (c == 0x80 || c == 0xA0 || c > 0xEF)   /* outside every SJIS range */
2740         return 0;
2741       if (c <= 0x9F || c >= 0xE0)       /* 1st byte of JISX0208 */
2742         {
2743           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2744           if (c < 0x40 || c == 0x7F || c > 0xFC)        /* invalid 2nd byte */
2745             return 0;
2746         }
2747     }
2748  label_end_of_loop:             /* NOTE(review): presumably reached from ONE_MORE_BYTE_CHECK_MULTIBYTE at end of input -- confirm */
2749   return CODING_CATEGORY_MASK_SJIS;
2750 }
2751
2752 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2753    Check if a text is encoded in BIG5.  If it is, return
2754    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2755
2756 static int
2757 detect_coding_big5 (src, src_end, multibytep)
2758      unsigned char *src, *src_end;
2759      int multibytep;
2760 {
2761   int c;
2762   /* Dummy for ONE_MORE_BYTE.  */
2763   struct coding_system dummy_coding;
2764   struct coding_system *coding = &dummy_coding;
2765
2766   while (1)
2767     {
2768       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2769       if (c < 0x80)
2770         continue;
2771       if (c < 0xA1 || c > 0xFE)
2772         return 0;
2773       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2774       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2775         return 0;
2776     }
2777  label_end_of_loop:
2778   return CODING_CATEGORY_MASK_BIG5;
2779 }
2780
2781 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2782    Check if a text is encoded in UTF-8.  If it is, return
2783    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2784
2785 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2786 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2787 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2788 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2789 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2790 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2791 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2792
2793 static int
2794 detect_coding_utf_8 (src, src_end, multibytep)
2795      unsigned char *src, *src_end;
2796      int multibytep;
2797 {
2798   unsigned char c;
2799   int seq_maybe_bytes;
2800   /* Dummy for ONE_MORE_BYTE.  */
2801   struct coding_system dummy_coding;
2802   struct coding_system *coding = &dummy_coding;
2803
2804   while (1)
2805     {
2806       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2807       if (UTF_8_1_OCTET_P (c))
2808         continue;
2809       else if (UTF_8_2_OCTET_LEADING_P (c))
2810         seq_maybe_bytes = 1;
2811       else if (UTF_8_3_OCTET_LEADING_P (c))
2812         seq_maybe_bytes = 2;
2813       else if (UTF_8_4_OCTET_LEADING_P (c))
2814         seq_maybe_bytes = 3;
2815       else if (UTF_8_5_OCTET_LEADING_P (c))
2816         seq_maybe_bytes = 4;
2817       else if (UTF_8_6_OCTET_LEADING_P (c))
2818         seq_maybe_bytes = 5;
2819       else
2820         return 0;
2821
2822       do
2823         {
2824           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2825           if (!UTF_8_EXTRA_OCTET_P (c))
2826             return 0;
2827           seq_maybe_bytes--;
2828         }
2829       while (seq_maybe_bytes > 0);
2830     }
2831
2832  label_end_of_loop:
2833   return CODING_CATEGORY_MASK_UTF_8;
2834 }
2835
2836 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2837    Check if a text starts with a UTF-16 byte order mark.  If the
2838    first two bytes are FE FF, return CODING_CATEGORY_MASK_UTF_16_BE;
2839    if they are FF FE, return CODING_CATEGORY_MASK_UTF_16_LE;
2840    else return 0.  */
2841
/* Nonzero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   Surrogates are identified by their top six bits, hence the mask
   0xFC00.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2851
2852 static int
2853 detect_coding_utf_16 (src, src_end, multibytep)
2854      unsigned char *src, *src_end;
2855      int multibytep;
2856 {
2857   unsigned char c1, c2;
2858   /* Dummy for TWO_MORE_BYTES.  */
2859   struct coding_system dummy_coding;
2860   struct coding_system *coding = &dummy_coding;
2861
2862   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2863   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2864
2865   if ((c1 == 0xFF) && (c2 == 0xFE))
2866     return CODING_CATEGORY_MASK_UTF_16_LE;
2867   else if ((c1 == 0xFE) && (c2 == 0xFF))
2868     return CODING_CATEGORY_MASK_UTF_16_BE;
2869
2870  label_end_of_loop:
2871   return 0;
2872 }
2873
2874 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2875    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2876
2877 static void
2878 decode_coding_sjis_big5 (coding, source, destination,
2879                          src_bytes, dst_bytes, sjis_p)
2880      struct coding_system *coding;
2881      unsigned char *source, *destination;
2882      int src_bytes, dst_bytes;
2883      int sjis_p;
2884 {
2885   unsigned char *src = source;
2886   unsigned char *src_end = source + src_bytes;
2887   unsigned char *dst = destination;
2888   unsigned char *dst_end = destination + dst_bytes;
2889   /* SRC_BASE remembers the start position in source in each loop.
2890      The loop will be exited when there's not enough source code
2891      (within macro ONE_MORE_BYTE), or when there's not enough
2892      destination area to produce a character (within macro
2893      EMIT_CHAR).  */
2894   unsigned char *src_base;
2895   Lisp_Object translation_table;
2896
2897   if (NILP (Venable_character_translation))
2898     translation_table = Qnil;
2899   else
2900     {
2901       translation_table = coding->translation_table_for_decode;
2902       if (NILP (translation_table))
2903         translation_table = Vstandard_translation_table_for_decode;
2904     }
2905
2906   coding->produced_char = 0;
2907   while (1)
2908     {
2909       int c, charset, c1, c2;
2910
2911       src_base = src;
2912       ONE_MORE_BYTE (c1);
2913
2914       if (c1 < 0x80)
2915         {
2916           charset = CHARSET_ASCII;
2917           if (c1 < 0x20)
2918             {
2919               if (c1 == '\r')
2920                 {
2921                   if (coding->eol_type == CODING_EOL_CRLF)
2922                     {
2923                       ONE_MORE_BYTE (c2);
2924                       if (c2 == '\n')
2925                         c1 = c2;
2926                       else
2927                         /* To process C2 again, SRC is subtracted by 1.  */
2928                         src--;
2929                     }
2930                   else if (coding->eol_type == CODING_EOL_CR)
2931                     c1 = '\n';
2932                 }
2933               else if (c1 == '\n'
2934                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2935                        && (coding->eol_type == CODING_EOL_CR
2936                            || coding->eol_type == CODING_EOL_CRLF))
2937                 {
2938                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2939                   goto label_end_of_loop;
2940                 }
2941             }
2942         }
2943       else
2944         {
2945           if (sjis_p)
2946             {
2947               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
2948                 goto label_invalid_code;
2949               if (c1 <= 0x9F || c1 >= 0xE0)
2950                 {
2951                   /* SJIS -> JISX0208 */
2952                   ONE_MORE_BYTE (c2);
2953                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2954                     goto label_invalid_code;
2955                   DECODE_SJIS (c1, c2, c1, c2);
2956                   charset = charset_jisx0208;
2957                 }
2958               else
2959                 /* SJIS -> JISX0201-Kana */
2960                 charset = charset_katakana_jisx0201;
2961             }
2962           else
2963             {
2964               /* BIG5 -> Big5 */
2965               if (c1 < 0xA0 || c1 > 0xFE)
2966                 goto label_invalid_code;
2967               ONE_MORE_BYTE (c2);
2968               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2969                 goto label_invalid_code;
2970               DECODE_BIG5 (c1, c2, charset, c1, c2);
2971             }
2972         }
2973
2974       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2975       EMIT_CHAR (c);
2976       continue;
2977
2978     label_invalid_code:
2979       coding->errors++;
2980       src = src_base;
2981       c = *src++;
2982       EMIT_CHAR (c);
2983     }
2984
2985  label_end_of_loop:
2986   coding->consumed = coding->consumed_char = src_base - source;
2987   coding->produced = dst - destination;
2988   return;
2989 }
2990
2991 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2992    This function can encode charsets `ascii', `katakana-jisx0201',
2993    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2994    are sure that all these charsets are registered as official charset
2995    (i.e. do not have extended leading-codes).  Characters of other
2996    charsets are produced without any encoding.  If SJIS_P is 1, encode
2997    SJIS text, else encode BIG5 text.  */
2998
2999 static void
3000 encode_coding_sjis_big5 (coding, source, destination,
3001                          src_bytes, dst_bytes, sjis_p)
3002      struct coding_system *coding;
3003      unsigned char *source, *destination;
3004      int src_bytes, dst_bytes;
3005      int sjis_p;
3006 {
3007   unsigned char *src = source;
3008   unsigned char *src_end = source + src_bytes;
3009   unsigned char *dst = destination;
3010   unsigned char *dst_end = destination + dst_bytes;
3011   /* SRC_BASE remembers the start position in source in each loop.
3012      The loop will be exited when there's not enough source text to
3013      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3014      there's not enough destination area to produce encoded codes
3015      (within macro EMIT_BYTES).  */
3016   unsigned char *src_base;
3017   Lisp_Object translation_table;
3018
3019   if (NILP (Venable_character_translation))
3020     translation_table = Qnil;
3021   else
3022     {
3023       translation_table = coding->translation_table_for_encode;
3024       if (NILP (translation_table))
3025         translation_table = Vstandard_translation_table_for_encode;
3026     }
3027
3028   while (1)
3029     {
3030       int c, charset, c1, c2;
3031
3032       src_base = src;
3033       ONE_MORE_CHAR (c);
3034
3035       /* Now encode the character C.  */
3036       if (SINGLE_BYTE_CHAR_P (c))
3037         {
3038           switch (c)
3039             {
3040             case '\r':
3041               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3042                 {
3043                   EMIT_ONE_BYTE (c);
3044                   break;
3045                 }
3046               c = '\n';
3047             case '\n':
3048               if (coding->eol_type == CODING_EOL_CRLF)
3049                 {
3050                   EMIT_TWO_BYTES ('\r', c);
3051                   break;
3052                 }
3053               else if (coding->eol_type == CODING_EOL_CR)
3054                 c = '\r';
3055             default:
3056               EMIT_ONE_BYTE (c);
3057             }
3058         }
3059       else
3060         {
3061           SPLIT_CHAR (c, charset, c1, c2);
3062           if (sjis_p)
3063             {
3064               if (charset == charset_jisx0208
3065                   || charset == charset_jisx0208_1978)
3066                 {
3067                   ENCODE_SJIS (c1, c2, c1, c2);
3068                   EMIT_TWO_BYTES (c1, c2);
3069                 }
3070               else if (charset == charset_katakana_jisx0201)
3071                 EMIT_ONE_BYTE (c1 | 0x80);
3072               else if (charset == charset_latin_jisx0201)
3073                 EMIT_ONE_BYTE (c1);
3074               else
3075                 /* There's no way other than producing the internal
3076                    codes as is.  */
3077                 EMIT_BYTES (src_base, src);
3078             }
3079           else
3080             {
3081               if (charset == charset_big5_1 || charset == charset_big5_2)
3082                 {
3083                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3084                   EMIT_TWO_BYTES (c1, c2);
3085                 }
3086               else
3087                 /* There's no way other than producing the internal
3088                    codes as is.  */
3089                 EMIT_BYTES (src_base, src);
3090             }
3091         }
3092       coding->consumed_char++;
3093     }
3094
3095  label_end_of_loop:
3096   coding->consumed = src_base - source;
3097   coding->produced = coding->produced_char = dst - destination;
3098 }
3099
3100 \f
3101 /*** 5. CCL handlers ***/
3102
3103 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3104    Check if a text is encoded in a coding system of which
3105    encoder/decoder are written in CCL program.  If it is, return
3106    CODING_CATEGORY_MASK_CCL, else return 0.  */
3107
3108 static int
3109 detect_coding_ccl (src, src_end, multibytep)
3110      unsigned char *src, *src_end;
3111      int multibytep;
3112 {
3113   unsigned char *valid;
3114   int c;
3115   /* Dummy for ONE_MORE_BYTE.  */
3116   struct coding_system dummy_coding;
3117   struct coding_system *coding = &dummy_coding;
3118
3119   /* No coding system is assigned to coding-category-ccl.  */
3120   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3121     return 0;
3122
3123   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3124   while (1)
3125     {
3126       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3127       if (! valid[c])
3128         return 0;
3129     }
3130  label_end_of_loop:
3131   return CODING_CATEGORY_MASK_CCL;
3132 }
3133
3134 \f
3135 /*** 6. End-of-line handlers ***/
3136
3137 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3138
3139 static void
3140 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3141      struct coding_system *coding;
3142      unsigned char *source, *destination;
3143      int src_bytes, dst_bytes;
3144 {
3145   unsigned char *src = source;
3146   unsigned char *dst = destination;
3147   unsigned char *src_end = src + src_bytes;
3148   unsigned char *dst_end = dst + dst_bytes;
3149   Lisp_Object translation_table;
3150   /* SRC_BASE remembers the start position in source in each loop.
3151      The loop will be exited when there's not enough source code
3152      (within macro ONE_MORE_BYTE), or when there's not enough
3153      destination area to produce a character (within macro
3154      EMIT_CHAR).  */
3155   unsigned char *src_base;
3156   int c;
3157
3158   translation_table = Qnil;
3159   switch (coding->eol_type)
3160     {
3161     case CODING_EOL_CRLF:
3162       while (1)
3163         {
3164           src_base = src;
3165           ONE_MORE_BYTE (c);
3166           if (c == '\r')
3167             {
3168               ONE_MORE_BYTE (c);
3169               if (c != '\n')
3170                 {
3171                   src--;
3172                   c = '\r';
3173                 }
3174             }
3175           else if (c == '\n'
3176                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3177             {
3178               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3179               goto label_end_of_loop;
3180             }
3181           EMIT_CHAR (c);
3182         }
3183       break;
3184
3185     case CODING_EOL_CR:
3186       while (1)
3187         {
3188           src_base = src;
3189           ONE_MORE_BYTE (c);
3190           if (c == '\n')
3191             {
3192               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3193                 {
3194                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3195                   goto label_end_of_loop;
3196                 }
3197             }
3198           else if (c == '\r')
3199             c = '\n';
3200           EMIT_CHAR (c);
3201         }
3202       break;
3203
3204     default:                    /* no need for EOL handling */
3205       while (1)
3206         {
3207           src_base = src;
3208           ONE_MORE_BYTE (c);
3209           EMIT_CHAR (c);
3210         }
3211     }
3212
3213  label_end_of_loop:
3214   coding->consumed = coding->consumed_char = src_base - source;
3215   coding->produced = dst - destination;
3216   return;
3217 }
3218
3219 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3220    format of end-of-line according to `coding->eol_type'.  It also
3221    convert multibyte form 8-bit characters to unibyte if
3222    CODING->src_multibyte is nonzero.  If `coding->mode &
3223    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3224    also means end-of-line.  */
3225
3226 static void
3227 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3228      struct coding_system *coding;
3229      const unsigned char *source;
3230      unsigned char *destination;
3231      int src_bytes, dst_bytes;
3232 {
3233   const unsigned char *src = source;
3234   unsigned char *dst = destination;
3235   const unsigned char *src_end = src + src_bytes;
3236   unsigned char *dst_end = dst + dst_bytes;
3237   Lisp_Object translation_table;
3238   /* SRC_BASE remembers the start position in source in each loop.
3239      The loop will be exited when there's not enough source text to
3240      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3241      there's not enough destination area to produce encoded codes
3242      (within macro EMIT_BYTES).  */
3243   const unsigned char *src_base;
3244   unsigned char *tmp;
3245   int c;
3246   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3247
3248   translation_table = Qnil;
3249   if (coding->src_multibyte
3250       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3251     {
3252       src_end--;
3253       src_bytes--;
3254       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3255     }
3256
3257   if (coding->eol_type == CODING_EOL_CRLF)
3258     {
3259       while (src < src_end)
3260         {
3261           src_base = src;
3262           c = *src++;
3263           if (c >= 0x20)
3264             EMIT_ONE_BYTE (c);
3265           else if (c == '\n' || (c == '\r' && selective_display))
3266             EMIT_TWO_BYTES ('\r', '\n');
3267           else
3268             EMIT_ONE_BYTE (c);
3269         }
3270       src_base = src;
3271     label_end_of_loop:
3272       ;
3273     }
3274   else
3275     {
3276       if (!dst_bytes || src_bytes <= dst_bytes)
3277         {
3278           safe_bcopy (src, dst, src_bytes);
3279           src_base = src_end;
3280           dst += src_bytes;
3281         }
3282       else
3283         {
3284           if (coding->src_multibyte
3285               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3286             dst_bytes--;
3287           safe_bcopy (src, dst, dst_bytes);
3288           src_base = src + dst_bytes;
3289           dst = destination + dst_bytes;
3290           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3291         }
3292       if (coding->eol_type == CODING_EOL_CR)
3293         {
3294           for (tmp = destination; tmp < dst; tmp++)
3295             if (*tmp == '\n') *tmp = '\r';
3296         }
3297       else if (selective_display)
3298         {
3299           for (tmp = destination; tmp < dst; tmp++)
3300             if (*tmp == '\r') *tmp = '\n';
3301         }
3302     }
3303   if (coding->src_multibyte)
3304     dst = destination + str_as_unibyte (destination, dst - destination);
3305
3306   coding->consumed = src_base - source;
3307   coding->produced = dst - destination;
3308   coding->produced_char = coding->produced;
3309 }
3310
3311 \f
3312 /*** 7. C library functions ***/
3313
3314 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3315    has a property `coding-system'.  The value of this property is a
3316    vector of length 5 (called the coding-vector).  Among elements of
3317    this vector, the first (element[0]) and the fifth (element[4])
3318    carry important information for decoding/encoding.  Before
3319    decoding/encoding, this information should be set in fields of a
3320    structure of type `coding_system'.
3321
3322    The value of the property `coding-system' can be a symbol of another
3323    subsidiary coding-system.  In that case, Emacs gets coding-vector
3324    from that symbol.
3325
3326    `element[0]' contains information to be set in `coding->type'.  The
3327    value and its meaning is as follows:
3328
3329    0 -- coding_type_emacs_mule
3330    1 -- coding_type_sjis
3331    2 -- coding_type_iso2022
3332    3 -- coding_type_big5
3333    4 -- coding_type_ccl encoder/decoder written in CCL
3334    nil -- coding_type_no_conversion
3335    t -- coding_type_undecided (automatic conversion on decoding,
3336                                no-conversion on encoding)
3337
3338    `element[4]' contains information to be set in `coding->flags' and
3339    `coding->spec'.  The meaning varies by `coding->type'.
3340
3341    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3342    of length 32 (of which the first 17 sub-elements are used now).
3343    Meanings of these sub-elements are:
3344
3345    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3346         If the value is an integer of valid charset, the charset is
3347         assumed to be designated to graphic register N initially.
3348
3349         If the value is minus, it is a minus value of charset which
3350         reserves graphic register N, which means that the charset is
3351         not designated initially but should be designated to graphic
3352         register N just before encoding a character in that charset.
3353
3354         If the value is nil, graphic register N is never used on
3355         encoding.
3356
3357    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3358         Each value takes t or nil.  See the section ISO2022 of
3359         `coding.h' for more information.
3360
3361    If `coding->type' is `coding_type_big5', element[4] is t to denote
3362    BIG5-ETen or nil to denote BIG5-HKU.
3363
3364    If `coding->type' takes the other value, element[4] is ignored.
3365
3366    Emacs Lisp's coding systems also carry information about format of
3367    end-of-line in a value of property `eol-type'.  If the value is
3368    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3369    means CODING_EOL_CR.  If it is not integer, it should be a vector
3370    of subsidiary coding systems of which property `eol-type' has one
3371    of the above values.
3372
3373 */
3374
3375 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3376    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3377    is setup so that no conversion is necessary and return -1, else
3378    return 0.  */
3379
3380 int
3381 setup_coding_system (coding_system, coding)
3382      Lisp_Object coding_system;
3383      struct coding_system *coding;
3384 {
3385   Lisp_Object coding_spec, coding_type, eol_type, plist;
3386   Lisp_Object val;
3387
3388   /* At first, zero clear all members.  */
3389   bzero (coding, sizeof (struct coding_system));
3390
3391   /* Initialize some fields required for all kinds of coding systems.  */
3392   coding->symbol = coding_system;
3393   coding->heading_ascii = -1;
3394   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3395   coding->composing = COMPOSITION_DISABLED;
3396   coding->cmp_data = NULL;
3397
3398   if (NILP (coding_system))
3399     goto label_invalid_coding_system;
3400
3401   coding_spec = Fget (coding_system, Qcoding_system);
3402
3403   if (!VECTORP (coding_spec)
3404       || XVECTOR (coding_spec)->size != 5
3405       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3406     goto label_invalid_coding_system;
3407
3408   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3409   if (VECTORP (eol_type))
3410     {
3411       coding->eol_type = CODING_EOL_UNDECIDED;
3412       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3413     }
3414   else if (XFASTINT (eol_type) == 1)
3415     {
3416       coding->eol_type = CODING_EOL_CRLF;
3417       coding->common_flags
3418         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3419     }
3420   else if (XFASTINT (eol_type) == 2)
3421     {
3422       coding->eol_type = CODING_EOL_CR;
3423       coding->common_flags
3424         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3425     }
3426   else
3427     coding->eol_type = CODING_EOL_LF;
3428
3429   coding_type = XVECTOR (coding_spec)->contents[0];
3430   /* Try short cut.  */
3431   if (SYMBOLP (coding_type))
3432     {
3433       if (EQ (coding_type, Qt))
3434         {
3435           coding->type = coding_type_undecided;
3436           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3437         }
3438       else
3439         coding->type = coding_type_no_conversion;
3440       /* Initialize this member.  Any thing other than
3441          CODING_CATEGORY_IDX_UTF_16_BE and
3442          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3443          special treatment in detect_eol.  */
3444       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3445
3446       return 0;
3447     }
3448
3449   /* Get values of coding system properties:
3450      `post-read-conversion', `pre-write-conversion',
3451      `translation-table-for-decode', `translation-table-for-encode'.  */
3452   plist = XVECTOR (coding_spec)->contents[3];
3453   /* Pre & post conversion functions should be disabled if
3454      inhibit_eol_conversion is nonzero.  This is the case that a code
3455      conversion function is called while those functions are running.  */
3456   if (! inhibit_pre_post_conversion)
3457     {
3458       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3459       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3460     }
3461   val = Fplist_get (plist, Qtranslation_table_for_decode);
3462   if (SYMBOLP (val))
3463     val = Fget (val, Qtranslation_table_for_decode);
3464   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3465   val = Fplist_get (plist, Qtranslation_table_for_encode);
3466   if (SYMBOLP (val))
3467     val = Fget (val, Qtranslation_table_for_encode);
3468   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3469   val = Fplist_get (plist, Qcoding_category);
3470   if (!NILP (val))
3471     {
3472       val = Fget (val, Qcoding_category_index);
3473       if (INTEGERP (val))
3474         coding->category_idx = XINT (val);
3475       else
3476         goto label_invalid_coding_system;
3477     }
3478   else
3479     goto label_invalid_coding_system;
3480
3481   /* If the coding system has non-nil `composition' property, enable
3482      composition handling.  */
3483   val = Fplist_get (plist, Qcomposition);
3484   if (!NILP (val))
3485     coding->composing = COMPOSITION_NO;
3486
3487   switch (XFASTINT (coding_type))
3488     {
3489     case 0:
3490       coding->type = coding_type_emacs_mule;
3491       coding->common_flags
3492         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3493       if (!NILP (coding->post_read_conversion))
3494         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3495       if (!NILP (coding->pre_write_conversion))
3496         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3497       break;
3498
3499     case 1:
3500       coding->type = coding_type_sjis;
3501       coding->common_flags
3502         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3503       break;
3504
3505     case 2:
3506       coding->type = coding_type_iso2022;
3507       coding->common_flags
3508         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3509       {
3510         Lisp_Object val, temp;
3511         Lisp_Object *flags;
3512         int i, charset, reg_bits = 0;
3513
3514         val = XVECTOR (coding_spec)->contents[4];
3515
3516         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3517           goto label_invalid_coding_system;
3518
3519         flags = XVECTOR (val)->contents;
3520         coding->flags
3521           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3522              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3523              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3524              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3525              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3526              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3527              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3528              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3529              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3530              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3531              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3532              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3533              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3534              );
3535
3536         /* Invoke graphic register 0 to plane 0.  */
3537         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3538         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3539         CODING_SPEC_ISO_INVOCATION (coding, 1)
3540           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3541         /* Not single shifting at first.  */
3542         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3543         /* Beginning of buffer should also be regarded as bol. */
3544         CODING_SPEC_ISO_BOL (coding) = 1;
3545
3546         for (charset = 0; charset <= MAX_CHARSET; charset++)
3547           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3548         val = Vcharset_revision_alist;
3549         while (CONSP (val))
3550           {
3551             charset = get_charset_id (Fcar_safe (XCAR (val)));
3552             if (charset >= 0
3553                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3554                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3555               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3556             val = XCDR (val);
3557           }
3558
3559         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3560            FLAGS[REG] can be one of below:
3561                 integer CHARSET: CHARSET occupies register I,
3562                 t: designate nothing to REG initially, but can be used
3563                   by any charsets,
3564                 list of integer, nil, or t: designate the first
3565                   element (if integer) to REG initially, the remaining
3566                   elements (if integer) is designated to REG on request,
3567                   if an element is t, REG can be used by any charsets,
3568                 nil: REG is never used.  */
3569         for (charset = 0; charset <= MAX_CHARSET; charset++)
3570           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3571             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3572         for (i = 0; i < 4; i++)
3573           {
3574             if ((INTEGERP (flags[i])
3575                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3576                 || (charset = get_charset_id (flags[i])) >= 0)
3577               {
3578                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3579                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3580               }
3581             else if (EQ (flags[i], Qt))
3582               {
3583                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3584                 reg_bits |= 1 << i;
3585                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3586               }
3587             else if (CONSP (flags[i]))
3588               {
3589                 Lisp_Object tail;
3590                 tail = flags[i];
3591
3592                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3593                 if ((INTEGERP (XCAR (tail))
3594                      && (charset = XINT (XCAR (tail)),
3595                          CHARSET_VALID_P (charset)))
3596                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3597                   {
3598                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3599                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3600                   }
3601                 else
3602                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3603                 tail = XCDR (tail);
3604                 while (CONSP (tail))
3605                   {
3606                     if ((INTEGERP (XCAR (tail))
3607                          && (charset = XINT (XCAR (tail)),
3608                              CHARSET_VALID_P (charset)))
3609                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3610                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3611                         = i;
3612                     else if (EQ (XCAR (tail), Qt))
3613                       reg_bits |= 1 << i;
3614                     tail = XCDR (tail);
3615                   }
3616               }
3617             else
3618               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3619
3620             CODING_SPEC_ISO_DESIGNATION (coding, i)
3621               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3622           }
3623
3624         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3625           {
3626             /* REG 1 can be used only by locking shift in 7-bit env.  */
3627             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3628               reg_bits &= ~2;
3629             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3630               /* Without any shifting, only REG 0 and 1 can be used.  */
3631               reg_bits &= 3;
3632           }
3633
3634         if (reg_bits)
3635           for (charset = 0; charset <= MAX_CHARSET; charset++)
3636             {
3637               if (CHARSET_DEFINED_P (charset)
3638                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3639                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3640                 {
3641                   /* There exist some default graphic registers to be
3642                      used by CHARSET.  */
3643
3644                   /* We had better avoid designating a charset of
3645                      CHARS96 to REG 0 as far as possible.  */
3646                   if (CHARSET_CHARS (charset) == 96)
3647                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3648                       = (reg_bits & 2
3649                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3650                   else
3651                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3652                       = (reg_bits & 1
3653                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3654                 }
3655             }
3656       }
3657       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3658       coding->spec.iso2022.last_invalid_designation_register = -1;
3659       break;
3660
3661     case 3:
3662       coding->type = coding_type_big5;
3663       coding->common_flags
3664         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3665       coding->flags
3666         = (NILP (XVECTOR (coding_spec)->contents[4])
3667            ? CODING_FLAG_BIG5_HKU
3668            : CODING_FLAG_BIG5_ETEN);
3669       break;
3670
3671     case 4:
3672       coding->type = coding_type_ccl;
3673       coding->common_flags
3674         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3675       {
3676         val = XVECTOR (coding_spec)->contents[4];
3677         if (! CONSP (val)
3678             || setup_ccl_program (&(coding->spec.ccl.decoder),
3679                                   XCAR (val)) < 0
3680             || setup_ccl_program (&(coding->spec.ccl.encoder),
3681                                   XCDR (val)) < 0)
3682           goto label_invalid_coding_system;
3683
3684         bzero (coding->spec.ccl.valid_codes, 256);
3685         val = Fplist_get (plist, Qvalid_codes);
3686         if (CONSP (val))
3687           {
3688             Lisp_Object this;
3689
3690             for (; CONSP (val); val = XCDR (val))
3691               {
3692                 this = XCAR (val);
3693                 if (INTEGERP (this)
3694                     && XINT (this) >= 0 && XINT (this) < 256)
3695                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3696                 else if (CONSP (this)
3697                          && INTEGERP (XCAR (this))
3698                          && INTEGERP (XCDR (this)))
3699                   {
3700                     int start = XINT (XCAR (this));
3701                     int end = XINT (XCDR (this));
3702
3703                     if (start >= 0 && start <= end && end < 256)
3704                       while (start <= end)
3705                         coding->spec.ccl.valid_codes[start++] = 1;
3706                   }
3707               }
3708           }
3709       }
3710       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3711       coding->spec.ccl.cr_carryover = 0;
3712       coding->spec.ccl.eight_bit_carryover[0] = 0;
3713       break;
3714
3715     case 5:
3716       coding->type = coding_type_raw_text;
3717       break;
3718
3719     default:
3720       goto label_invalid_coding_system;
3721     }
3722   return 0;
3723
3724  label_invalid_coding_system:
3725   coding->type = coding_type_no_conversion;
3726   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3727   coding->common_flags = 0;
3728   coding->eol_type = CODING_EOL_LF;
3729   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3730   return -1;
3731 }
3732
3733 /* Free memory blocks allocated for storing composition information.  */
3734
3735 void
3736 coding_free_composition_data (coding)
3737      struct coding_system *coding;
3738 {
3739   struct composition_data *cmp_data = coding->cmp_data, *next;
3740
3741   if (!cmp_data)
3742     return;
3743   /* Memory blocks are chained.  At first, rewind to the first, then,
3744      free blocks one by one.  */
3745   while (cmp_data->prev)
3746     cmp_data = cmp_data->prev;
3747   while (cmp_data)
3748     {
3749       next = cmp_data->next;
3750       xfree (cmp_data);
3751       cmp_data = next;
3752     }
3753   coding->cmp_data = NULL;
3754 }
3755
3756 /* Set `char_offset' member of all memory blocks pointed by
3757    coding->cmp_data to POS.  */
3758
3759 void
3760 coding_adjust_composition_offset (coding, pos)
3761      struct coding_system *coding;
3762      int pos;
3763 {
3764   struct composition_data *cmp_data;
3765
3766   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3767     cmp_data->char_offset = pos;
3768 }
3769
3770 /* Setup raw-text or one of its subsidiaries in the structure
3771    coding_system CODING according to the already setup value eol_type
3772    in CODING.  CODING should be setup for some coding system in
3773    advance.  */
3774
3775 void
3776 setup_raw_text_coding_system (coding)
3777      struct coding_system *coding;
3778 {
3779   if (coding->type != coding_type_raw_text)
3780     {
3781       coding->symbol = Qraw_text;
3782       coding->type = coding_type_raw_text;
3783       if (coding->eol_type != CODING_EOL_UNDECIDED)
3784         {
3785           Lisp_Object subsidiaries;
3786           subsidiaries = Fget (Qraw_text, Qeol_type);
3787
3788           if (VECTORP (subsidiaries)
3789               && XVECTOR (subsidiaries)->size == 3)
3790             coding->symbol
3791               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3792         }
3793       setup_coding_system (coding->symbol, coding);
3794     }
3795   return;
3796 }
3797
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, BIG5, UTF-8,
   UTF-16, or a coding system implemented by CCL programs.  But it is
   impossible to distinguish some coding systems accurately because
   they use the same range of codes.  So, at first, coding systems
   are grouped into the following categories:
3803
3804    o coding-category-emacs-mule
3805
3806         The category for a coding system which has the same code range
3807         as Emacs' internal format.  Assigned the coding-system (Lisp
3808         symbol) `emacs-mule' by default.
3809
3810    o coding-category-sjis
3811
3812         The category for a coding system which has the same code range
3813         as SJIS.  Assigned the coding-system (Lisp
3814         symbol) `japanese-shift-jis' by default.
3815
3816    o coding-category-iso-7
3817
3818         The category for a coding system which has the same code range
3819         as ISO2022 of 7-bit environment.  This doesn't use any locking
3820         shift and single shift functions.  This can encode/decode all
3821         charsets.  Assigned the coding-system (Lisp symbol)
3822         `iso-2022-7bit' by default.
3823
3824    o coding-category-iso-7-tight
3825
3826         Same as coding-category-iso-7 except that this can
3827         encode/decode only the specified charsets.
3828
3829    o coding-category-iso-8-1
3830
3831         The category for a coding system which has the same code range
3832         as ISO2022 of 8-bit environment and graphic plane 1 used only
3833         for DIMENSION1 charset.  This doesn't use any locking shift
3834         and single shift functions.  Assigned the coding-system (Lisp
3835         symbol) `iso-latin-1' by default.
3836
3837    o coding-category-iso-8-2
3838
3839         The category for a coding system which has the same code range
3840         as ISO2022 of 8-bit environment and graphic plane 1 used only
3841         for DIMENSION2 charset.  This doesn't use any locking shift
3842         and single shift functions.  Assigned the coding-system (Lisp
3843         symbol) `japanese-iso-8bit' by default.
3844
3845    o coding-category-iso-7-else
3846
3847         The category for a coding system which has the same code range
3848         as ISO2022 of 7-bit environment but uses locking shift or
3849         single shift functions.  Assigned the coding-system (Lisp
3850         symbol) `iso-2022-7bit-lock' by default.
3851
3852    o coding-category-iso-8-else
3853
3854         The category for a coding system which has the same code range
3855         as ISO2022 of 8-bit environment but uses locking shift or
3856         single shift functions.  Assigned the coding-system (Lisp
3857         symbol) `iso-2022-8bit-ss2' by default.
3858
3859    o coding-category-big5
3860
3861         The category for a coding system which has the same code range
3862         as BIG5.  Assigned the coding-system (Lisp symbol)
3863         `cn-big5' by default.
3864
3865    o coding-category-utf-8
3866
3867         The category for a coding system which has the same code range
3868         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3869         symbol) `utf-8' by default.
3870
3871    o coding-category-utf-16-be
3872
3873         The category for a coding system in which a text has an
3874         Unicode signature (cf. Unicode Standard) in the order of BIG
3875         endian at the head.  Assigned the coding-system (Lisp symbol)
3876         `utf-16-be' by default.
3877
3878    o coding-category-utf-16-le
3879
3880         The category for a coding system in which a text has an
3881         Unicode signature (cf. Unicode Standard) in the order of
3882         LITTLE endian at the head.  Assigned the coding-system (Lisp
3883         symbol) `utf-16-le' by default.
3884
3885    o coding-category-ccl
3886
3887         The category for a coding system of which encoder/decoder is
3888         written in CCL programs.  The default value is nil, i.e., no
3889         coding system is assigned.
3890
3891    o coding-category-binary
3892
3893         The category for a coding system not categorized in any of the
3894         above.  Assigned the coding-system (Lisp symbol)
3895         `no-conversion' by default.
3896
   Each category is a Lisp symbol whose value is the actual
   `coding-system' (also a Lisp symbol) that a user assigned to it.
   What Emacs actually detects is the category of a coding system;
   it then uses the `coding-system' assigned to that category.  If
   Emacs can't narrow the candidates down to a single category, it
   selects the category of the highest priority.  The priorities of
   the categories are also specified by a user, in the Lisp variable
   `coding-category-list'.
3904
3905 */
3906
3907 static
3908 int ascii_skip_code[256];
3909
3910 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3911    If it detects possible coding systems, return an integer in which
3912    appropriate flag bits are set.  Flag bits are defined by macros
3913    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3914    it should point the table `coding_priorities'.  In that case, only
3915    the flag bit for a coding system of the highest priority is set in
3916    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
3917    range 0x80..0x9F are in multibyte form.
3918
3919    How many ASCII characters are at the head is returned as *SKIP.  */
3920
3921 static int
3922 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
3923      unsigned char *source;
3924      int src_bytes, *priorities, *skip;
3925      int multibytep;
3926 {
3927   register unsigned char c;
3928   unsigned char *src = source, *src_end = source + src_bytes;
3929   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3930   int i;
3931
3932   /* At first, skip all ASCII characters and control characters except
3933      for three ISO2022 specific control characters.  */
3934   ascii_skip_code[ISO_CODE_SO] = 0;
3935   ascii_skip_code[ISO_CODE_SI] = 0;
3936   ascii_skip_code[ISO_CODE_ESC] = 0;
3937
3938  label_loop_detect_coding:
3939   while (src < src_end && ascii_skip_code[*src]) src++;
3940   *skip = src - source;
3941
3942   if (src >= src_end)
3943     /* We found nothing other than ASCII.  There's nothing to do.  */
3944     return 0;
3945
3946   c = *src;
3947   /* The text seems to be encoded in some multilingual coding system.
3948      Now, try to find in which coding system the text is encoded.  */
3949   if (c < 0x80)
3950     {
3951       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3952       /* C is an ISO2022 specific control code of C0.  */
3953       mask = detect_coding_iso2022 (src, src_end, multibytep);
3954       if (mask == 0)
3955         {
3956           /* No valid ISO2022 code follows C.  Try again.  */
3957           src++;
3958           if (c == ISO_CODE_ESC)
3959             ascii_skip_code[ISO_CODE_ESC] = 1;
3960           else
3961             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3962           goto label_loop_detect_coding;
3963         }
3964       if (priorities)
3965         {
3966           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3967             {
3968               if (mask & priorities[i])
3969                 return priorities[i];
3970             }
3971           return CODING_CATEGORY_MASK_RAW_TEXT;
3972         }
3973     }
3974   else
3975     {
3976       int try;
3977
3978       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
3979         c = src[1] - 0x20;
3980
3981       if (c < 0xA0)
3982         {
3983           /* C is the first byte of SJIS character code,
3984              or a leading-code of Emacs' internal format (emacs-mule),
3985              or the first byte of UTF-16.  */
3986           try = (CODING_CATEGORY_MASK_SJIS
3987                   | CODING_CATEGORY_MASK_EMACS_MULE
3988                   | CODING_CATEGORY_MASK_UTF_16_BE
3989                   | CODING_CATEGORY_MASK_UTF_16_LE);
3990
3991           /* Or, if C is a special latin extra code,
3992              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3993              or is an ISO2022 control-sequence-introducer (CSI),
3994              we should also consider the possibility of ISO2022 codings.  */
3995           if ((VECTORP (Vlatin_extra_code_table)
3996                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3997               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3998               || (c == ISO_CODE_CSI
3999                   && (src < src_end
4000                       && (*src == ']'
4001                           || ((*src == '0' || *src == '1' || *src == '2')
4002                               && src + 1 < src_end
4003                               && src[1] == ']')))))
4004             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4005                      | CODING_CATEGORY_MASK_ISO_8BIT);
4006         }
4007       else
4008         /* C is a character of ISO2022 in graphic plane right,
4009            or a SJIS's 1-byte character code (i.e. JISX0201),
4010            or the first byte of BIG5's 2-byte code,
4011            or the first byte of UTF-8/16.  */
4012         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4013                 | CODING_CATEGORY_MASK_ISO_8BIT
4014                 | CODING_CATEGORY_MASK_SJIS
4015                 | CODING_CATEGORY_MASK_BIG5
4016                 | CODING_CATEGORY_MASK_UTF_8
4017                 | CODING_CATEGORY_MASK_UTF_16_BE
4018                 | CODING_CATEGORY_MASK_UTF_16_LE);
4019
4020       /* Or, we may have to consider the possibility of CCL.  */
4021       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4022           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4023               ->spec.ccl.valid_codes)[c])
4024         try |= CODING_CATEGORY_MASK_CCL;
4025
4026       mask = 0;
4027       utf16_examined_p = iso2022_examined_p = 0;
4028       if (priorities)
4029         {
4030           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4031             {
4032               if (!iso2022_examined_p
4033                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4034                 {
4035                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4036                   iso2022_examined_p = 1;
4037                 }
4038               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4039                 mask |= detect_coding_sjis (src, src_end, multibytep);
4040               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4041                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4042               else if (!utf16_examined_p
4043                        && (priorities[i] & try &
4044                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4045                 {
4046                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4047                   utf16_examined_p = 1;
4048                 }
4049               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4050                 mask |= detect_coding_big5 (src, src_end, multibytep);
4051               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4052                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4053               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4054                 mask |= detect_coding_ccl (src, src_end, multibytep);
4055               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4056                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4057               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4058                 mask |= CODING_CATEGORY_MASK_BINARY;
4059               if (mask & priorities[i])
4060                 return priorities[i];
4061             }
4062           return CODING_CATEGORY_MASK_RAW_TEXT;
4063         }
4064       if (try & CODING_CATEGORY_MASK_ISO)
4065         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4066       if (try & CODING_CATEGORY_MASK_SJIS)
4067         mask |= detect_coding_sjis (src, src_end, multibytep);
4068       if (try & CODING_CATEGORY_MASK_BIG5)
4069         mask |= detect_coding_big5 (src, src_end, multibytep);
4070       if (try & CODING_CATEGORY_MASK_UTF_8)
4071         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4072       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4073         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4074       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4075         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4076       if (try & CODING_CATEGORY_MASK_CCL)
4077         mask |= detect_coding_ccl (src, src_end, multibytep);
4078     }
4079   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4080 }
4081
4082 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4083    The information of the detected coding system is set in CODING.  */
4084
4085 void
4086 detect_coding (coding, src, src_bytes)
4087      struct coding_system *coding;
4088      const unsigned char *src;
4089      int src_bytes;
4090 {
4091   unsigned int idx;
4092   int skip, mask;
4093   Lisp_Object val;
4094
4095   val = Vcoding_category_list;
4096   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4097                              coding->src_multibyte);
4098   coding->heading_ascii = skip;
4099
4100   if (!mask) return;
4101
4102   /* We found a single coding system of the highest priority in MASK.  */
4103   idx = 0;
4104   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4105   if (! mask)
4106     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4107
4108   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4109
4110   if (coding->eol_type != CODING_EOL_UNDECIDED)
4111     {
4112       Lisp_Object tmp;
4113
4114       tmp = Fget (val, Qeol_type);
4115       if (VECTORP (tmp))
4116         val = XVECTOR (tmp)->contents[coding->eol_type];
4117     }
4118
4119   /* Setup this new coding system while preserving some slots.  */
4120   {
4121     int src_multibyte = coding->src_multibyte;
4122     int dst_multibyte = coding->dst_multibyte;
4123
4124     setup_coding_system (val, coding);
4125     coding->src_multibyte = src_multibyte;
4126     coding->dst_multibyte = dst_multibyte;
4127     coding->heading_ascii = skip;
4128   }
4129 }
4130
4131 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4132    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4133    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4134
4135    How many non-eol characters are at the head is returned as *SKIP.  */
4136
4137 #define MAX_EOL_CHECK_COUNT 3
4138
4139 static int
4140 detect_eol_type (source, src_bytes, skip)
4141      unsigned char *source;
4142      int src_bytes, *skip;
4143 {
4144   unsigned char *src = source, *src_end = src + src_bytes;
4145   unsigned char c;
4146   int total = 0;                /* How many end-of-lines are found so far.  */
4147   int eol_type = CODING_EOL_UNDECIDED;
4148   int this_eol_type;
4149
4150   *skip = 0;
4151
4152   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4153     {
4154       c = *src++;
4155       if (c == '\n' || c == '\r')
4156         {
4157           if (*skip == 0)
4158             *skip = src - 1 - source;
4159           total++;
4160           if (c == '\n')
4161             this_eol_type = CODING_EOL_LF;
4162           else if (src >= src_end || *src != '\n')
4163             this_eol_type = CODING_EOL_CR;
4164           else
4165             this_eol_type = CODING_EOL_CRLF, src++;
4166
4167           if (eol_type == CODING_EOL_UNDECIDED)
4168             /* This is the first end-of-line.  */
4169             eol_type = this_eol_type;
4170           else if (eol_type != this_eol_type)
4171             {
4172               /* The found type is different from what found before.  */
4173               eol_type = CODING_EOL_INCONSISTENT;
4174               break;
4175             }
4176         }
4177     }
4178
4179   if (*skip == 0)
4180     *skip = src_end - source;
4181   return eol_type;
4182 }
4183
4184 /* Like detect_eol_type, but detect EOL type in 2-octet
4185    big-endian/little-endian format for coding systems utf-16-be and
4186    utf-16-le.  */
4187
4188 static int
4189 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4190      unsigned char *source;
4191      int src_bytes, *skip, big_endian_p;
4192 {
4193   unsigned char *src = source, *src_end = src + src_bytes;
4194   unsigned int c1, c2;
4195   int total = 0;                /* How many end-of-lines are found so far.  */
4196   int eol_type = CODING_EOL_UNDECIDED;
4197   int this_eol_type;
4198   int msb, lsb;
4199
4200   if (big_endian_p)
4201     msb = 0, lsb = 1;
4202   else
4203     msb = 1, lsb = 0;
4204
4205   *skip = 0;
4206
4207   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4208     {
4209       c1 = (src[msb] << 8) | (src[lsb]);
4210       src += 2;
4211
4212       if (c1 == '\n' || c1 == '\r')
4213         {
4214           if (*skip == 0)
4215             *skip = src - 2 - source;
4216           total++;
4217           if (c1 == '\n')
4218             {
4219               this_eol_type = CODING_EOL_LF;
4220             }
4221           else
4222             {
4223               if ((src + 1) >= src_end)
4224                 {
4225                   this_eol_type = CODING_EOL_CR;
4226                 }
4227               else
4228                 {
4229                   c2 = (src[msb] << 8) | (src[lsb]);
4230                   if (c2 == '\n')
4231                     this_eol_type = CODING_EOL_CRLF, src += 2;
4232                   else
4233                     this_eol_type = CODING_EOL_CR;
4234                 }
4235             }
4236
4237           if (eol_type == CODING_EOL_UNDECIDED)
4238             /* This is the first end-of-line.  */
4239             eol_type = this_eol_type;
4240           else if (eol_type != this_eol_type)
4241             {
4242               /* The found type is different from what found before.  */
4243               eol_type = CODING_EOL_INCONSISTENT;
4244               break;
4245             }
4246         }
4247     }
4248
4249   if (*skip == 0)
4250     *skip = src_end - source;
4251   return eol_type;
4252 }
4253
4254 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4255    is encoded.  If it detects an appropriate format of end-of-line, it
4256    sets the information in *CODING.  */
4257
4258 void
4259 detect_eol (coding, src, src_bytes)
4260      struct coding_system *coding;
4261      const unsigned char *src;
4262      int src_bytes;
4263 {
4264   Lisp_Object val;
4265   int skip;
4266   int eol_type;
4267
4268   switch (coding->category_idx)
4269     {
4270     case CODING_CATEGORY_IDX_UTF_16_BE:
4271       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4272       break;
4273     case CODING_CATEGORY_IDX_UTF_16_LE:
4274       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4275       break;
4276     default:
4277       eol_type = detect_eol_type (src, src_bytes, &skip);
4278       break;
4279     }
4280
4281   if (coding->heading_ascii > skip)
4282     coding->heading_ascii = skip;
4283   else
4284     skip = coding->heading_ascii;
4285
4286   if (eol_type == CODING_EOL_UNDECIDED)
4287     return;
4288   if (eol_type == CODING_EOL_INCONSISTENT)
4289     {
4290 #if 0
4291       /* This code is suppressed until we find a better way to
4292          distinguish raw text file and binary file.  */
4293
4294       /* If we have already detected that the coding is raw-text, the
4295          coding should actually be no-conversion.  */
4296       if (coding->type == coding_type_raw_text)
4297         {
4298           setup_coding_system (Qno_conversion, coding);
4299           return;
4300         }
4301       /* Else, let's decode only text code anyway.  */
4302 #endif /* 0 */
4303       eol_type = CODING_EOL_LF;
4304     }
4305
4306   val = Fget (coding->symbol, Qeol_type);
4307   if (VECTORP (val) && XVECTOR (val)->size == 3)
4308     {
4309       int src_multibyte = coding->src_multibyte;
4310       int dst_multibyte = coding->dst_multibyte;
4311       struct composition_data *cmp_data = coding->cmp_data;
4312
4313       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4314       coding->src_multibyte = src_multibyte;
4315       coding->dst_multibyte = dst_multibyte;
4316       coding->heading_ascii = skip;
4317       coding->cmp_data = cmp_data;
4318     }
4319 }
4320
/* Number of spare bytes added to every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the decoding of a text in CODING (a pointer to
   struct coding_system) may enlarge it: 3 for ISO2022, the
   magnification declared by the decoder program for CCL, and 2 for
   everything else.  CODING is parenthesized so that any pointer
   expression may be passed; it is evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                       \
  ((coding)->type == coding_type_iso2022                  \
   ? 3                                                    \
   : ((coding)->type == coding_type_ccl                   \
      ? (coding)->spec.ccl.decoder.buf_magnification      \
      : 2))
4329
4330 /* Return maximum size (bytes) of a buffer enough for decoding
4331    SRC_BYTES of text encoded in CODING.  */
4332
4333 int
4334 decoding_buffer_size (coding, src_bytes)
4335      struct coding_system *coding;
4336      int src_bytes;
4337 {
4338   return (src_bytes * DECODING_BUFFER_MAG (coding)
4339           + CONVERSION_BUFFER_EXTRA_ROOM);
4340 }
4341
4342 /* Return maximum size (bytes) of a buffer enough for encoding
4343    SRC_BYTES of text to CODING.  */
4344
4345 int
4346 encoding_buffer_size (coding, src_bytes)
4347      struct coding_system *coding;
4348      int src_bytes;
4349 {
4350   int magnification;
4351
4352   if (coding->type == coding_type_ccl)
4353     magnification = coding->spec.ccl.encoder.buf_magnification;
4354   else if (CODING_REQUIRE_ENCODING (coding))
4355     magnification = 3;
4356   else
4357     magnification = 1;
4358
4359   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4360 }
4361
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Number of bytes allocated for DATA.  */
  int on_stack;                 /* 1 if DATA was allocated by alloca.  */
  unsigned char *data;          /* The storage itself.  */
};

/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  The value is parenthesized so that it
   stays a single operand wherever the macro is expanded.  */
#define MAX_ALLOCA (16 * 1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer):
   by alloca if LEN is less than MAX_ALLOCA, by xmalloc otherwise.
   BUF.on_stack records which one was used and BUF.size is set to LEN.
   LEN is evaluated more than once, so it must not have side
   effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4389
4390 /* Double the allocated memory for *BUF.  */
4391 static void
4392 extend_conversion_buffer (buf)
4393      struct conversion_buffer *buf;
4394 {
4395   if (buf->on_stack)
4396     {
4397       unsigned char *save = buf->data;
4398       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4399       bcopy (save, buf->data, buf->size);
4400       buf->on_stack = 0;
4401     }
4402   else
4403     {
4404       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4405     }
4406   buf->size *= 2;
4407 }
4408
4409 /* Free the allocated memory for BUF if it is not on stack.  */
4410 static void
4411 free_conversion_buffer (buf)
4412      struct conversion_buffer *buf;
4413 {
4414   if (!buf->on_stack)
4415     xfree (buf->data);
4416 }
4417
/* Run the CCL program of CODING on SRC_BYTES bytes at SOURCE and
   store the result at DESTINATION.  The encoder program is used if
   ENCODEP is nonzero, the decoder program otherwise.

   A DST_BYTES of zero is treated specially below: it is not adjusted
   for carryover bytes, and the size handed to str_as_multibyte is then
   derived from SOURCE and DESTINATION instead (presumably the in-place
   conversion case -- confirm against callers).

   On return, CODING->produced, CODING->produced_char, CODING->consumed
   and CODING->result are updated; the value returned is
   CODING->result.  */
int
ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes, encodep;
{
  struct ccl_program *ccl
    = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
  unsigned char *dst = destination;

  ccl->suppress_error = coding->suppress_error;
  ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
  if (encodep)
    {
      /* On encoding, EOL format is converted within ccl_driver.  For
         that, setup proper information in the structure CCL.  */
      ccl->eol_type = coding->eol_type;
      if (ccl->eol_type ==CODING_EOL_UNDECIDED)
        ccl->eol_type = CODING_EOL_LF;
      ccl->cr_consumed = coding->spec.ccl.cr_carryover;
    }
  ccl->multibyte = coding->src_multibyte;
  if (coding->spec.ccl.eight_bit_carryover[0] != 0)
    {
      /* Move carryover bytes (a zero-terminated sequence saved by a
         previous call) to DESTINATION, and shrink the room left for
         the CCL program accordingly.  */
      unsigned char *p = coding->spec.ccl.eight_bit_carryover;
      while (*p)
        *dst++ = *p++;
      coding->spec.ccl.eight_bit_carryover[0] = 0;
      if (dst_bytes)
        dst_bytes -= dst - destination;
    }

  /* The produced count includes the carryover bytes copied above.  */
  coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
                                  &(coding->consumed))
                      + dst - destination);

  if (encodep)
    {
      /* Each produced byte is counted as one character.  */
      coding->produced_char = coding->produced;
      coding->spec.ccl.cr_carryover = ccl->cr_consumed;
    }
  else if (!ccl->eight_bit_control)
    {
      /* The produced bytes form a valid multibyte sequence.  */
      coding->produced_char
        = multibyte_chars_in_text (destination, coding->produced);
      coding->spec.ccl.eight_bit_carryover[0] = 0;
    }
  else
    {
      /* On decoding, the destination should always be multibyte.
         But the CCL program might have generated an invalid
         multibyte sequence.  Here we make such a sequence valid as
         multibyte.  */
      int bytes
        = dst_bytes ? dst_bytes : source + coding->consumed - destination;

      if ((coding->consumed < src_bytes
           || !ccl->last_block)
          && coding->produced >= 1
          && destination[coding->produced - 1] >= 0x80)
        {
          /* We should not convert the trailing 8-bit codes to
             multibyte form even if they don't form a valid
             multibyte sequence.  They may form a valid sequence in
             the next call.

             CARRYOVER becomes the length (1..3) of a trailing run of
             bytes >= 0x80 whose first byte lies in 0x80..0x9F, or
             stays 0 if there is no such run.  */
          int carryover = 0;

          if (destination[coding->produced - 1] < 0xA0)
            carryover = 1;
          else if (coding->produced >= 2)
            {
              if (destination[coding->produced - 2] >= 0x80)
                {
                  if (destination[coding->produced - 2] < 0xA0)
                    carryover = 2;
                  else if (coding->produced >= 3
                           && destination[coding->produced - 3] >= 0x80
                           && destination[coding->produced - 3] < 0xA0)
                    carryover = 3;
                }
            }
          if (carryover > 0)
            {
              /* Save the run, zero-terminated, for the next call and
                 exclude it from what has been produced so far.  */
              BCOPY_SHORT (destination + coding->produced - carryover,
                           coding->spec.ccl.eight_bit_carryover,
                           carryover);
              coding->spec.ccl.eight_bit_carryover[carryover] = 0;
              coding->produced -= carryover;
            }
        }
      coding->produced = str_as_multibyte (destination, bytes,
                                           coding->produced,
                                           &(coding->produced_char));
    }

  /* Translate the final status of the CCL program into a coding
     result code.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      coding->result = CODING_FINISH_INSUFFICIENT_SRC;
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      coding->result = CODING_FINISH_INSUFFICIENT_DST;
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      coding->result = CODING_FINISH_INTERRUPT;
      break;
    default:
      coding->result = CODING_FINISH_NORMAL;
      break;
    }
  return coding->result;
}
4533
4534 /* Decode EOL format of the text at PTR of BYTES length destructively
4535    according to CODING->eol_type.  This is called after the CCL
4536    program produced a decoded text at PTR.  If we do CRLF->LF
4537    conversion, update CODING->produced and CODING->produced_char.  */
4538
4539 static void
4540 decode_eol_post_ccl (coding, ptr, bytes)
4541      struct coding_system *coding;
4542      unsigned char *ptr;
4543      int bytes;
4544 {
4545   Lisp_Object val, saved_coding_symbol;
4546   unsigned char *pend = ptr + bytes;
4547   int dummy;
4548
4549   /* Remember the current coding system symbol.  We set it back when
4550      an inconsistent EOL is found so that `last-coding-system-used' is
4551      set to the coding system that doesn't specify EOL conversion.  */
4552   saved_coding_symbol = coding->symbol;
4553
4554   coding->spec.ccl.cr_carryover = 0;
4555   if (coding->eol_type == CODING_EOL_UNDECIDED)
4556     {
4557       /* Here, to avoid the call of setup_coding_system, we directly
4558          call detect_eol_type.  */
4559       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4560       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4561         coding->eol_type = CODING_EOL_LF;
4562       if (coding->eol_type != CODING_EOL_UNDECIDED)
4563         {
4564           val = Fget (coding->symbol, Qeol_type);
4565           if (VECTORP (val) && XVECTOR (val)->size == 3)
4566             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4567         }
4568       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4569     }
4570
4571   if (coding->eol_type == CODING_EOL_LF
4572       || coding->eol_type == CODING_EOL_UNDECIDED)
4573     {
4574       /* We have nothing to do.  */
4575       ptr = pend;
4576     }
4577   else if (coding->eol_type == CODING_EOL_CRLF)
4578     {
4579       unsigned char *pstart = ptr, *p = ptr;
4580
4581       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4582           && *(pend - 1) == '\r')
4583         {
4584           /* If the last character is CR, we can't handle it here
4585              because LF will be in the not-yet-decoded source text.
4586              Record that the CR is not yet processed.  */
4587           coding->spec.ccl.cr_carryover = 1;
4588           coding->produced--;
4589           coding->produced_char--;
4590           pend--;
4591         }
4592       while (ptr < pend)
4593         {
4594           if (*ptr == '\r')
4595             {
4596               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4597                 {
4598                   *p++ = '\n';
4599                   ptr += 2;
4600                 }
4601               else
4602                 {
4603                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4604                     goto undo_eol_conversion;
4605                   *p++ = *ptr++;
4606                 }
4607             }
4608           else if (*ptr == '\n'
4609                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4610             goto undo_eol_conversion;
4611           else
4612             *p++ = *ptr++;
4613           continue;
4614
4615         undo_eol_conversion:
4616           /* We have faced with inconsistent EOL format at PTR.
4617              Convert all LFs before PTR back to CRLFs.  */
4618           for (p--, ptr--; p >= pstart; p--)
4619             {
4620               if (*p == '\n')
4621                 *ptr-- = '\n', *ptr-- = '\r';
4622               else
4623                 *ptr-- = *p;
4624             }
4625           /*  If carryover is recorded, cancel it because we don't
4626               convert CRLF anymore.  */
4627           if (coding->spec.ccl.cr_carryover)
4628             {
4629               coding->spec.ccl.cr_carryover = 0;
4630               coding->produced++;
4631               coding->produced_char++;
4632               pend++;
4633             }
4634           p = ptr = pend;
4635           coding->eol_type = CODING_EOL_LF;
4636           coding->symbol = saved_coding_symbol;
4637         }
4638       if (p < pend)
4639         {
4640           /* As each two-byte sequence CRLF was converted to LF, (PEND
4641              - P) is the number of deleted characters.  */
4642           coding->produced -= pend - p;
4643           coding->produced_char -= pend - p;
4644         }
4645     }
4646   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4647     {
4648       unsigned char *p = ptr;
4649
4650       for (; ptr < pend; ptr++)
4651         {
4652           if (*ptr == '\r')
4653             *ptr = '\n';
4654           else if (*ptr == '\n'
4655                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4656             {
4657               for (; p < ptr; p++)
4658                 {
4659                   if (*p == '\n')
4660                     *p = '\r';
4661                 }
4662               ptr = pend;
4663               coding->eol_type = CODING_EOL_LF;
4664               coding->symbol = saved_coding_symbol;
4665             }
4666         }
4667     }
4668 }
4669
4670 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4671    decoding, it may detect coding system and format of end-of-line if
4672    those are not yet decided.  The source should be unibyte, the
4673    result is multibyte if CODING->dst_multibyte is nonzero, else
4674    unibyte.  */
4675
4676 int
4677 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4678      struct coding_system *coding;
4679      const unsigned char *source;
4680      unsigned char *destination;
4681      int src_bytes, dst_bytes;
4682 {
4683   int extra = 0;
4684
4685   if (coding->type == coding_type_undecided)
4686     detect_coding (coding, source, src_bytes);
4687
4688   if (coding->eol_type == CODING_EOL_UNDECIDED
4689       && coding->type != coding_type_ccl)
4690     {
4691       detect_eol (coding, source, src_bytes);
4692       /* We had better recover the original eol format if we
4693          encounter an inconsistent eol format while decoding.  */
4694       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4695     }
4696
4697   coding->produced = coding->produced_char = 0;
4698   coding->consumed = coding->consumed_char = 0;
4699   coding->errors = 0;
4700   coding->result = CODING_FINISH_NORMAL;
4701
4702   switch (coding->type)
4703     {
4704     case coding_type_sjis:
4705       decode_coding_sjis_big5 (coding, source, destination,
4706                                src_bytes, dst_bytes, 1);
4707       break;
4708
4709     case coding_type_iso2022:
4710       decode_coding_iso2022 (coding, source, destination,
4711                              src_bytes, dst_bytes);
4712       break;
4713
4714     case coding_type_big5:
4715       decode_coding_sjis_big5 (coding, source, destination,
4716                                src_bytes, dst_bytes, 0);
4717       break;
4718
4719     case coding_type_emacs_mule:
4720       decode_coding_emacs_mule (coding, source, destination,
4721                                 src_bytes, dst_bytes);
4722       break;
4723
4724     case coding_type_ccl:
4725       if (coding->spec.ccl.cr_carryover)
4726         {
4727           /* Put the CR which was not processed by the previous call
4728              of decode_eol_post_ccl in DESTINATION.  It will be
4729              decoded together with the following LF by the call to
4730              decode_eol_post_ccl below.  */
4731           *destination = '\r';
4732           coding->produced++;
4733           coding->produced_char++;
4734           dst_bytes--;
4735           extra = coding->spec.ccl.cr_carryover;
4736         }
4737       ccl_coding_driver (coding, source, destination + extra,
4738                          src_bytes, dst_bytes, 0);
4739       if (coding->eol_type != CODING_EOL_LF)
4740         {
4741           coding->produced += extra;
4742           coding->produced_char += extra;
4743           decode_eol_post_ccl (coding, destination, coding->produced);
4744         }
4745       break;
4746
4747     default:
4748       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4749     }
4750
4751   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4752       && coding->mode & CODING_MODE_LAST_BLOCK
4753       && coding->consumed == src_bytes)
4754     coding->result = CODING_FINISH_NORMAL;
4755
4756   if (coding->mode & CODING_MODE_LAST_BLOCK
4757       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4758     {
4759       const unsigned char *src = source + coding->consumed;
4760       unsigned char *dst = destination + coding->produced;
4761
4762       src_bytes -= coding->consumed;
4763       coding->errors++;
4764       if (COMPOSING_P (coding))
4765         DECODE_COMPOSITION_END ('1');
4766       while (src_bytes--)
4767         {
4768           int c = *src++;
4769           dst += CHAR_STRING (c, dst);
4770           coding->produced_char++;
4771         }
4772       coding->consumed = coding->consumed_char = src - source;
4773       coding->produced = dst - destination;
4774       coding->result = CODING_FINISH_NORMAL;
4775     }
4776
4777   if (!coding->dst_multibyte)
4778     {
4779       coding->produced = str_as_unibyte (destination, coding->produced);
4780       coding->produced_char = coding->produced;
4781     }
4782
4783   return coding->result;
4784 }
4785
4786 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4787    multibyteness of the source is CODING->src_multibyte, the
4788    multibyteness of the result is always unibyte.  */
4789
4790 int
4791 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4792      struct coding_system *coding;
4793      const unsigned char *source;
4794      unsigned char *destination;
4795      int src_bytes, dst_bytes;
4796 {
4797   coding->produced = coding->produced_char = 0;
4798   coding->consumed = coding->consumed_char = 0;
4799   coding->errors = 0;
4800   coding->result = CODING_FINISH_NORMAL;
4801
4802   switch (coding->type)
4803     {
4804     case coding_type_sjis:
4805       encode_coding_sjis_big5 (coding, source, destination,
4806                                src_bytes, dst_bytes, 1);
4807       break;
4808
4809     case coding_type_iso2022:
4810       encode_coding_iso2022 (coding, source, destination,
4811                              src_bytes, dst_bytes);
4812       break;
4813
4814     case coding_type_big5:
4815       encode_coding_sjis_big5 (coding, source, destination,
4816                                src_bytes, dst_bytes, 0);
4817       break;
4818
4819     case coding_type_emacs_mule:
4820       encode_coding_emacs_mule (coding, source, destination,
4821                                 src_bytes, dst_bytes);
4822       break;
4823
4824     case coding_type_ccl:
4825       ccl_coding_driver (coding, source, destination,
4826                          src_bytes, dst_bytes, 1);
4827       break;
4828
4829     default:
4830       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4831     }
4832
4833   if (coding->mode & CODING_MODE_LAST_BLOCK
4834       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4835     {
4836       const unsigned char *src = source + coding->consumed;
4837       unsigned char *dst = destination + coding->produced;
4838
4839       if (coding->type == coding_type_iso2022)
4840         ENCODE_RESET_PLANE_AND_REGISTER;
4841       if (COMPOSING_P (coding))
4842         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4843       if (coding->consumed < src_bytes)
4844         {
4845           int len = src_bytes - coding->consumed;
4846
4847           BCOPY_SHORT (src, dst, len);
4848           if (coding->src_multibyte)
4849             len = str_as_unibyte (dst, len);
4850           dst += len;
4851           coding->consumed = src_bytes;
4852         }
4853       coding->produced = coding->produced_char = dst - destination;
4854       coding->result = CODING_FINISH_NORMAL;
4855     }
4856
4857   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4858       && coding->consumed == src_bytes)
4859     coding->result = CODING_FINISH_NORMAL;
4860
4861   return coding->result;
4862 }
4863
4864 /* Scan text in the region between *BEG and *END (byte positions),
4865    skip characters which we don't have to decode by coding system
4866    CODING at the head and tail, then set *BEG and *END to the region
4867    of the text we actually have to convert.  The caller should move
4868    the gap out of the region in advance if the region is from a
4869    buffer.
4870
4871    If STR is not NULL, *BEG and *END are indices into STR.  */
4872
4873 static void
4874 shrink_decoding_region (beg, end, coding, str)
4875      int *beg, *end;
4876      struct coding_system *coding;
4877      unsigned char *str;
4878 {
4879   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4880   int eol_conversion;
4881   Lisp_Object translation_table;
4882
4883   if (coding->type == coding_type_ccl
4884       || coding->type == coding_type_undecided
4885       || coding->eol_type != CODING_EOL_LF
4886       || !NILP (coding->post_read_conversion)
4887       || coding->composing != COMPOSITION_DISABLED)
4888     {
4889       /* We can't skip any data.  */
4890       return;
4891     }
4892   if (coding->type == coding_type_no_conversion
4893       || coding->type == coding_type_raw_text
4894       || coding->type == coding_type_emacs_mule)
4895     {
4896       /* We need no conversion, but don't have to skip any data here.
4897          Decoding routine handles them effectively anyway.  */
4898       return;
4899     }
4900
4901   translation_table = coding->translation_table_for_decode;
4902   if (NILP (translation_table) && !NILP (Venable_character_translation))
4903     translation_table = Vstandard_translation_table_for_decode;
4904   if (CHAR_TABLE_P (translation_table))
4905     {
4906       int i;
4907       for (i = 0; i < 128; i++)
4908         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4909           break;
4910       if (i < 128)
4911         /* Some ASCII character should be translated.  We give up
4912            shrinking.  */
4913         return;
4914     }
4915
4916   if (coding->heading_ascii >= 0)
4917     /* Detection routine has already found how much we can skip at the
4918        head.  */
4919     *beg += coding->heading_ascii;
4920
4921   if (str)
4922     {
4923       begp_orig = begp = str + *beg;
4924       endp_orig = endp = str + *end;
4925     }
4926   else
4927     {
4928       begp_orig = begp = BYTE_POS_ADDR (*beg);
4929       endp_orig = endp = begp + *end - *beg;
4930     }
4931
4932   eol_conversion = (coding->eol_type == CODING_EOL_CR
4933                     || coding->eol_type == CODING_EOL_CRLF);
4934
4935   switch (coding->type)
4936     {
4937     case coding_type_sjis:
4938     case coding_type_big5:
4939       /* We can skip all ASCII characters at the head.  */
4940       if (coding->heading_ascii < 0)
4941         {
4942           if (eol_conversion)
4943             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4944           else
4945             while (begp < endp && *begp < 0x80) begp++;
4946         }
4947       /* We can skip all ASCII characters at the tail except for the
4948          second byte of SJIS or BIG5 code.  */
4949       if (eol_conversion)
4950         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4951       else
4952         while (begp < endp && endp[-1] < 0x80) endp--;
4953       /* Do not consider LF as ascii if preceded by CR, since that
4954          confuses eol decoding. */
4955       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4956         endp++;
4957       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4958         endp++;
4959       break;
4960
4961     case coding_type_iso2022:
4962       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4963         /* We can't skip any data.  */
4964         break;
4965       if (coding->heading_ascii < 0)
4966         {
4967           /* We can skip all ASCII characters at the head except for a
4968              few control codes.  */
4969           while (begp < endp && (c = *begp) < 0x80
4970                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4971                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4972                  && (!eol_conversion || c != ISO_CODE_LF))
4973             begp++;
4974         }
4975       switch (coding->category_idx)
4976         {
4977         case CODING_CATEGORY_IDX_ISO_8_1:
4978         case CODING_CATEGORY_IDX_ISO_8_2:
4979           /* We can skip all ASCII characters at the tail.  */
4980           if (eol_conversion)
4981             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4982           else
4983             while (begp < endp && endp[-1] < 0x80) endp--;
4984           /* Do not consider LF as ascii if preceded by CR, since that
4985              confuses eol decoding. */
4986           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4987             endp++;
4988           break;
4989
4990         case CODING_CATEGORY_IDX_ISO_7:
4991         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4992           {
4993             /* We can skip all characters at the tail except for 8-bit
4994                codes and ESC and the following 2-byte at the tail.  */
4995             unsigned char *eight_bit = NULL;
4996
4997             if (eol_conversion)
4998               while (begp < endp
4999                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5000                 {
5001                   if (!eight_bit && c & 0x80) eight_bit = endp;
5002                   endp--;
5003                 }
5004             else
5005               while (begp < endp
5006                      && (c = endp[-1]) != ISO_CODE_ESC)
5007                 {
5008                   if (!eight_bit && c & 0x80) eight_bit = endp;
5009                   endp--;
5010                 }
5011             /* Do not consider LF as ascii if preceded by CR, since that
5012                confuses eol decoding. */
5013             if (begp < endp && endp < endp_orig
5014                 && endp[-1] == '\r' && endp[0] == '\n')
5015               endp++;
5016             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5017               {
5018                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5019                   /* This is an ASCII designation sequence.  We can
5020                      surely skip the tail.  But, if we have
5021                      encountered an 8-bit code, skip only the codes
5022                      after that.  */
5023                   endp = eight_bit ? eight_bit : endp + 2;
5024                 else
5025                   /* Hmmm, we can't skip the tail.  */
5026                   endp = endp_orig;
5027               }
5028             else if (eight_bit)
5029               endp = eight_bit;
5030           }
5031         }
5032       break;
5033
5034     default:
5035       abort ();
5036     }
5037   *beg += begp - begp_orig;
5038   *end += endp - endp_orig;
5039   return;
5040 }
5041
/* Like shrink_decoding_region but for encoding: advance *BEG and
   pull back *END (byte positions, or indices into STR if STR is not
   NULL) over text at the head and tail that needs no encoding by
   CODING.  *BEG and *END are left untouched when nothing can be
   skipped.  */

static void
shrink_encoding_region (beg, end, coding, str)
     int *beg, *end;
     struct coding_system *coding;
     unsigned char *str;
{
  unsigned char *begp_orig, *begp, *endp_orig, *endp;
  int eol_conversion;
  Lisp_Object translation_table;

  if (coding->type == coding_type_ccl
      || coding->eol_type == CODING_EOL_CRLF
      || coding->eol_type == CODING_EOL_CR
      || (coding->cmp_data && coding->cmp_data->used > 0))
    {
      /* We can't skip any data.  */
      return;
    }
  if (coding->type == coding_type_no_conversion
      || coding->type == coding_type_raw_text
      || coding->type == coding_type_emacs_mule
      || coding->type == coding_type_undecided)
    {
      /* We need no conversion, but don't have to skip any data here.
         Encoding routine handles them effectively anyway.  */
      return;
    }

  translation_table = coding->translation_table_for_encode;
  if (NILP (translation_table) && !NILP (Venable_character_translation))
    translation_table = Vstandard_translation_table_for_encode;
  if (CHAR_TABLE_P (translation_table))
    {
      int i;
      for (i = 0; i < 128; i++)
        if (!NILP (CHAR_TABLE_REF (translation_table, i)))
          break;
      if (i < 128)
        /* Some ASCII character should be translated.  We give up
           shrinking.  */
        return;
    }

  if (str)
    {
      begp_orig = begp = str + *beg;
      endp_orig = endp = str + *end;
    }
  else
    {
      begp_orig = begp = BYTE_POS_ADDR (*beg);
      endp_orig = endp = begp + *end - *beg;
    }

  /* Because of the early return above for CODING_EOL_CRLF and
     CODING_EOL_CR, this is always zero here.  */
  eol_conversion = (coding->eol_type == CODING_EOL_CR
                    || coding->eol_type == CODING_EOL_CRLF);

  /* Here, we don't have to check coding->pre_write_conversion because
     the caller is expected to have handled it already.  */
  switch (coding->type)
    {
    case coding_type_iso2022:
      if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
        /* We can't skip any data.  */
        break;
      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
        {
          /* Skip leading ASCII, but only up to the beginning of the
             last line reached, so that the first line handed to the
             encoder starts at a line beginning.  */
          unsigned char *bol = begp;
          while (begp < endp && *begp < 0x80)
            {
              begp++;
              if (begp[-1] == '\n')
                bol = begp;
            }
          begp = bol;
          goto label_skip_tail;
        }
      /* fall down ... */

    case coding_type_sjis:
    case coding_type_big5:
      /* We can skip all ASCII characters at the head and tail.  */
      if (eol_conversion)
        while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
      else
        while (begp < endp && *begp < 0x80) begp++;
    label_skip_tail:
      if (eol_conversion)
        while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
      else
        while (begp < endp && *(endp - 1) < 0x80) endp--;
      break;

    default:
      abort ();
    }

  /* Report how far the head and tail moved.  */
  *beg += begp - begp_orig;
  *end += endp - endp_orig;
  return;
}
5145
/* As shrinking conversion region requires some overhead, we don't try
   shrinking if the length of conversion region is less than this
   value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the conversion region *BEG .. *END for CODING if it is
   longer than shrink_conversion_region_threshhold bytes.  BEG and END
   are pointers to the byte positions and may be updated.  If ENCODEP
   is nonzero, the region is shrunk for encoding, otherwise for
   decoding.  STR is passed through to the shrinking function.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (encodep) shrink_encoding_region (beg, end, coding, str);    \
        else shrink_decoding_region (beg, end, coding, str);            \
      }                                                                 \
  } while (0)
5159
/* Reset inhibit_pre_post_conversion to zero and return Qnil.  DUMMY
   is ignored.  NOTE(review): judging by the name and signature, this
   is meant to be registered as an unwind handler -- confirm at the
   call sites.  */
static Lisp_Object
code_convert_region_unwind (dummy)
     Lisp_Object dummy;
{
  inhibit_pre_post_conversion = 0;
  return Qnil;
}
5167
/* Store information about all compositions in the range FROM and TO
   of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
   buffer or a string, defaults to the current buffer.

   Nothing is done if composition handling is disabled for CODING.
   Positions are recorded relative to FROM.  */

void
coding_save_composition (coding, from, to, obj)
     struct coding_system *coding;
     int from, to;
     Lisp_Object obj;
{
  Lisp_Object prop;
  int start, end;

  if (coding->composing == COMPOSITION_DISABLED)
    return;
  if (!coding->cmp_data)
    coding_allocate_composition_data (coding, from);
  /* Give up if no composition is found, or if the one found extends
     beyond TO.  */
  if (!find_composition (from, to, &start, &end, &prop, obj)
      || end > to)
    return;
  /* If the composition found starts before FROM, skip it and look
     for the next one after its end.  */
  if (start < from
      && (!find_composition (end, to, &start, &end, &prop, obj)
          || end > to))
    return;
  coding->composing = COMPOSITION_NO;
  do
    {
      if (COMPOSITION_VALID_P (start, end, prop))
        {
          enum composition_method method = COMPOSITION_METHOD (prop);
          /* Get a new memory block if the current one may not have
             room for one more bunch of composition data.  */
          if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
              >= COMPOSITION_DATA_SIZE)
            coding_allocate_composition_data (coding, from);
          /* For relative composition, we remember start and end
             positions, for the other compositions, we also remember
             components.  */
          CODING_ADD_COMPOSITION_START (coding, start - from, method);
          if (method != COMPOSITION_RELATIVE)
            {
              /* We must store all components of the composition.
                 They come as a list, a vector, a string, or a single
                 integer.  */
              Lisp_Object val, ch;

              val = COMPOSITION_COMPONENTS (prop);
              if (CONSP (val))
                while (CONSP (val))
                  {
                    ch = XCAR (val), val = XCDR (val);
                    CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
                  }
              else if (VECTORP (val) || STRINGP (val))
                {
                  int len = (VECTORP (val)
                             ? XVECTOR (val)->size : SCHARS (val));
                  int i;
                  for (i = 0; i < len; i++)
                    {
                      ch = (STRINGP (val)
                            ? Faref (val, make_number (i))
                            : XVECTOR (val)->contents[i]);
                      CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
                    }
                }
              else              /* INTEGERP (val) */
                CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
            }
          CODING_ADD_COMPOSITION_END (coding, end - from);
        }
      /* Continue the search after this composition.  */
      start = end;
    }
  while (start < to
         && find_composition (start, to, &start, &end, &prop, obj)
         && end <= to);

  /* Make coding->cmp_data point to the first memory block.  */
  while (coding->cmp_data->prev)
    coding->cmp_data = coding->cmp_data->prev;
  coding->cmp_data_start = 0;
}
5246
5247 /* Reflect the saved information about compositions to OBJ.
5248    CODING->cmp_data points to a memory block for the information.  OBJ
5249    is a buffer or a string, defaults to the current buffer.

        Nothing is done if CODING has no composition data.  Otherwise the
        chain of blocks is walked from the first one (reached by following
        the `prev' links) to the last (following `next').  Each block
        holds variable-length records of ints, read here as:

          data[0]      length of the whole record, header included
          data[1]      start position given to compose_text
          data[2]      end position given to compose_text
          data[3]      composition method
          data[4...]   components (not used for COMPOSITION_RELATIVE)

        A record whose length is not positive ends the scan of its
        block.  A record whose component count does not fit the local
        argument array is skipped instead of being composed.  */
5250
5251 void
5252 coding_restore_composition (coding, obj)
5253      struct coding_system *coding;
5254      Lisp_Object obj;
5255 {
5256   struct composition_data *cmp_data = coding->cmp_data;
5257
5258   if (!cmp_data)
5259     return;
5260
       /* Rewind to the first block of the chain.  */
5261   while (cmp_data->prev)
5262     cmp_data = cmp_data->prev;
5263
5264   while (cmp_data)
5265     {
5266       int i;
5267
           /* data[i] is the length of the record starting at index I, so
              adding it steps to the next record.  */
5268       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5269            i += cmp_data->data[i])
5270         {
5271           int *data = cmp_data->data + i;
5272           enum composition_method method = (enum composition_method) data[3];
5273           Lisp_Object components;
5274
5275           if (method == COMPOSITION_RELATIVE)
5276             components = Qnil;
5277           else
5278             {
5279               int len = data[0] - 4, j;
5280               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5281
                   /* Invalid composition data: the record is shorter than
                      its header or carries more components than ARGS can
                      hold.  Skip it rather than write past ARGS.  */
                   if (len < 0 || len > (int) (sizeof args / sizeof args[0]))
                     continue;

5282               for (j = 0; j < len; j++)
5283                 args[j] = make_number (data[4 + j]);
                   /* Components become a string for
                      COMPOSITION_WITH_ALTCHARS and a vector for the other
                      methods.  */
5284               components = (method == COMPOSITION_WITH_ALTCHARS
5285                             ? Fstring (len, args) : Fvector (len, args));
5286             }
5287           compose_text (data[1], data[2], components, Qnil, obj);
5288         }
5289       cmp_data = cmp_data->next;
5290     }
5291 }
5292
5293 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5294    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5295    coding system CODING, and return the status code of code conversion
5296    (currently, this value has no meaning).
5297
5298    How many characters (and bytes) are converted to how many
5299    characters (and bytes) are recorded in members of the structure
5300    CODING.
5301
5302    If REPLACE is nonzero, we do various things as if the original text
5303    is deleted and a new text is inserted.  See the comments in
5304    replace_range (insdel.c) to know what we are doing.
5305
5306    If REPLACE is zero, it is assumed that the source text is unibyte.
5307    Otherwise, it is assumed that the source text is multibyte.

        The conversion is done in place in the current buffer: the source
        text is temporarily accounted as the tail of the gap, the
        converted text is written at the head of the gap, and the gap is
        enlarged whenever the room between the two runs short (see the
        diagrams inside the loop).  Every return below returns 0.  */
5308
5309 int
5310 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5311      int from, from_byte, to, to_byte, encodep, replace;
5312      struct coding_system *coding;
5313 {
5314   int len = to - from, len_byte = to_byte - from_byte;
5315   int nchars_del = 0, nbytes_del = 0;
5316   int require, inserted, inserted_byte;
5317   int head_skip, tail_skip, total_skip = 0;
5318   Lisp_Object saved_coding_symbol;
5319   int first = 1;
5320   unsigned char *src, *dst;
5321   Lisp_Object deletion;
5322   int orig_point = PT, orig_len = len;
5323   int prev_Z;
5324   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5325
5326   deletion = Qnil;
5327   saved_coding_symbol = coding->symbol;
5328
       /* If point is strictly inside the region, park it at FROM and
          remember that as the position to restore at the end.  */
5329   if (from < PT && PT < to)
5330     {
5331       TEMP_SET_PT_BOTH (from, from_byte);
5332       orig_point = from;
5333     }
5334
5335   if (replace)
5336     {
5337       int saved_from = from;
5338       int saved_inhibit_modification_hooks;
5339
5340       prepare_to_modify_buffer (from, to, &from);
           /* prepare_to_modify_buffer may have updated FROM; recompute
              everything derived from it.  */
5341       if (saved_from != from)
5342         {
5343           to = from + len;
5344           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5345           len_byte = to_byte - from_byte;
5346         }
5347
5348       /* The code conversion routine can not preserve text properties
5349          for now.  So, we must remove all text properties in the
5350          region.  Here, we must suppress all modification hooks.  */
5351       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5352       inhibit_modification_hooks = 1;
5353       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5354       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5355     }
5356
5357   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5358     {
5359       /* We must detect encoding of text and eol format.  */
5360
           /* Make the region contiguous in memory before scanning it.  */
5361       if (from < GPT && to > GPT)
5362         move_gap_both (from, from_byte);
5363       if (coding->type == coding_type_undecided)
5364         {
5365           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5366           if (coding->type == coding_type_undecided)
5367             {
5368               /* It seems that the text contains only ASCII, but we
5369                  should not leave it undecided because the deeper
5370                  decoding routine (decode_coding) tries to detect the
5371                  encodings again in vain.  */
5372               coding->type = coding_type_emacs_mule;
5373               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5374               /* As emacs-mule decoder will handle composition, we
5375                  need this setting to allocate coding->cmp_data
5376                  later.  */
5377               coding->composing = COMPOSITION_NO;
5378             }
5379         }
5380       if (coding->eol_type == CODING_EOL_UNDECIDED
5381           && coding->type != coding_type_ccl)
5382         {
5383           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5384           if (coding->eol_type == CODING_EOL_UNDECIDED)
5385             coding->eol_type = CODING_EOL_LF;
5386           /* We had better recover the original eol format if we
5387              encounter an inconsistent eol format while decoding.  */
5388           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5389         }
5390     }
5391
5392   /* Now we convert the text.  */
5393
5394   /* For encoding, we must process pre-write-conversion in advance.  */
5395   if (! inhibit_pre_post_conversion
5396       && encodep
5397       && SYMBOLP (coding->pre_write_conversion)
5398       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5399     {
5400       /* The function in pre-write-conversion may put a new text in a
5401          new buffer.  */
5402       struct buffer *prev = current_buffer;
5403       Lisp_Object new;
5404
5405       record_unwind_protect (code_convert_region_unwind, Qnil);
5406       /* We should not call any more pre-write/post-read-conversion
5407          functions while this pre-write-conversion is running.  */
5408       inhibit_pre_post_conversion = 1;
5409       call2 (coding->pre_write_conversion,
5410              make_number (from), make_number (to));
5411       inhibit_pre_post_conversion = 0;
5412       /* Discard the unwind protect.  */
5413       specpdl_ptr--;
5414
           /* The function left us in another buffer: replace the region
              by the accessible portion of that buffer, kill it, and
              recompute the positions for the new text.  */
5415       if (current_buffer != prev)
5416         {
5417           len = ZV - BEGV;
5418           new = Fcurrent_buffer ();
5419           set_buffer_internal_1 (prev);
5420           del_range_2 (from, from_byte, to, to_byte, 0);
5421           TEMP_SET_PT_BOTH (from, from_byte);
5422           insert_from_buffer (XBUFFER (new), 1, len, 0);
5423           Fkill_buffer (new);
5424           if (orig_point >= to)
5425             orig_point += len - orig_len;
5426           else if (orig_point > from)
5427             orig_point = from;
5428           orig_len = len;
5429           to = from + len;
5430           from_byte = CHAR_TO_BYTE (from);
5431           to_byte = CHAR_TO_BYTE (to);
5432           len_byte = to_byte - from_byte;
5433           TEMP_SET_PT_BOTH (from, from_byte);
5434         }
5435     }
5436
       /* Remember what is being replaced: the text itself unless the
          undo list is t, in which case only its size is kept.  */
5437   if (replace)
5438     {
5439       if (! EQ (current_buffer->undo_list, Qt))
5440         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5441       else
5442         {
5443           nchars_del = to - from;
5444           nbytes_del = to_byte - from_byte;
5445         }
5446     }
5447
5448   if (coding->composing != COMPOSITION_DISABLED)
5449     {
5450       if (encodep)
5451         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5452       else
5453         coding_allocate_composition_data (coding, from);
5454     }
5455
5456   /* Try to skip the heading and tailing ASCIIs.  */
5457   if (coding->type != coding_type_ccl)
5458     {
5459       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5460
5461       if (from < GPT && GPT < to)
5462         move_gap_both (from, from_byte);
5463       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5464       if (from_byte == to_byte
5465           && (encodep || NILP (coding->post_read_conversion))
5466           && ! CODING_REQUIRE_FLUSHING (coding))
5467         {
5468           coding->produced = len_byte;
5469           coding->produced_char = len;
5470           if (!replace)
5471             /* We must record and adjust for this new text now.  */
5472             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
               /* The composition data set up above is released on the
                  normal exit path; release it on this shortcut too so
                  that it is not left allocated.  */
               coding_free_composition_data (coding);
5473           return 0;
5474         }
5475
           /* The skipped bytes are subtracted from the character
              positions as well, i.e. each skipped byte is counted as one
              character.  */
5476       head_skip = from_byte - from_byte_orig;
5477       tail_skip = to_byte_orig - to_byte;
5478       total_skip = head_skip + tail_skip;
5479       from += head_skip;
5480       to -= tail_skip;
5481       len -= total_skip; len_byte -= total_skip;
5482     }
5483
5484   /* For conversion, we must put the gap before the text in addition to
5485      making the gap larger for efficient decoding.  The required gap
5486      size starts from 2000 which is the magic number used in make_gap.
5487      But, after one batch of conversion, it will be incremented if we
5488      find that it is not enough.  */
5489   require = 2000;
5490
5491   if (GAP_SIZE  < require)
5492     make_gap (require - GAP_SIZE);
5493   move_gap_both (from, from_byte);
5494
5495   inserted = inserted_byte = 0;
5496
       /* Take the source text out of the buffer accounting: from now on
          it is treated as the tail of the gap.  */
5497   GAP_SIZE += len_byte;
5498   ZV -= len;
5499   Z -= len;
5500   ZV_BYTE -= len_byte;
5501   Z_BYTE -= len_byte;
5502
5503   if (GPT - BEG < BEG_UNCHANGED)
5504     BEG_UNCHANGED = GPT - BEG;
5505   if (Z - GPT < END_UNCHANGED)
5506     END_UNCHANGED = Z - GPT;
5507
5508   if (!encodep && coding->src_multibyte)
5509     {
5510       /* Decoding routines expects that the source text is unibyte.
5511          We must convert 8-bit characters of multibyte form to
5512          unibyte.  */
5513       int len_byte_orig = len_byte;
5514       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* Keep the (now shorter) source flush against the gap end.  */
5515       if (len_byte < len_byte_orig)
5516         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5517                     len_byte);
5518       coding->src_multibyte = 0;
5519     }
5520
5521   for (;;)
5522     {
5523       int result;
5524
5525       /* The buffer memory is now:
5526          +--------+converted-text+---------+-------original-text-------+---+
5527          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5528                   |<---------------------- GAP ----------------------->|  */
5529       src = GAP_END_ADDR - len_byte;
5530       dst = GPT_ADDR + inserted_byte;
5531
5532       if (encodep)
5533         result = encode_coding (coding, src, dst, len_byte, 0);
5534       else
5535         {
5536           if (coding->composing != COMPOSITION_DISABLED)
5537             coding->cmp_data->char_offset = from + inserted;
5538           result = decode_coding (coding, src, dst, len_byte, 0);
5539         }
5540
5541       /* The buffer memory is now:
5542          +--------+-------converted-text----+--+------original-text----+---+
5543          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5544                   |<---------------------- GAP ----------------------->|  */
5545
5546       inserted += coding->produced_char;
5547       inserted_byte += coding->produced;
5548       len_byte -= coding->consumed;
5549
           /* The composition data block is full: chain a new one and
              resume the conversion.  */
5550       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5551         {
5552           coding_allocate_composition_data (coding, from + inserted);
5553           continue;
5554         }
5555
5556       src += coding->consumed;
5557       dst += coding->produced;
5558
5559       if (result == CODING_FINISH_NORMAL)
5560         {
5561           src += len_byte;
5562           break;
5563         }
5564       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5565         {
5566           unsigned char *pend = dst, *p = pend - inserted_byte;
5567           Lisp_Object eol_type;
5568
5569           /* Encode LFs back to the original eol format (CR or CRLF).  */
5570           if (coding->eol_type == CODING_EOL_CR)
5571             {
5572               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5573             }
5574           else
5575             {
5576               int count = 0;
5577
5578               while (p < pend) if (*p++ == '\n') count++;
5579               if (src - dst < count)
5580                 {
5581                   /* We don't have sufficient room for encoding LFs
5582                      back to CRLF.  We must record converted and
5583                      not-yet-converted text back to the buffer
5584                      content, enlarge the gap, then record them out of
5585                      the buffer contents again.  */
5586                   int add = len_byte + inserted_byte;
5587
5588                   GAP_SIZE -= add;
5589                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5590                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5591                   make_gap (count - GAP_SIZE);
5592                   GAP_SIZE += add;
5593                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5594                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5595                   /* Don't forget to update SRC, DST, and PEND.  */
5596                   src = GAP_END_ADDR - len_byte;
5597                   dst = GPT_ADDR + inserted_byte;
5598                   pend = dst;
5599                 }
5600               inserted += count;
5601               inserted_byte += count;
5602               coding->produced += count;
                   /* Copy backward from the end so that the text can be
                      expanded in place, putting a CR before each LF.  */
5603               p = dst = pend + count;
5604               while (count)
5605                 {
5606                   *--p = *--pend;
5607                   if (*p == '\n') count--, *--p = '\r';
5608                 }
5609             }
5610
5611           /* Suppress eol-format conversion in the further conversion.  */
5612           coding->eol_type = CODING_EOL_LF;
5613
5614           /* Set the coding system symbol to that for Unix-like EOL.  */
5615           eol_type = Fget (saved_coding_symbol, Qeol_type);
5616           if (VECTORP (eol_type)
5617               && XVECTOR (eol_type)->size == 3
5618               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5619             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5620           else
5621             coding->symbol = saved_coding_symbol;
5622
5623           continue;
5624         }
5625       if (len_byte <= 0)
5626         {
               /* All source consumed.  A CCL coding system gets one more
                  round with CODING_MODE_LAST_BLOCK set before we stop.  */
5627           if (coding->type != coding_type_ccl
5628               || coding->mode & CODING_MODE_LAST_BLOCK)
5629             break;
5630           coding->mode |= CODING_MODE_LAST_BLOCK;
5631           continue;
5632         }
5633       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5634         {
5635           /* The source text ends in invalid codes.  Let's just
5636              make them valid buffer contents, and finish conversion.  */
5637           if (multibyte_p)
5638             {
5639               unsigned char *start = dst;
5640
5641               inserted += len_byte;
5642               while (len_byte--)
5643                 {
5644                   int c = *src++;
5645                   dst += CHAR_STRING (c, dst);
5646                 }
5647
5648               inserted_byte += dst - start;
5649             }
5650           else
5651             {
5652               inserted += len_byte;
5653               inserted_byte += len_byte;
5654               while (len_byte--)
5655                 *dst++ = *src++;
5656             }
5657           break;
5658         }
5659       if (result == CODING_FINISH_INTERRUPT)
5660         {
5661           /* The conversion procedure was interrupted by a user.  */
5662           break;
5663         }
5664       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5665       if (coding->consumed < 1)
5666         {
5667           /* It's quite strange to require more memory without
5668              consuming any bytes.  Perhaps CCL program bug.  */
5669           break;
5670         }
5671       if (first)
5672         {
5673           /* We have just done the first batch of conversion which was
5674              stopped because of insufficient gap.  Let's reconsider the
5675              required gap size (i.e. SRC - DST) now.
5676
5677              We have converted ORIG bytes (== coding->consumed) into
5678              NEW bytes (coding->produced).  To convert the remaining
5679              LEN bytes, we may need REQUIRE bytes of gap, where:
5680                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5681                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5682              Here, we are sure that NEW >= ORIG.  */
5683           float ratio;
5684
5685           if (coding->produced <= coding->consumed)
5686             {
5687               /* This happens because of CCL-based coding system with
5688                  eol-type CRLF.  */
5689               require = 0;
5690             }
5691           else
5692             {
                   /* Divide in floating point; an integer division here
                      would drop the fractional part of the growth ratio
                      and make REQUIRE too small (often zero).  */
5693               ratio = ((float) (coding->produced - coding->consumed)
                            / coding->consumed);
5694               require = len_byte * ratio;
5695             }
5696           first = 0;
5697         }
5698       if ((src - dst) < (require + 2000))
5699         {
5700           /* See the comment above the previous call of make_gap.  */
5701           int add = len_byte + inserted_byte;
5702
5703           GAP_SIZE -= add;
5704           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5705           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5706           make_gap (require + 2000);
5707           GAP_SIZE += add;
5708           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5709           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5710         }
5711     }
5712   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5713
5714   if (encodep && coding->dst_multibyte)
5715     {
5716       /* The output is unibyte.  We must convert 8-bit characters to
5717          multibyte form.  */
5718       if (inserted_byte * 2 > GAP_SIZE)
5719         {
5720           GAP_SIZE -= inserted_byte;
5721           ZV += inserted_byte; Z += inserted_byte;
5722           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5723           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5724           make_gap (inserted_byte - GAP_SIZE);
5725           GAP_SIZE += inserted_byte;
5726           ZV -= inserted_byte; Z -= inserted_byte;
5727           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5728           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5729         }
5730       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5731     }
5732
5733   /* If we shrank the conversion area, adjust it now.  */
5734   if (total_skip > 0)
5735     {
           /* Bring the skipped tail next to the converted text, then
              widen the region again by the skipped head and tail.  */
5736       if (tail_skip > 0)
5737         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5738       inserted += total_skip; inserted_byte += total_skip;
5739       GAP_SIZE += total_skip;
5740       GPT -= head_skip; GPT_BYTE -= head_skip;
5741       ZV -= total_skip; ZV_BYTE -= total_skip;
5742       Z -= total_skip; Z_BYTE -= total_skip;
5743       from -= head_skip; from_byte -= head_skip;
5744       to += tail_skip; to_byte += tail_skip;
5745     }
5746
       /* Let the insdel routines account for the new text; INSERTED is
          then recomputed from the change of Z.  */
5747   prev_Z = Z;
5748   if (! EQ (current_buffer->undo_list, Qt))
5749     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5750   else
5751     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5752                                  inserted, inserted_byte);
5753   inserted = Z - prev_Z;
5754
5755   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5756     coding_restore_composition (coding, Fcurrent_buffer ());
5757   coding_free_composition_data (coding);
5758
5759   if (! inhibit_pre_post_conversion
5760       && ! encodep && ! NILP (coding->post_read_conversion))
5761     {
5762       Lisp_Object val;
5763
5764       if (from != PT)
5765         TEMP_SET_PT_BOTH (from, from_byte);
5766       prev_Z = Z;
5767       record_unwind_protect (code_convert_region_unwind, Qnil);
5768       /* We should not call any more pre-write/post-read-conversion
5769          functions while this post-read-conversion is running.  */
5770       inhibit_pre_post_conversion = 1;
5771       val = call1 (coding->post_read_conversion, make_number (inserted));
5772       inhibit_pre_post_conversion = 0;
5773       /* Discard the unwind protect.  */
5774       specpdl_ptr--;
5775       CHECK_NUMBER (val);
           /* The returned number is only type-checked; the size change
              is taken from the change of Z.  */
5776       inserted += Z - prev_Z;
5777     }
5778
       /* Put point back: shifted by the size change if it was at or
          after the end of the region, at FROM if it was inside.  */
5779   if (orig_point >= from)
5780     {
5781       if (orig_point >= from + orig_len)
5782         orig_point += inserted - orig_len;
5783       else
5784         orig_point = from;
5785       TEMP_SET_PT (orig_point);
5786     }
5787
5788   if (replace)
5789     {
5790       signal_after_change (from, to - from, inserted);
5791       update_compositions (from, from + inserted, CHECK_BORDER);
5792     }
5793
5794   {
5795     coding->consumed = to_byte - from_byte;
5796     coding->consumed_char = to - from;
5797     coding->produced = inserted_byte;
5798     coding->produced_char = inserted;
5799   }
5800
5801   return 0;
5802 }
5803
     /* Run the pre-write-conversion (if ENCODEP is nonzero) or the
        post-read-conversion (if ENCODEP is zero) function of CODING on
        the text of the string STR, and return the resulting text as a
        string.

        The function is run in the working buffer
        " *code-converting-work*", which is emptied, given the
        multibyteness of STR and filled with STR.  The
        pre-write-conversion function is called with BEG and Z; the
        post-read-conversion function is called with the number of
        characters in the buffer, with point at BEG.  The value is made
        from the whole contents of the buffer that is current after the
        call.

        The buffer that was current on entry is restored by unbind_to,
        and Vdeactivate_mark is restored by hand.  */
5804 Lisp_Object
5805 run_pre_post_conversion_on_str (str, coding, encodep)
5806      Lisp_Object str;
5807      struct coding_system *coding;
5808      int encodep;
5809 {
5810   int count = SPECPDL_INDEX ();
5811   struct gcpro gcpro1, gcpro2;
5812   int multibyte = STRING_MULTIBYTE (str);
5813   Lisp_Object buffer;
5814   struct buffer *buf;
5815   Lisp_Object old_deactivate_mark;
5816
       /* Both entries are run by the unbind_to at the end.  The first
          makes the original buffer current again.
          NOTE(review): code_convert_region_unwind is defined elsewhere;
          presumably it resets inhibit_pre_post_conversion -- confirm.  */
5817   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5818   record_unwind_protect (code_convert_region_unwind, Qnil);
5819   /* It is not crucial to specbind this.  */
5820   old_deactivate_mark = Vdeactivate_mark;
5821   GCPRO2 (str, old_deactivate_mark);
5822
5823   buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5824   buf = XBUFFER (buffer);
5825
       /* The working buffer may be a reused one, so reset the state that
          matters here: writable, no file name, undo_list set to t, no
          overlays.  */
5826   buf->directory = current_buffer->directory;
5827   buf->read_only = Qnil;
5828   buf->filename = Qnil;
5829   buf->undo_list = Qt;
5830   buf->overlays_before = Qnil;
5831   buf->overlays_after = Qnil;
5832
5833   set_buffer_internal (buf);
5834   /* We must insert the contents of STR as is without
5835      unibyte<->multibyte conversion.  For that, we adjust the
5836      multibyteness of the working buffer to that of STR.  */
5837   Ferase_buffer ();
5838   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
5839
5840   insert_from_string (str, 0, 0,
5841                       SCHARS (str), SBYTES (str), 0);
5842   UNGCPRO;
       /* Keep the conversion function from triggering further
          pre-write/post-read conversions while it runs.  */
5843   inhibit_pre_post_conversion = 1;
5844   if (encodep)
5845     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5846   else
5847     {
5848       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5849       call1 (coding->post_read_conversion, make_number (Z - BEG));
5850     }
5851   inhibit_pre_post_conversion = 0;
5852   Vdeactivate_mark = old_deactivate_mark;
       /* Take the result from BEG to Z of the buffer that is current
          now, then unwind (which switches back to the original
          buffer).  */
5853   str = make_buffer_string (BEG, Z, 1);
5854   return unbind_to (count, str);
5855 }
5856
     /* Decode the string STR according to CODING and return the result.

        If CODING requires detection, the coding type and the eol format
        are detected from the bytes of STR first.  A multibyte STR is
        converted with Fstring_as_unibyte before decoding.  The head and
        tail of STR skipped by SHRINK_CONVERSION_REGION are copied to the
        result unchanged; only the part in between goes through
        decode_coding.  Compositions recorded while decoding are applied
        to the result, and if CODING->post_read_conversion is a symbol
        with a function definition it is run on the result.

        If no decoding is required, the value is STR (after
        Fstring_as_multibyte if CODING->dst_multibyte is set), or a copy
        of it made with Fcopy_sequence when NOCOPY is still zero at that
        point.

        CODING->consumed, consumed_char, produced and produced_char are
        updated: with the sizes of STR when no decoding is required,
        otherwise with the totals accumulated over the decode_coding
        calls (the skipped head and tail are not counted).  */
5857 Lisp_Object
5858 decode_coding_string (str, coding, nocopy)
5859      Lisp_Object str;
5860      struct coding_system *coding;
5861      int nocopy;
5862 {
5863   int len;
5864   struct conversion_buffer buf;
5865   int from, to_byte;
5866   Lisp_Object saved_coding_symbol;
5867   int result;
5868   int require_decoding;
5869   int shrinked_bytes = 0;
5870   Lisp_Object newstr;
5871   int consumed, consumed_char, produced, produced_char;
5872
       /* FROM and TO_BYTE are byte offsets into STR delimiting the part
          still to be decoded.  */
5873   from = 0;
5874   to_byte = SBYTES (str);
5875
5876   saved_coding_symbol = coding->symbol;
5877   coding->src_multibyte = STRING_MULTIBYTE (str);
5878   coding->dst_multibyte = 1;
5879   if (CODING_REQUIRE_DETECTION (coding))
5880     {
5881       /* See the comments in code_convert_region.  */
5882       if (coding->type == coding_type_undecided)
5883         {
5884           detect_coding (coding, SDATA (str), to_byte);
5885           if (coding->type == coding_type_undecided)
5886             {
5887               coding->type = coding_type_emacs_mule;
5888               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5889               /* As emacs-mule decoder will handle composition, we
5890                  need this setting to allocate coding->cmp_data
5891                  later.  */
5892               coding->composing = COMPOSITION_NO;
5893             }
5894         }
5895       if (coding->eol_type == CODING_EOL_UNDECIDED
5896           && coding->type != coding_type_ccl)
5897         {
5898           saved_coding_symbol = coding->symbol;
5899           detect_eol (coding, SDATA (str), to_byte);
5900           if (coding->eol_type == CODING_EOL_UNDECIDED)
5901             coding->eol_type = CODING_EOL_LF;
5902           /* We had better recover the original eol format if we
5903              encounter an inconsistent eol format while decoding.  */
5904           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5905         }
5906     }
5907
       /* These two types produce a unibyte result.  */
5908   if (coding->type == coding_type_no_conversion
5909       || coding->type == coding_type_raw_text)
5910     coding->dst_multibyte = 0;
5911
5912   require_decoding = CODING_REQUIRE_DECODING (coding);
5913
5914   if (STRING_MULTIBYTE (str))
5915     {
5916       /* Decoding routines expect the source text to be unibyte.  */
5917       str = Fstring_as_unibyte (str);
5918       to_byte = SBYTES (str);
           /* NOTE(review): NOCOPY is forced on here, presumably because
              STR is now a fresh string that need not be copied again --
              confirm against Fstring_as_unibyte.  */
5919       nocopy = 1;
5920       coding->src_multibyte = 0;
5921     }
5922
5923   /* Try to skip the heading and tailing ASCIIs.  */
5924   if (require_decoding && coding->type != coding_type_ccl)
5925     {
5926       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
5927                                 0);
5928       if (from == to_byte)
5929         require_decoding = 0;
           /* Bytes skipped at the head plus bytes skipped at the tail.  */
5930       shrinked_bytes = from + (SBYTES (str) - to_byte);
5931     }
5932
5933   if (!require_decoding)
5934     {
5935       coding->consumed = SBYTES (str);
5936       coding->consumed_char = SCHARS (str);
5937       if (coding->dst_multibyte)
5938         {
5939           str = Fstring_as_multibyte (str);
5940           nocopy = 1;
5941         }
5942       coding->produced = SBYTES (str);
5943       coding->produced_char = SCHARS (str);
5944       return (nocopy ? str : Fcopy_sequence (str));
5945     }
5946
5947   if (coding->composing != COMPOSITION_DISABLED)
5948     coding_allocate_composition_data (coding, from);
5949   len = decoding_buffer_size (coding, to_byte - from);
5950   allocate_conversion_buffer (buf, len);
5951
       /* Decode in rounds; each round continues after the bytes consumed
          and produced so far.  The loop ends on CODING_FINISH_NORMAL, or
          on CODING_FINISH_INSUFFICIENT_SRC when the round consumed
          nothing.  */
5952   consumed = consumed_char = produced = produced_char = 0;
5953   while (1)
5954     {
5955       result = decode_coding (coding, SDATA (str) + from + consumed,
5956                               buf.data + produced, to_byte - from - consumed,
5957                               buf.size - produced);
5958       consumed += coding->consumed;
5959       consumed_char += coding->consumed_char;
5960       produced += coding->produced;
5961       produced_char += coding->produced_char;
5962       if (result == CODING_FINISH_NORMAL
5963           || (result == CODING_FINISH_INSUFFICIENT_SRC
5964               && coding->consumed == 0))
5965         break;
5966       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5967         coding_allocate_composition_data (coding, from + produced_char);
5968       else if (result == CODING_FINISH_INSUFFICIENT_DST)
5969         extend_conversion_buffer (&buf);
5970       else if (result == CODING_FINISH_INCONSISTENT_EOL)
5971         {
5972           Lisp_Object eol_type;
5973
5974           /* Recover the original EOL format.  */
5975           if (coding->eol_type == CODING_EOL_CR)
5976             {
5977               unsigned char *p;
5978               for (p = buf.data; p < buf.data + produced; p++)
5979                 if (*p == '\n') *p = '\r';
5980             }
5981           else if (coding->eol_type == CODING_EOL_CRLF)
5982             {
5983               int num_eol = 0;
5984               unsigned char *p0, *p1;
5985               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5986                 if (*p0 == '\n') num_eol++;
5987               if (produced + num_eol >= buf.size)
5988                 extend_conversion_buffer (&buf);
                   /* Copy backward from the end so that the text can be
                      expanded in place, putting a CR before each LF.  */
5989               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5990                 {
5991                   *--p1 = *--p0;
5992                   if (*p0 == '\n') *--p1 = '\r';
5993                 }
5994               produced += num_eol;
5995               produced_char += num_eol;
5996             }
5997           /* Suppress eol-format conversion in the further conversion.  */
5998           coding->eol_type = CODING_EOL_LF;
5999
6000           /* Set the coding system symbol to that for Unix-like EOL.  */
6001           eol_type = Fget (saved_coding_symbol, Qeol_type);
6002           if (VECTORP (eol_type)
6003               && XVECTOR (eol_type)->size == 3
6004               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6005             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6006           else
6007             coding->symbol = saved_coding_symbol;
6008
6009
6010         }
6011     }
6012
6013   coding->consumed = consumed;
6014   coding->consumed_char = consumed_char;
6015   coding->produced = produced;
6016   coding->produced_char = produced_char;
6017
       /* Build the result: skipped head, decoded text, skipped tail.
          Each skipped byte is counted as one character.  */
6018   if (coding->dst_multibyte)
6019     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6020                                            produced + shrinked_bytes);
6021   else
6022     newstr = make_uninit_string (produced + shrinked_bytes);
6023   if (from > 0)
6024     STRING_COPYIN (newstr, 0, SDATA (str), from);
6025   STRING_COPYIN (newstr, from, buf.data, produced);
6026   if (shrinked_bytes > from)
6027     STRING_COPYIN (newstr, from + produced,
6028                    SDATA (str) + to_byte,
6029                    shrinked_bytes - from);
6030   free_conversion_buffer (&buf);
6031
6032   if (coding->cmp_data && coding->cmp_data->used)
6033     coding_restore_composition (coding, newstr);
6034   coding_free_composition_data (coding);
6035
6036   if (SYMBOLP (coding->post_read_conversion)
6037       && !NILP (Ffboundp (coding->post_read_conversion)))
6038     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6039
6040   return newstr;
6041 }
6042
6043 Lisp_Object
6044 encode_coding_string (str, coding, nocopy)
6045      Lisp_Object str;
6046      struct coding_system *coding;
6047      int nocopy;
6048 {
6049   int len;
6050   struct conversion_buffer buf;
6051   int from, to, to_byte;
6052   int result;
6053   int shrinked_bytes = 0;
6054   Lisp_Object newstr;
6055   int consumed, consumed_char, produced, produced_char;
6056
6057   if (SYMBOLP (coding->pre_write_conversion)
6058       && !NILP (Ffboundp (coding->pre_write_conversion)))
6059     str = run_pre_post_conversion_on_str (str, coding, 1);
6060
6061   from = 0;
6062   to = SCHARS (str);
6063   to_byte = SBYTES (str);
6064
6065   /* Encoding routines determine the multibyteness of the source text
6066      by coding->src_multibyte.  */
6067   coding->src_multibyte = STRING_MULTIBYTE (str);
6068   coding->dst_multibyte = 0;
6069   if (! CODING_REQUIRE_ENCODING (coding))
6070     {
6071       coding->consumed = SBYTES (str);
6072       coding->consumed_char = SCHARS (str);
6073       if (STRING_MULTIBYTE (str))
6074         {
6075           str = Fstring_as_unibyte (str);
6076           nocopy = 1;
6077         }
6078       coding->produced = SBYTES (str);
6079       coding->produced_char = SCHARS (str);
6080       return (nocopy ? str : Fcopy_sequence (str));
6081     }
6082
6083   if (coding->composing != COMPOSITION_DISABLED)
6084     coding_save_composition (coding, from, to, str);
6085
6086   /* Try to skip the heading and tailing ASCIIs.  */
6087   if (coding->type != coding_type_ccl)
6088     {
6089       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6090                                 1);
6091       if (from == to_byte)
6092         return (nocopy ? str : Fcopy_sequence (str));
6093       shrinked_bytes = from + (SBYTES (str) - to_byte);
6094     }
6095
6096   len = encoding_buffer_size (coding, to_byte - from);
6097   allocate_conversion_buffer (buf, len);
6098
6099   consumed = consumed_char = produced = produced_char = 0;
6100   while (1)
6101     {
6102       result = encode_coding (coding, SDATA (str) + from + consumed,
6103                               buf.data + produced, to_byte - from - consumed,
6104                               buf.size - produced);
6105       consumed += coding->consumed;
6106       consumed_char += coding->consumed_char;
6107       produced += coding->produced;
6108       produced_char += coding->produced_char;
6109       if (result == CODING_FINISH_NORMAL
6110           || (result == CODING_FINISH_INSUFFICIENT_SRC
6111               && coding->consumed == 0))
6112         break;
6113       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6114       extend_conversion_buffer (&buf);
6115     }
6116
6117   coding->consumed = consumed;
6118   coding->consumed_char = consumed_char;
6119   coding->produced = produced;
6120   coding->produced_char = produced_char;
6121
6122   newstr = make_uninit_string (produced + shrinked_bytes);
6123   if (from > 0)
6124     STRING_COPYIN (newstr, 0, SDATA (str), from);
6125   STRING_COPYIN (newstr, from, buf.data, produced);
6126   if (shrinked_bytes > from)
6127     STRING_COPYIN (newstr, from + produced,
6128                    SDATA (str) + to_byte,
6129                    shrinked_bytes - from);
6130
6131   free_conversion_buffer (&buf);
6132   coding_free_composition_data (coding);
6133
6134   return newstr;
6135 }
6136
6137 \f
6138 #ifdef emacs
6139 /*** 8. Emacs Lisp library functions ***/
6140
6141 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6142        doc: /* Return t if OBJECT is nil or a coding-system.
6143 See the documentation of `make-coding-system' for information
6144 about coding-system objects.  */)
6145      (obj)
6146      Lisp_Object obj;
6147 {
6148   if (NILP (obj))
6149     return Qt;
6150   if (!SYMBOLP (obj))
6151     return Qnil;
6152   /* Get coding-spec vector for OBJ.  */
6153   obj = Fget (obj, Qcoding_system);
6154   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6155           ? Qt : Qnil);
6156 }
6157
6158 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6159        Sread_non_nil_coding_system, 1, 1, 0,
6160        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6161      (prompt)
6162      Lisp_Object prompt;
6163 {
6164   Lisp_Object val;
6165   do
6166     {
6167       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6168                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6169     }
6170   while (SCHARS (val) == 0);
6171   return (Fintern (val, Qnil));
6172 }
6173
6174 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6175        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6176 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6177      (prompt, default_coding_system)
6178      Lisp_Object prompt, default_coding_system;
6179 {
6180   Lisp_Object val;
6181   if (SYMBOLP (default_coding_system))
6182     default_coding_system = SYMBOL_NAME (default_coding_system);
6183   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6184                           Qt, Qnil, Qcoding_system_history,
6185                           default_coding_system, Qnil);
6186   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6187 }
6188
6189 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6190        1, 1, 0,
6191        doc: /* Check validity of CODING-SYSTEM.
6192 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6193 It is valid if it is a symbol with a non-nil `coding-system' property.
6194 The value of property should be a vector of length 5.  */)
6195      (coding_system)
6196      Lisp_Object coding_system;
6197 {
6198   CHECK_SYMBOL (coding_system);
6199   if (!NILP (Fcoding_system_p (coding_system)))
6200     return coding_system;
6201   while (1)
6202     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6203 }
6204 \f
6205 Lisp_Object
6206 detect_coding_system (src, src_bytes, highest, multibytep)
6207      const unsigned char *src;
6208      int src_bytes, highest;
6209      int multibytep;
6210 {
6211   int coding_mask, eol_type;
6212   Lisp_Object val, tmp;
6213   int dummy;
6214
6215   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6216   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6217   if (eol_type == CODING_EOL_INCONSISTENT)
6218     eol_type = CODING_EOL_UNDECIDED;
6219
6220   if (!coding_mask)
6221     {
6222       val = Qundecided;
6223       if (eol_type != CODING_EOL_UNDECIDED)
6224         {
6225           Lisp_Object val2;
6226           val2 = Fget (Qundecided, Qeol_type);
6227           if (VECTORP (val2))
6228             val = XVECTOR (val2)->contents[eol_type];
6229         }
6230       return (highest ? val : Fcons (val, Qnil));
6231     }
6232
6233   /* At first, gather possible coding systems in VAL.  */
6234   val = Qnil;
6235   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6236     {
6237       Lisp_Object category_val, category_index;
6238
6239       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6240       category_val = Fsymbol_value (XCAR (tmp));
6241       if (!NILP (category_val)
6242           && NATNUMP (category_index)
6243           && (coding_mask & (1 << XFASTINT (category_index))))
6244         {
6245           val = Fcons (category_val, val);
6246           if (highest)
6247             break;
6248         }
6249     }
6250   if (!highest)
6251     val = Fnreverse (val);
6252
6253   /* Then, replace the elements with subsidiary coding systems.  */
6254   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6255     {
6256       if (eol_type != CODING_EOL_UNDECIDED
6257           && eol_type != CODING_EOL_INCONSISTENT)
6258         {
6259           Lisp_Object eol;
6260           eol = Fget (XCAR (tmp), Qeol_type);
6261           if (VECTORP (eol))
6262             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6263         }
6264     }
6265   return (highest ? XCAR (val) : val);
6266 }
6267
6268 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6269        2, 3, 0,
6270        doc: /* Detect coding system of the text in the region between START and END.
6271 Return a list of possible coding systems ordered by priority.
6272
6273 If only ASCII characters are found, it returns a list of single element
6274 `undecided' or its subsidiary coding system according to a detected
6275 end-of-line format.
6276
6277 If optional argument HIGHEST is non-nil, return the coding system of
6278 highest priority.  */)
6279      (start, end, highest)
6280      Lisp_Object start, end, highest;
6281 {
6282   int from, to;
6283   int from_byte, to_byte;
6284   int include_anchor_byte = 0;
6285
6286   CHECK_NUMBER_COERCE_MARKER (start);
6287   CHECK_NUMBER_COERCE_MARKER (end);
6288
6289   validate_region (&start, &end);
6290   from = XINT (start), to = XINT (end);
6291   from_byte = CHAR_TO_BYTE (from);
6292   to_byte = CHAR_TO_BYTE (to);
6293
6294   if (from < GPT && to >= GPT)
6295     move_gap_both (to, to_byte);
6296   /* If we an anchor byte `\0' follows the region, we include it in
6297      the detecting source.  Then code detectors can handle the tailing
6298      byte sequence more accurately.
6299
6300      Fix me: This is not a perfect solution.  It is better that we
6301      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6302   */
6303   if (to == Z || (to == GPT && GAP_SIZE > 0))
6304     include_anchor_byte = 1;
6305   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6306                                to_byte - from_byte + include_anchor_byte,
6307                                !NILP (highest),
6308                                !NILP (current_buffer
6309                                       ->enable_multibyte_characters));
6310 }
6311
6312 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6313        1, 2, 0,
6314        doc: /* Detect coding system of the text in STRING.
6315 Return a list of possible coding systems ordered by priority.
6316
6317 If only ASCII characters are found, it returns a list of single element
6318 `undecided' or its subsidiary coding system according to a detected
6319 end-of-line format.
6320
6321 If optional argument HIGHEST is non-nil, return the coding system of
6322 highest priority.  */)
6323      (string, highest)
6324      Lisp_Object string, highest;
6325 {
6326   CHECK_STRING (string);
6327
6328   return detect_coding_system (SDATA (string),
6329                                /* "+ 1" is to include the anchor byte
6330                                   `\0'.  With this, code detectors can
6331                                   handle the tailing bytes more
6332                                   accurately.  */
6333                                SBYTES (string) + 1,
6334                                !NILP (highest),
6335                                STRING_MULTIBYTE (string));
6336 }
6337
6338 /* Return an intersection of lists L1 and L2.  */
6339
6340 static Lisp_Object
6341 intersection (l1, l2)
6342      Lisp_Object l1, l2;
6343 {
6344   Lisp_Object val = Fcons (Qnil, Qnil), tail;
6345
6346   for (tail = val; CONSP (l1); l1 = XCDR (l1))
6347     {
6348       if (!NILP (Fmemq (XCAR (l1), l2)))
6349         {
6350           XSETCDR (tail, Fcons (XCAR (l1), Qnil));
6351           tail = XCDR (tail);
6352         }
6353     }
6354   return XCDR (val);
6355 }
6356
6357
6358 /*  Subroutine for Fsafe_coding_systems_region_internal.
6359
6360     Return a list of coding systems that safely encode the multibyte
6361     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
6362     possible coding systems.  If it is nil, it means that we have not
6363     yet found any coding systems.
6364
6365     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
6366     element of WORK_TABLE is set to t once the element is looked up.
6367
6368     If a non-ASCII single byte char is found, set
6369     *single_byte_char_found to 1.  */
6370
6371 static Lisp_Object
6372 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6373      unsigned char *p, *pend;
6374      Lisp_Object safe_codings, work_table;
6375      int *single_byte_char_found;
6376 {
6377   int c, len, idx;
6378   Lisp_Object val;
6379
6380   while (p < pend)
6381     {
6382       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6383       p += len;
6384       if (ASCII_BYTE_P (c))
6385         /* We can ignore ASCII characters here.  */
6386         continue;
6387       if (SINGLE_BYTE_CHAR_P (c))
6388         *single_byte_char_found = 1;
6389       if (NILP (safe_codings))
6390         continue;
6391       /* Check the safe coding systems for C.  */
6392       val = char_table_ref_and_index (work_table, c, &idx);
6393       if (EQ (val, Qt))
6394         /* This element was already checked.  Ignore it.  */
6395         continue;
6396       /* Remember that we checked this element.  */
6397       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
6398
6399       /* If there are some safe coding systems for C and we have
6400          already found the other set of coding systems for the
6401          different characters, get the intersection of them.  */
6402       if (!EQ (safe_codings, Qt) && !NILP (val))
6403         val = intersection (safe_codings, val);
6404       safe_codings = val;
6405     }
6406   return safe_codings;
6407 }
6408
6409
6410 /* Return a list of coding systems that safely encode the text between
6411    START and END.  If the text contains only ASCII or is unibyte,
6412    return t.  */
6413
6414 DEFUN ("find-coding-systems-region-internal",
6415        Ffind_coding_systems_region_internal,
6416        Sfind_coding_systems_region_internal, 2, 2, 0,
6417        doc: /* Internal use only.  */)
6418      (start, end)
6419      Lisp_Object start, end;
6420 {
6421   Lisp_Object work_table, safe_codings;
6422   int non_ascii_p = 0;
6423   int single_byte_char_found = 0;
6424   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6425
6426   if (STRINGP (start))
6427     {
6428       if (!STRING_MULTIBYTE (start))
6429         return Qt;
6430       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6431       p2 = p2end = p1end;
6432       if (SCHARS (start) != SBYTES (start))
6433         non_ascii_p = 1;
6434     }
6435   else
6436     {
6437       int from, to, stop;
6438
6439       CHECK_NUMBER_COERCE_MARKER (start);
6440       CHECK_NUMBER_COERCE_MARKER (end);
6441       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6442         args_out_of_range (start, end);
6443       if (NILP (current_buffer->enable_multibyte_characters))
6444         return Qt;
6445       from = CHAR_TO_BYTE (XINT (start));
6446       to = CHAR_TO_BYTE (XINT (end));
6447       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6448       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6449       if (stop == to)
6450         p2 = p2end = p1end;
6451       else
6452         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6453       if (XINT (end) - XINT (start) != to - from)
6454         non_ascii_p = 1;
6455     }
6456
6457   if (!non_ascii_p)
6458     {
6459       /* We are sure that the text contains no multibyte character.
6460          Check if it contains eight-bit-graphic.  */
6461       p = p1;
6462       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6463       if (p == p1end)
6464         {
6465           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6466           if (p == p2end)
6467             return Qt;
6468         }
6469     }
6470
6471   /* The text contains non-ASCII characters.  */
6472   work_table = Fcopy_sequence (Vchar_coding_system_table);
6473   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
6474                                     &single_byte_char_found);
6475   if (p2 < p2end)
6476     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6477                                       &single_byte_char_found);
6478
6479   if (EQ (safe_codings, Qt))
6480     ; /* Nothing to be done.  */
6481   else if (!single_byte_char_found)
6482     {
6483       /* Append generic coding systems.  */
6484       Lisp_Object args[2];
6485       args[0] = safe_codings;
6486       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
6487                                         make_number (0));
6488       safe_codings = Fappend (2, args);
6489     }
6490   else
6491     safe_codings = Fcons (Qraw_text,
6492                           Fcons (Qemacs_mule,
6493                                  Fcons (Qno_conversion, safe_codings)));
6494   return safe_codings;
6495 }
6496
6497
6498 /* Search from position POS for such characters that are unencodable
6499    accoding to SAFE_CHARS, and return a list of their positions.  P
6500    points where in the memory the character at POS exists.  Limit the
6501    search at PEND or when Nth unencodable characters are found.
6502
6503    If SAFE_CHARS is a char table, an element for an unencodable
6504    character is nil.
6505
6506    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6507
6508    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6509    eight-bit-graphic characters are unencodable.  */
6510
6511 static Lisp_Object
6512 unencodable_char_position (safe_chars, pos, p, pend, n)
6513      Lisp_Object safe_chars;
6514      int pos;
6515      unsigned char *p, *pend;
6516      int n;
6517 {
6518   Lisp_Object pos_list;
6519
6520   pos_list = Qnil;
6521   while (p < pend)
6522     {
6523       int len;
6524       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6525
6526       if (c >= 128
6527           && (CHAR_TABLE_P (safe_chars)
6528               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6529               : (NILP (safe_chars) || c < 256)))
6530         {
6531           pos_list = Fcons (make_number (pos), pos_list);
6532           if (--n <= 0)
6533             break;
6534         }
6535       pos++;
6536       p += len;
6537     }
6538   return Fnreverse (pos_list);
6539 }
6540
6541
6542 DEFUN ("unencodable-char-position", Funencodable_char_position,
6543        Sunencodable_char_position, 3, 5, 0,
6544        doc: /*
6545 Return position of first un-encodable character in a region.
6546 START and END specfiy the region and CODING-SYSTEM specifies the
6547 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6548
6549 If optional 4th argument COUNT is non-nil, it specifies at most how
6550 many un-encodable characters to search.  In this case, the value is a
6551 list of positions.
6552
6553 If optional 5th argument STRING is non-nil, it is a string to search
6554 for un-encodable characters.  In that case, START and END are indexes
6555 to the string.  */)
6556      (start, end, coding_system, count, string)
6557      Lisp_Object start, end, coding_system, count, string;
6558 {
6559   int n;
6560   Lisp_Object safe_chars;
6561   struct coding_system coding;
6562   Lisp_Object positions;
6563   int from, to;
6564   unsigned char *p, *pend;
6565
6566   if (NILP (string))
6567     {
6568       validate_region (&start, &end);
6569       from = XINT (start);
6570       to = XINT (end);
6571       if (NILP (current_buffer->enable_multibyte_characters))
6572         return Qnil;
6573       p = CHAR_POS_ADDR (from);
6574       if (to == GPT)
6575         pend = GPT_ADDR;
6576       else
6577         pend = CHAR_POS_ADDR (to);
6578     }
6579   else
6580     {
6581       CHECK_STRING (string);
6582       CHECK_NATNUM (start);
6583       CHECK_NATNUM (end);
6584       from = XINT (start);
6585       to = XINT (end);
6586       if (from > to
6587           || to > SCHARS (string))
6588         args_out_of_range_3 (string, start, end);
6589       if (! STRING_MULTIBYTE (string))
6590         return Qnil;
6591       p = SDATA (string) + string_char_to_byte (string, from);
6592       pend = SDATA (string) + string_char_to_byte (string, to);
6593     }
6594
6595   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6596
6597   if (NILP (count))
6598     n = 1;
6599   else
6600     {
6601       CHECK_NATNUM (count);
6602       n = XINT (count);
6603     }
6604
6605   if (coding.type == coding_type_no_conversion
6606       || coding.type == coding_type_raw_text)
6607     return Qnil;
6608
6609   if (coding.type == coding_type_undecided)
6610     safe_chars = Qnil;
6611   else
6612     safe_chars = coding_safe_chars (&coding);
6613
6614   if (STRINGP (string)
6615       || from >= GPT || to <= GPT)
6616     positions = unencodable_char_position (safe_chars, from, p, pend, n);
6617   else
6618     {
6619       Lisp_Object args[2];
6620
6621       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6622       n -= XINT (Flength (args[0]));
6623       if (n <= 0)
6624         positions = args[0];
6625       else
6626         {
6627           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6628                                                pend, n);
6629           positions = Fappend (2, args);
6630         }
6631     }
6632
6633   return  (NILP (count) ? Fcar (positions) : positions);
6634 }
6635
6636
6637 Lisp_Object
6638 code_convert_region1 (start, end, coding_system, encodep)
6639      Lisp_Object start, end, coding_system;
6640      int encodep;
6641 {
6642   struct coding_system coding;
6643   int from, to;
6644
6645   CHECK_NUMBER_COERCE_MARKER (start);
6646   CHECK_NUMBER_COERCE_MARKER (end);
6647   CHECK_SYMBOL (coding_system);
6648
6649   validate_region (&start, &end);
6650   from = XFASTINT (start);
6651   to = XFASTINT (end);
6652
6653   if (NILP (coding_system))
6654     return make_number (to - from);
6655
6656   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6657     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6658
6659   coding.mode |= CODING_MODE_LAST_BLOCK;
6660   coding.src_multibyte = coding.dst_multibyte
6661     = !NILP (current_buffer->enable_multibyte_characters);
6662   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6663                        &coding, encodep, 1);
6664   Vlast_coding_system_used = coding.symbol;
6665   return make_number (coding.produced_char);
6666 }
6667
6668 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6669        3, 3, "r\nzCoding system: ",
6670        doc: /* Decode the current region from the specified coding system.
6671 When called from a program, takes three arguments:
6672 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6673 This function sets `last-coding-system-used' to the precise coding system
6674 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6675 not fully specified.)
6676 It returns the length of the decoded text.  */)
6677      (start, end, coding_system)
6678      Lisp_Object start, end, coding_system;
6679 {
6680   return code_convert_region1 (start, end, coding_system, 0);
6681 }
6682
6683 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6684        3, 3, "r\nzCoding system: ",
6685        doc: /* Encode the current region into the specified coding system.
6686 When called from a program, takes three arguments:
6687 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6688 This function sets `last-coding-system-used' to the precise coding system
6689 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6690 not fully specified.)
6691 It returns the length of the encoded text.  */)
6692      (start, end, coding_system)
6693      Lisp_Object start, end, coding_system;
6694 {
6695   return code_convert_region1 (start, end, coding_system, 1);
6696 }
6697
6698 Lisp_Object
6699 code_convert_string1 (string, coding_system, nocopy, encodep)
6700      Lisp_Object string, coding_system, nocopy;
6701      int encodep;
6702 {
6703   struct coding_system coding;
6704
6705   CHECK_STRING (string);
6706   CHECK_SYMBOL (coding_system);
6707
6708   if (NILP (coding_system))
6709     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6710
6711   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6712     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6713
6714   coding.mode |= CODING_MODE_LAST_BLOCK;
6715   string = (encodep
6716             ? encode_coding_string (string, &coding, !NILP (nocopy))
6717             : decode_coding_string (string, &coding, !NILP (nocopy)));
6718   Vlast_coding_system_used = coding.symbol;
6719
6720   return string;
6721 }
6722
6723 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6724        2, 3, 0,
6725        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6726 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6727 if the decoding operation is trivial.
6728 This function sets `last-coding-system-used' to the precise coding system
6729 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6730 not fully specified.)  */)
6731      (string, coding_system, nocopy)
6732      Lisp_Object string, coding_system, nocopy;
6733 {
6734   return code_convert_string1 (string, coding_system, nocopy, 0);
6735 }
6736
6737 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6738        2, 3, 0,
6739        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6740 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6741 if the encoding operation is trivial.
6742 This function sets `last-coding-system-used' to the precise coding system
6743 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6744 not fully specified.)  */)
6745      (string, coding_system, nocopy)
6746      Lisp_Object string, coding_system, nocopy;
6747 {
6748   return code_convert_string1 (string, coding_system, nocopy, 1);
6749 }
6750
6751 /* Encode or decode STRING according to CODING_SYSTEM.
6752    Do not set Vlast_coding_system_used.
6753
6754    This function is called only from macros DECODE_FILE and
6755    ENCODE_FILE, thus we ignore character composition.  */
6756
6757 Lisp_Object
6758 code_convert_string_norecord (string, coding_system, encodep)
6759      Lisp_Object string, coding_system;
6760      int encodep;
6761 {
6762   struct coding_system coding;
6763
6764   CHECK_STRING (string);
6765   CHECK_SYMBOL (coding_system);
6766
6767   if (NILP (coding_system))
6768     return string;
6769
6770   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6771     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6772
6773   coding.composing = COMPOSITION_DISABLED;
6774   coding.mode |= CODING_MODE_LAST_BLOCK;
6775   return (encodep
6776           ? encode_coding_string (string, &coding, 1)
6777           : decode_coding_string (string, &coding, 1));
6778 }
6779 \f
6780 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6781        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6782 Return the corresponding character.  */)
6783      (code)
6784      Lisp_Object code;
6785 {
6786   unsigned char c1, c2, s1, s2;
6787   Lisp_Object val;
6788
6789   CHECK_NUMBER (code);
6790   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6791   if (s1 == 0)
6792     {
6793       if (s2 < 0x80)
6794         XSETFASTINT (val, s2);
6795       else if (s2 >= 0xA0 || s2 <= 0xDF)
6796         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6797       else
6798         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6799     }
6800   else
6801     {
6802       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
6803           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6804         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6805       DECODE_SJIS (s1, s2, c1, c2);
6806       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6807     }
6808   return val;
6809 }
6810
6811 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6812        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6813 Return the corresponding code in SJIS.  */)
6814      (ch)
6815      Lisp_Object ch;
6816 {
6817   int charset, c1, c2, s1, s2;
6818   Lisp_Object val;
6819
6820   CHECK_NUMBER (ch);
6821   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6822   if (charset == CHARSET_ASCII)
6823     {
6824       val = ch;
6825     }
6826   else if (charset == charset_jisx0208
6827            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6828     {
6829       ENCODE_SJIS (c1, c2, s1, s2);
6830       XSETFASTINT (val, (s1 << 8) | s2);
6831     }
6832   else if (charset == charset_katakana_jisx0201
6833            && c1 > 0x20 && c2 < 0xE0)
6834     {
6835       XSETFASTINT (val, c1 | 0x80);
6836     }
6837   else
6838     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6839   return val;
6840 }
6841
6842 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6843        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6844 Return the corresponding character.  */)
6845      (code)
6846      Lisp_Object code;
6847 {
6848   int charset;
6849   unsigned char b1, b2, c1, c2;
6850   Lisp_Object val;
6851
6852   CHECK_NUMBER (code);
6853   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6854   if (b1 == 0)
6855     {
6856       if (b2 >= 0x80)
6857         error ("Invalid BIG5 code: %x", XFASTINT (code));
6858       val = code;
6859     }
6860   else
6861     {
6862       if ((b1 < 0xA1 || b1 > 0xFE)
6863           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6864         error ("Invalid BIG5 code: %x", XFASTINT (code));
6865       DECODE_BIG5 (b1, b2, charset, c1, c2);
6866       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6867     }
6868   return val;
6869 }
6870
6871 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6872        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6873 Return the corresponding character code in Big5.  */)
6874      (ch)
6875      Lisp_Object ch;
6876 {
6877   int charset, c1, c2, b1, b2;
6878   Lisp_Object val;
6879
6880   CHECK_NUMBER (ch);
6881   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6882   if (charset == CHARSET_ASCII)
6883     {
6884       val = ch;
6885     }
6886   else if ((charset == charset_big5_1
6887             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6888            || (charset == charset_big5_2
6889                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6890     {
6891       ENCODE_BIG5 (charset, c1, c2, b1, b2);
6892       XSETFASTINT (val, (b1 << 8) | b2);
6893     }
6894   else
6895     error ("Can't encode to Big5: %d", XFASTINT (ch));
6896   return val;
6897 }
6898 \f
6899 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
6900        Sset_terminal_coding_system_internal, 1, 1, 0,
6901        doc: /* Internal use only.  */)
6902      (coding_system)
6903      Lisp_Object coding_system;
6904 {
6905   CHECK_SYMBOL (coding_system);
6906   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6907   /* We had better not send unsafe characters to terminal.  */
6908   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6909   /* Character composition should be disabled.  */
6910   terminal_coding.composing = COMPOSITION_DISABLED;
6911   /* Error notification should be suppressed.  */
6912   terminal_coding.suppress_error = 1;
6913   terminal_coding.src_multibyte = 1;
6914   terminal_coding.dst_multibyte = 0;
6915   return Qnil;
6916 }
6917
/* Lisp primitive `set-safe-terminal-coding-system-internal'.

   CODING_SYSTEM must be a symbol; it is validated by
   Fcheck_coding_system and then used to set up the global
   `safe_terminal_coding'.  After setup, composition is disabled,
   error notification is suppressed and the multibyte flags are
   forced.  Every store below targets `safe_terminal_coding'; this
   function must not touch `terminal_coding'.  Always returns nil.  */
6918 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
6919        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6920        doc: /* Internal use only.  */)
6921      (coding_system)
6922      Lisp_Object coding_system;
6923 {
6924   CHECK_SYMBOL (coding_system);
6925   setup_coding_system (Fcheck_coding_system (coding_system),
6926                        &safe_terminal_coding);
6927   /* Character composition should be disabled.  */
6928   safe_terminal_coding.composing = COMPOSITION_DISABLED;
6929   /* Error notification should be suppressed.  */
6930   safe_terminal_coding.suppress_error = 1;
       /* NOTE(review): presumably the source text is multibyte and the
          encoded result is unibyte -- confirm against the definition
          of struct coding_system.  */
6931   safe_terminal_coding.src_multibyte = 1;
6932   safe_terminal_coding.dst_multibyte = 0;
6933   return Qnil;
6934 }
6935
/* Lisp primitive `terminal-coding-system'.  Takes no arguments and
   returns the `symbol' member of the global `terminal_coding'.  */
6936 DEFUN ("terminal-coding-system", Fterminal_coding_system,
6937        Sterminal_coding_system, 0, 0, 0,
6938        doc: /* Return coding system specified for terminal output.  */)
6939      ()
6940 {
6941   return terminal_coding.symbol;
6942 }
6943
/* Lisp primitive `set-keyboard-coding-system-internal'.

   CODING_SYSTEM must be a symbol; it is validated by
   Fcheck_coding_system and then used to set up the global
   `keyboard_coding'.  Composition is disabled afterwards.  Unlike the
   terminal variants, no other field is overridden here.  Always
   returns nil.  */
6944 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
6945        Sset_keyboard_coding_system_internal, 1, 1, 0,
6946        doc: /* Internal use only.  */)
6947      (coding_system)
6948      Lisp_Object coding_system;
6949 {
6950   CHECK_SYMBOL (coding_system);
6951   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6952   /* Character composition should be disabled.  */
6953   keyboard_coding.composing = COMPOSITION_DISABLED;
6954   return Qnil;
6955 }
6956
/* Lisp primitive `keyboard-coding-system'.  Takes no arguments and
   returns the `symbol' member of the global `keyboard_coding'.  */
6957 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
6958        Skeyboard_coding_system, 0, 0, 0,
6959        doc: /* Return coding system specified for decoding keyboard input.  */)
6960      ()
6961 {
6962   return keyboard_coding.symbol;
6963 }
6964
6965 \f
/* Lisp primitive `find-operation-coding-system'.

   ARGS[0] is the operation symbol; ARGS[1..NARGS-1] are the arguments
   that were given to that operation.  The operation's `target-idx'
   property is the zero-based position of the TARGET among those
   arguments, so the target is ARGS[target-idx + 1].  The alist chosen
   for the operation is scanned in order and the first matching
   element decides the result.

   Returns a cons (DECODING . ENCODING), or nil when nothing matches
   or the matching value is unusable.  Signals an error when the
   operation is not a symbol carrying a non-negative integer
   `target-idx', when too few arguments were supplied to reach the
   target, or when the target has the wrong type.  */
6966 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6967        Sfind_operation_coding_system,  1, MANY, 0,
6968        doc: /* Choose a coding system for an operation based on the target name.
6969 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
6970 DECODING-SYSTEM is the coding system to use for decoding
6971 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
6972 for encoding (in case OPERATION does encoding).
6973
6974 The first argument OPERATION specifies an I/O primitive:
6975   For file I/O, `insert-file-contents' or `write-region'.
6976   For process I/O, `call-process', `call-process-region', or `start-process'.
6977   For network I/O, `open-network-stream'.
6978
6979 The remaining arguments should be the same arguments that were passed
6980 to the primitive.  Depending on which primitive, one of those arguments
6981 is selected as the TARGET.  For example, if OPERATION does file I/O,
6982 whichever argument specifies the file name is TARGET.
6983
6984 TARGET has a meaning which depends on OPERATION:
6985   For file I/O, TARGET is a file name.
6986   For process I/O, TARGET is a process name.
6987   For network I/O, TARGET is a service name or a port number.
6988
6989 This function looks up what is specified for TARGET in
6990 `file-coding-system-alist', `process-coding-system-alist',
6991 or `network-coding-system-alist' depending on OPERATION.
6992 They may specify a coding system, a cons of coding systems,
6993 or a function symbol to call.
6994 In the last case, we call the function with one argument,
6995 which is a list of all the arguments given to this function.
6996
6997 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
6998      (nargs, args)
6999      int nargs;
7000      Lisp_Object *args;
7001 {
7002   Lisp_Object operation, target_idx, target, val;
7003   register Lisp_Object chain;
7004
7005   if (nargs < 2)
7006     error ("Too few arguments");
7007   operation = args[0];
       /* A negative index would address memory before ARGS[1].  */
7008   if (!SYMBOLP (operation)
7009       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx))
7010       || XINT (target_idx) < 0)
7011     error ("Invalid first argument");
       /* The target lives at ARGS[target_idx + 1], so NARGS must be at
          least target_idx + 2; `<=' rejects the off-by-one case.  */
7012   if (nargs <= 1 + XINT (target_idx))
7013     error ("Too few arguments for operation: %s",
7014            SDATA (SYMBOL_NAME (operation)));
7015   /* For write-region, if the 6th argument (VISIT, write-region's 5th
7016      argument) is a string, it is used as the target file name.  */
7017   if (EQ (operation, Qwrite_region)
7018       && nargs > 5
7019       && STRINGP (args[5]))
7020     target_idx = make_number (4);
7021   target = args[XINT (target_idx) + 1];
       /* Only open-network-stream may name its target by an integer
          (a port number); every other operation needs a string.  */
7022   if (!(STRINGP (target)
7023         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7024     error ("Invalid argument %d", XINT (target_idx) + 1);
7025
       /* Pick the alist that governs this kind of operation; anything
          that is neither file nor network I/O uses the process alist.  */
7026   chain = ((EQ (operation, Qinsert_file_contents)
7027             || EQ (operation, Qwrite_region))
7028            ? Vfile_coding_system_alist
7029            : (EQ (operation, Qopen_network_stream)
7030               ? Vnetwork_coding_system_alist
7031               : Vprocess_coding_system_alist));
7032   if (NILP (chain))
7033     return Qnil;
7034
7035   for (; CONSP (chain); chain = XCDR (chain))
7036     {
7037       Lisp_Object elt;
7038       elt = XCAR (chain);
7039
           /* A string key is matched against a string target with
              fast_string_match; an integer target must be EQ to the
              key.  */
7040       if (CONSP (elt)
7041           && ((STRINGP (target)
7042                && STRINGP (XCAR (elt))
7043                && fast_string_match (XCAR (elt), target) >= 0)
7044               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7045         {
7046           val = XCDR (elt);
7047           /* Here, if VAL is both a valid coding system and a valid
7048              function symbol, we return VAL as a coding system.  */
7049           if (CONSP (val))
7050             return val;
7051           if (! SYMBOLP (val))
7052             return Qnil;
7053           if (! NILP (Fcoding_system_p (val)))
7054             return Fcons (val, val);
7055           if (! NILP (Ffboundp (val)))
7056             {
                   /* The function receives every argument of this
                      call, OPERATION included, as a single list.  */
7057               val = call1 (val, Flist (nargs, args));
7058               if (CONSP (val))
7059                 return val;
7060               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7061                 return Fcons (val, val);
7062             }
               /* The first matching element is final, even when its
                  value turned out to be unusable.  */
7063           return Qnil;
7064         }
7065     }
7066   return Qnil;
7067 }
7068
/* Lisp primitive `update-coding-systems-internal'.

   Walks the coding categories from CODING_CATEGORY_IDX_EMACS_MULE up
   to (not including) CODING_CATEGORY_IDX_MAX and synchronizes
   `coding_system_table' with the current value of each category
   symbol held in `Vcoding_category_table': a non-nil value gets a
   (lazily allocated) coding_system structure set up for it, a nil
   value gets its entry freed and cleared.  Always returns nil.  */
7069 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7070        Supdate_coding_systems_internal, 0, 0, 0,
7071        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7072 When values of any coding categories are changed, you must
7073 call this function.  */)
7074      ()
7075 {
7076   int cat;
7077
7078   for (cat = CODING_CATEGORY_IDX_EMACS_MULE;
7079        cat < CODING_CATEGORY_IDX_MAX; cat++)
7080     {
7081       Lisp_Object bound;
7082
7083       bound = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[cat]);
7084       if (NILP (bound))
7085         {
               /* Nothing is bound to this category: drop a stale entry.  */
7086           if (coding_system_table[cat])
7087             xfree (coding_system_table[cat]);
7088           coding_system_table[cat] = NULL;
7089           continue;
7090         }
           /* Allocate the entry on first use, then (re)initialize it.  */
7091       if (coding_system_table[cat] == NULL)
7092         coding_system_table[cat]
7093           = (struct coding_system *) xmalloc (sizeof (struct coding_system));
7094       setup_coding_system (bound, coding_system_table[cat]);
7095     }
7096
7097   return Qnil;
7098 }
7099
/* Lisp primitive `set-coding-priority-internal'.

   Rebuilds `coding_priorities' from `Vcoding_category_list': the Nth
   list element contributes the bit mask (1 << INDEX), where INDEX is
   that symbol's `coding-category-index' property.  Scanning stops at
   the first element that is not a symbol or whose property is not an
   integer in [0, CODING_CATEGORY_IDX_MAX); the property is validated
   before it is used as a shift count.  Any slots left unfilled are
   set to CODING_CATEGORY_MASK_RAW_TEXT.  Always returns nil.  */
7100 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7101        Sset_coding_priority_internal, 0, 0, 0,
7102        doc: /* Update internal database for the current value of `coding-category-list'.
7103 This function is internal use only.  */)
7104      ()
7105 {
7106   int i = 0;
7107   Lisp_Object val, idx;
7108
7109   val = Vcoding_category_list;
7110   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7111     {
7112       if (! SYMBOLP (XCAR (val)))
7113         break;
           /* The property may be missing (nil) or set to anything by
              Lisp code, so check type and both bounds before shifting.  */
7114       idx = Fget (XCAR (val), Qcoding_category_index);
7115       if (! INTEGERP (idx)
7116           || XINT (idx) < 0 || XINT (idx) >= CODING_CATEGORY_IDX_MAX)
7117         break;
7118       coding_priorities[i++] = (1 << XINT (idx));
7119       val = XCDR (val);
7120     }
7121   /* If coding-category-list is valid and contains all coding
7122      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7123      the following code saves Emacs from crashing.  */
7124   while (i < CODING_CATEGORY_IDX_MAX)
7125     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7126
7127   return Qnil;
7128 }
7129
7130 #endif /* emacs */
7131
7132 \f
7133 /*** 9. Post-amble ***/
7134
/* Initialize the static tables and global coding structures of this
   file: the byte classification tables `emacs_code_class' and
   `iso_code_class', the four global coding systems (set up with Qnil),
   `coding_system_table', `ascii_skip_code', `system_eol_type' and
   `inhibit_pre_post_conversion'.

   The table setup is order-dependent: the range loops run first and
   individual entries are overwritten afterwards.  */
7135 void
7136 init_coding_once ()
7137 {
7138   int i;
7139
7140   /* Emacs' internal format specific initialize routine.  */
       /* Bytes 0x00..0x20 (inclusive, so SPC too) start out as control
          codes; LF and CR are then given their own classes.  */
7141   for (i = 0; i <= 0x20; i++)
7142     emacs_code_class[i] = EMACS_control_code;
7143   emacs_code_class[0x0A] = EMACS_linefeed_code;
7144   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7145   for (i = 0x21 ; i < 0x7F; i++)
7146     emacs_code_class[i] = EMACS_ascii_code;
7147   emacs_code_class[0x7F] = EMACS_control_code;
       /* 0x80..0xFE default to invalid; the four private leading codes
          are overridden just below.
          NOTE(review): index 0xFF is not written by this loop; unless
          one of the LEADING_CODE_PRIVATE_* constants is 0xFF it keeps
          its previous contents -- confirm this is intended.  */
7148   for (i = 0x80; i < 0xFF; i++)
7149     emacs_code_class[i] = EMACS_invalid_code;
7150   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7151   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7152   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7153   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7154
7155   /* ISO2022 specific initialize routine.  */
       /* The four loops deliberately skip 0x20, 0x7F, 0xA0 and 0xFF;
          those bytes get their dedicated classes right after.  */
7156   for (i = 0; i < 0x20; i++)
7157     iso_code_class[i] = ISO_control_0;
7158   for (i = 0x21; i < 0x7F; i++)
7159     iso_code_class[i] = ISO_graphic_plane_0;
7160   for (i = 0x80; i < 0xA0; i++)
7161     iso_code_class[i] = ISO_control_1;
7162   for (i = 0xA1; i < 0xFF; i++)
7163     iso_code_class[i] = ISO_graphic_plane_1;
7164   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7165   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
       /* Entries for the individual ISO_CODE_* bytes are stored last,
          so they replace whatever class the loops above assigned.  */
7166   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7167   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7168   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7169   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7170   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7171   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7172   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7173   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7174
       /* Set up each global coding structure with a nil coding
          system.  */
7175   setup_coding_system (Qnil, &keyboard_coding);
7176   setup_coding_system (Qnil, &terminal_coding);
7177   setup_coding_system (Qnil, &safe_terminal_coding);
7178   setup_coding_system (Qnil, &default_buffer_file_coding);
7179
       /* Zero every entry of coding_system_table.  */
7180   bzero (coding_system_table, sizeof coding_system_table);
7181
       /* ascii_skip_code is 1 for bytes 0..127 and 0 for the rest.  */
7182   bzero (ascii_skip_code, sizeof ascii_skip_code);
7183   for (i = 0; i < 128; i++)
7184     ascii_skip_code[i] = 1;
7185
       /* CRLF on MSDOS / WINDOWSNT builds, LF everywhere else.  */
7186 #if defined (MSDOS) || defined (WINDOWSNT)
7187   system_eol_type = CODING_EOL_CRLF;
7188 #else
7189   system_eol_type = CODING_EOL_LF;
7190 #endif
7191
7192   inhibit_pre_post_conversion = 0;
7193 }
7194
7195 #ifdef emacs
7196
/* Register everything this file exports to Lisp:
     - intern and staticpro the Q* symbols used by the coding code;
     - record, as the `target-idx' property of each I/O primitive's
       symbol, which of its arguments is the TARGET looked up by
       `find-operation-coding-system';
     - build `Vcoding_category_table' and give each category symbol
       its `coding-category-index';
     - defsubr the primitives defined in this file;
     - DEFVAR the Lisp-visible variables and give them initial
       values.  */
7197 void
7198 syms_of_coding ()
7199 {
7200   Qtarget_idx = intern ("target-idx");
7201   staticpro (&Qtarget_idx);
7202
7203   Qcoding_system_history = intern ("coding-system-history");
7204   staticpro (&Qcoding_system_history);
7205   Fset (Qcoding_system_history, Qnil);
7206
       /* The `target-idx' values below are zero-based positions within
          the primitive's own argument list.  */
7207   /* Target FILENAME is the first argument.  */
7208   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7209   /* Target FILENAME is the third argument.  */
7210   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7211
7212   Qcall_process = intern ("call-process");
7213   staticpro (&Qcall_process);
7214   /* Target PROGRAM is the first argument.  */
7215   Fput (Qcall_process, Qtarget_idx, make_number (0));
7216
7217   Qcall_process_region = intern ("call-process-region");
7218   staticpro (&Qcall_process_region);
7219   /* Target PROGRAM is the third argument.  */
7220   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7221
7222   Qstart_process = intern ("start-process");
7223   staticpro (&Qstart_process);
7224   /* Target PROGRAM is the third argument.  */
7225   Fput (Qstart_process, Qtarget_idx, make_number (2));
7226
7227   Qopen_network_stream = intern ("open-network-stream");
7228   staticpro (&Qopen_network_stream);
7229   /* Target SERVICE is the fourth argument.  */
7230   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7231
7232   Qcoding_system = intern ("coding-system");
7233   staticpro (&Qcoding_system);
7234
7235   Qeol_type = intern ("eol-type");
7236   staticpro (&Qeol_type);
7237
7238   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7239   staticpro (&Qbuffer_file_coding_system);
7240
7241   Qpost_read_conversion = intern ("post-read-conversion");
7242   staticpro (&Qpost_read_conversion);
7243
7244   Qpre_write_conversion = intern ("pre-write-conversion");
7245   staticpro (&Qpre_write_conversion);
7246
7247   Qno_conversion = intern ("no-conversion");
7248   staticpro (&Qno_conversion);
7249
7250   Qundecided = intern ("undecided");
7251   staticpro (&Qundecided);
7252
7253   Qcoding_system_p = intern ("coding-system-p");
7254   staticpro (&Qcoding_system_p);
7255
7256   Qcoding_system_error = intern ("coding-system-error");
7257   staticpro (&Qcoding_system_error);
7258
       /* Define `coding-system-error' as an error condition whose
          parent is `error'.  */
7259   Fput (Qcoding_system_error, Qerror_conditions,
7260         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7261   Fput (Qcoding_system_error, Qerror_message,
7262         build_string ("Invalid coding system"));
7263
7264   Qcoding_category = intern ("coding-category");
7265   staticpro (&Qcoding_category);
7266   Qcoding_category_index = intern ("coding-category-index");
7267   staticpro (&Qcoding_category_index);
7268
       /* One symbol per coding category, named after
          coding_category_name[i]; each symbol records its own index in
          its `coding-category-index' property.  */
7269   Vcoding_category_table
7270     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7271   staticpro (&Vcoding_category_table);
7272   {
7273     int i;
7274     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7275       {
7276         XVECTOR (Vcoding_category_table)->contents[i]
7277           = intern (coding_category_name[i]);
7278         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7279               Qcoding_category_index, make_number (i));
7280       }
7281   }
7282
7283   Qtranslation_table = intern ("translation-table");
7284   staticpro (&Qtranslation_table);
7285   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7286
7287   Qtranslation_table_id = intern ("translation-table-id");
7288   staticpro (&Qtranslation_table_id);
7289
7290   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7291   staticpro (&Qtranslation_table_for_decode);
7292
7293   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7294   staticpro (&Qtranslation_table_for_encode);
7295
7296   Qsafe_chars = intern ("safe-chars");
7297   staticpro (&Qsafe_chars);
7298
7299   Qchar_coding_system = intern ("char-coding-system");
7300   staticpro (&Qchar_coding_system);
7301
7302   /* Intern this now in case it isn't already done.
7303      Setting this variable twice is harmless.
7304      But don't staticpro it here--that is done in alloc.c.  */
7305   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7306   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7307   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7308
7309   Qvalid_codes = intern ("valid-codes");
7310   staticpro (&Qvalid_codes);
7311
7312   Qemacs_mule = intern ("emacs-mule");
7313   staticpro (&Qemacs_mule);
7314
7315   Qraw_text = intern ("raw-text");
7316   staticpro (&Qraw_text);
7317
       /* Make the primitives defined in this file callable from Lisp.  */
7318   defsubr (&Scoding_system_p);
7319   defsubr (&Sread_coding_system);
7320   defsubr (&Sread_non_nil_coding_system);
7321   defsubr (&Scheck_coding_system);
7322   defsubr (&Sdetect_coding_region);
7323   defsubr (&Sdetect_coding_string);
7324   defsubr (&Sfind_coding_systems_region_internal);
7325   defsubr (&Sunencodable_char_position);
7326   defsubr (&Sdecode_coding_region);
7327   defsubr (&Sencode_coding_region);
7328   defsubr (&Sdecode_coding_string);
7329   defsubr (&Sencode_coding_string);
7330   defsubr (&Sdecode_sjis_char);
7331   defsubr (&Sencode_sjis_char);
7332   defsubr (&Sdecode_big5_char);
7333   defsubr (&Sencode_big5_char);
7334   defsubr (&Sset_terminal_coding_system_internal);
7335   defsubr (&Sset_safe_terminal_coding_system_internal);
7336   defsubr (&Sterminal_coding_system);
7337   defsubr (&Sset_keyboard_coding_system_internal);
7338   defsubr (&Skeyboard_coding_system);
7339   defsubr (&Sfind_operation_coding_system);
7340   defsubr (&Supdate_coding_systems_internal);
7341   defsubr (&Sset_coding_priority_internal);
7342
       /* Lisp-visible variables and their initial values follow.  */
7343   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7344                doc: /* List of coding systems.
7345
7346 Do not alter the value of this variable manually.  This variable should be
7347 updated by the functions `make-coding-system' and
7348 `define-coding-system-alias'.  */);
7349   Vcoding_system_list = Qnil;
7350
7351   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7352                doc: /* Alist of coding system names.
7353 Each element is one element list of coding system name.
7354 This variable is given to `completing-read' as TABLE argument.
7355
7356 Do not alter the value of this variable manually.  This variable should be
7357 updated by the functions `make-coding-system' and
7358 `define-coding-system-alias'.  */);
7359   Vcoding_system_alist = Qnil;
7360
7361   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7362                doc: /* List of coding-categories (symbols) ordered by priority.
7363
7364 On detecting a coding system, Emacs tries code detection algorithms
7365 associated with each coding-category one by one in this order.  When
7366 one algorithm agrees with a byte sequence of source text, the coding
7367 system bound to the corresponding coding-category is selected.  */);
       /* Consing from the last index down to 0 leaves the list in
          ascending index order.  */
7368   {
7369     int i;
7370
7371     Vcoding_category_list = Qnil;
7372     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7373       Vcoding_category_list
7374         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7375                  Vcoding_category_list);
7376   }
7377
7378   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7379                doc: /* Specify the coding system for read operations.
7380 It is useful to bind this variable with `let', but do not set it globally.
7381 If the value is a coding system, it is used for decoding on read operation.
7382 If not, an appropriate element is used from one of the coding system alists:
7383 There are three such tables, `file-coding-system-alist',
7384 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7385   Vcoding_system_for_read = Qnil;
7386
7387   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7388                doc: /* Specify the coding system for write operations.
7389 Programs bind this variable with `let', but you should not set it globally.
7390 If the value is a coding system, it is used for encoding of output,
7391 when writing it to a file and when sending it to a file or subprocess.
7392
7393 If this does not specify a coding system, an appropriate element
7394 is used from one of the coding system alists:
7395 There are three such tables, `file-coding-system-alist',
7396 `process-coding-system-alist', and `network-coding-system-alist'.
7397 For output to files, if the above procedure does not specify a coding system,
7398 the value of `buffer-file-coding-system' is used.  */);
7399   Vcoding_system_for_write = Qnil;
7400
7401   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7402                doc: /* Coding system used in the latest file or process I/O.  */);
7403   Vlast_coding_system_used = Qnil;
7404
7405   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7406                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7407 See info node `Coding Systems' and info node `Text and Binary' concerning
7408 such conversion.  */);
7409   inhibit_eol_conversion = 0;
7410
7411   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7412                doc: /* Non-nil means process buffer inherits coding system of process output.
7413 Bind it to t if the process output is to be treated as if it were a file
7414 read from some filesystem.  */);
7415   inherit_process_coding_system = 0;
7416
7417   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7418                doc: /* Alist to decide a coding system to use for a file I/O operation.
7419 The format is ((PATTERN . VAL) ...),
7420 where PATTERN is a regular expression matching a file name,
7421 VAL is a coding system, a cons of coding systems, or a function symbol.
7422 If VAL is a coding system, it is used for both decoding and encoding
7423 the file contents.
7424 If VAL is a cons of coding systems, the car part is used for decoding,
7425 and the cdr part is used for encoding.
7426 If VAL is a function symbol, the function must return a coding system
7427 or a cons of coding systems which are used as above.  The function gets
7428 the arguments with which `find-operation-coding-system' was called.
7429
7430 See also the function `find-operation-coding-system'
7431 and the variable `auto-coding-alist'.  */);
7432   Vfile_coding_system_alist = Qnil;
7433
7434   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7435     doc: /* Alist to decide a coding system to use for a process I/O operation.
7436 The format is ((PATTERN . VAL) ...),
7437 where PATTERN is a regular expression matching a program name,
7438 VAL is a coding system, a cons of coding systems, or a function symbol.
7439 If VAL is a coding system, it is used for both decoding what received
7440 from the program and encoding what sent to the program.
7441 If VAL is a cons of coding systems, the car part is used for decoding,
7442 and the cdr part is used for encoding.
7443 If VAL is a function symbol, the function must return a coding system
7444 or a cons of coding systems which are used as above.
7445
7446 See also the function `find-operation-coding-system'.  */);
7447   Vprocess_coding_system_alist = Qnil;
7448
7449   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7450     doc: /* Alist to decide a coding system to use for a network I/O operation.
7451 The format is ((PATTERN . VAL) ...),
7452 where PATTERN is a regular expression matching a network service name
7453 or is a port number to connect to,
7454 VAL is a coding system, a cons of coding systems, or a function symbol.
7455 If VAL is a coding system, it is used for both decoding what received
7456 from the network stream and encoding what sent to the network stream.
7457 If VAL is a cons of coding systems, the car part is used for decoding,
7458 and the cdr part is used for encoding.
7459 If VAL is a function symbol, the function must return a coding system
7460 or a cons of coding systems which are used as above.
7461
7462 See also the function `find-operation-coding-system'.  */);
7463   Vnetwork_coding_system_alist = Qnil;
7464
7465   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7466                doc: /* Coding system to use with system messages.
7467 Also used for decoding keyboard input on X Window system.  */);
7468   Vlocale_coding_system = Qnil;
7469
7470   /* The eol mnemonics are reset in startup.el system-dependently.  */
7471   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7472                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7473   eol_mnemonic_unix = build_string (":");
7474
7475   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7476                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7477   eol_mnemonic_dos = build_string ("\\");
7478
7479   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7480                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7481   eol_mnemonic_mac = build_string ("/");
7482
7483   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7484                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7485   eol_mnemonic_undecided = build_string (":");
7486
7487   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7488                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7489   Venable_character_translation = Qt;
7490
7491   DEFVAR_LISP ("standard-translation-table-for-decode",
7492                &Vstandard_translation_table_for_decode,
7493                doc: /* Table for translating characters while decoding.  */);
7494   Vstandard_translation_table_for_decode = Qnil;
7495
7496   DEFVAR_LISP ("standard-translation-table-for-encode",
7497                &Vstandard_translation_table_for_encode,
7498                doc: /* Table for translating characters while encoding.  */);
7499   Vstandard_translation_table_for_encode = Qnil;
7500
       /* NOTE(review): the Lisp name ends in `-table' while the C
          variable and the doc string call it an alist -- confirm the
          naming mismatch is intentional.  */
7501   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7502                doc: /* Alist of charsets vs revision numbers.
7503 While encoding, if a charset (car part of an element) is found,
7504 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7505   Vcharset_revision_alist = Qnil;
7506
7507   DEFVAR_LISP ("default-process-coding-system",
7508                &Vdefault_process_coding_system,
7509                doc: /* Cons of coding systems used for process I/O by default.
7510 The car part is used for decoding a process output,
7511 the cdr part is used for encoding a text to be sent to a process.  */);
7512   Vdefault_process_coding_system = Qnil;
7513
7514   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7515                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7516 This is a vector of length 256.
7517 If Nth element is non-nil, the existence of code N in a file
7518 \(or output of subprocess) doesn't prevent it to be detected as
7519 a coding system of ISO 2022 variant which has a flag
7520 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7521 or reading output of a subprocess.
7522 Only 128th through 159th elements has a meaning.  */);
7523   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7524
       /* NOTE(review): the doc string names `select-safe-coding-system'
          as the default, but the value stored here is nil; presumably
          Lisp code installs the documented default later -- confirm.  */
7525   DEFVAR_LISP ("select-safe-coding-system-function",
7526                &Vselect_safe_coding_system_function,
7527                doc: /* Function to call to select safe coding system for encoding a text.
7528
7529 If set, this function is called to force a user to select a proper
7530 coding system which can encode the text in the case that a default
7531 coding system used in each operation can't encode the text.
7532
7533 The default value is `select-safe-coding-system' (which see).  */);
7534   Vselect_safe_coding_system_function = Qnil;
7535
7536   DEFVAR_BOOL ("coding-system-require-warning",
7537                &coding_system_require_warning,
7538                doc: /* Internal use only.
7539 If non-nil, on writing a file, select-safe-coding-system-function is
7540 called even if coding-system-for-write is non-nil.  The command
7541 universal-coding-system-argument binds this variable to t temporarily.  */);
7542   coding_system_require_warning = 0;
7543
7544
7545   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
7546                doc: /* Char-table containing safe coding systems of each characters.
7547 Each element doesn't include such generic coding systems that can
7548 encode any characters.  They are in the first extra slot.  */);
7549   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
7550
7551   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7552                &inhibit_iso_escape_detection,
7553                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7554
7555 By default, on reading a file, Emacs tries to detect how the text is
7556 encoded.  This code detection is sensitive to escape sequences.  If
7557 the sequence is valid as ISO2022, the code is determined as one of
7558 the ISO2022 encodings, and the file is decoded by the corresponding
7559 coding system (e.g. `iso-2022-7bit').
7560
7561 However, there may be a case that you want to read escape sequences in
7562 a file as is.  In such a case, you can set this variable to non-nil.
7563 Then, as the code detection ignores any escape sequences, no file is
7564 detected as encoded in some ISO2022 encoding.  The result is that all
7565 escape sequences become visible in a buffer.
7566
7567 The default value is nil, and it is strongly recommended not to change
7568 it.  That is because many Emacs Lisp source files that contain
7569 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7570 in Emacs's distribution, and they won't be decoded correctly on
7571 reading if you suppress escape sequence detection.
7572
7573 The other way to read escape sequences in a file without decoding is
7574 to explicitly specify some coding system that doesn't use ISO2022's
7575 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
7576   inhibit_iso_escape_detection = 0;
7577
7578   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7579                doc: /* Char table for translating self-inserting characters.
7580 This is applied to the result of input methods, not their input.  See also
7581 `keyboard-translate-table'.  */);
7582     Vtranslation_table_for_input = Qnil;
7583 }
7584
/* Return the message text for the system error code ERROR_NUMBER.

   synchronize_system_messages_locale is called first, then strerror
   supplies the text.  When `Vlocale_coding_system' is non-nil the
   text is passed through code_convert_string_norecord with that
   coding system and the data of the resulting Lisp string is
   returned; otherwise the pointer from strerror is returned as is.

   NOTE(review): in the converted case the returned pointer refers to
   the contents of a Lisp string that has no other reference visible
   here; confirm that callers use it before anything could invalidate
   it (e.g. garbage collection).  */
7585 char *
7586 emacs_strerror (error_number)
7587      int error_number;
7588 {
7589   char *str;
7590
7591   synchronize_system_messages_locale ();
7592   str = strerror (error_number);
7593
7594   if (! NILP (Vlocale_coding_system))
7595     {
           /* NOTE(review): the final 0 presumably selects decoding
              rather than encoding -- confirm against
              code_convert_string_norecord.  */
7596       Lisp_Object dec = code_convert_string_norecord (build_string (str),
7597                                                       Vlocale_coding_system,
7598                                                       0);
7599       str = (char *) SDATA (dec);
7600     }
7601
7602   return str;
7603 }
7604
7605 #endif /* emacs */
7606