src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7
   8 This file is part of GNU Emacs.
   9
  10 GNU Emacs is free software; you can redistribute it and/or modify
  11 it under the terms of the GNU General Public License as published by
  12 the Free Software Foundation; either version 2, or (at your option)
  13 any later version.
  14
  15 GNU Emacs is distributed in the hope that it will be useful,
  16 but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 GNU General Public License for more details.
  19
  20 You should have received a copy of the GNU General Public License
  21 along with GNU Emacs; see the file COPYING.  If not, write to
  22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  23 Boston, MA 02110-1301, USA.  */
  24
  25 /*** TABLE OF CONTENTS ***
  26
  27   0. General comments
  28   1. Preamble
  29   2. Emacs' internal format (emacs-mule) handlers
  30   3. ISO2022 handlers
  31   4. Shift-JIS and BIG5 handlers
  32   5. CCL handlers
  33   6. End-of-line handlers
  34   7. C library functions
  35   8. Emacs Lisp library functions
  36   9. Post-amble
  37
  38 */
  39
  40 /*** 0. General comments ***/
  41
  42
  43 /*** GENERAL NOTE on CODING SYSTEMS ***
  44
  45   A coding system is an encoding mechanism for one or more character
  46   sets.  Here's a list of coding systems which Emacs can handle.  When
  47   we say "decode", it means converting some other coding system to
  48   Emacs' internal format (emacs-mule), and when we say "encode",
  49   it means converting the coding system emacs-mule to some other
  50   coding system.
  51
  52   0. Emacs' internal format (emacs-mule)
  53
  54   Emacs itself holds a multi-lingual character in buffers and strings
  55   in a special format.  Details are described in section 2.
  56
  57   1. ISO2022
  58
  59   The most famous coding system for multiple character sets.  X's
  60   Compound Text, various EUCs (Extended Unix Code), and coding
  61   systems used in Internet communication such as ISO-2022-JP are
  62   all variants of ISO2022.  Details are described in section 3.
  63
  64   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  65
  66   A coding system to encode character sets: ASCII, JISX0201, and
  67   JISX0208.  Widely used for PC's in Japan.  Details are described in
  68   section 4.
  69
  70   3. BIG5
  71
  72   A coding system to encode the character sets ASCII and Big5.  Widely
  73   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  74   described in section 4.  In this file, when we write "BIG5"
  75   (all uppercase), we mean the coding system, and when we write
  76   "Big5" (capitalized), we mean the character set.
  77
  78   4. Raw text
  79
  80   A coding system for text containing random 8-bit code.  Emacs does
  81   no code conversion on such text except for end-of-line format.
  82
  83   5. Other
  84
  85   If a user wants to read/write text encoded in a coding system not
  86   listed above, he can supply a decoder and an encoder for it as CCL
  87   (Code Conversion Language) programs.  Emacs executes the CCL program
  88   while reading/writing.
  89
  90   Emacs represents a coding system by a Lisp symbol that has a property
  91   `coding-system'.  But, before actually using the coding system, the
  92   information about it is set in a structure of type `struct
  93   coding_system' for rapid processing.  See section 6 for more details.
  94
  95 */
  96
  97 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  98
  99   How end-of-line of text is encoded depends on the operating system.
 100   For instance, Unix's format is just one byte of `line-feed' code,
 101   whereas DOS's format is two-byte sequence of `carriage-return' and
 102   `line-feed' codes.  MacOS's format is usually one byte of
 103   `carriage-return'.
 104
 105   Since text character encoding and end-of-line encoding are
 106   independent, any coding system described above can have any
 107   end-of-line format.  So Emacs has information about end-of-line
 108   format in each coding-system.  See section 6 for more details.
 109
 110 */
 111
 112 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 113
 114   These functions check if a text between SRC and SRC_END is encoded
 115   in the coding system category XXX.  Each returns an integer value in
 116   which appropriate flag bits for the category XXX are set.  The flag
 117   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 118   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 119   of the range 0x80..0x9F are in multibyte form.  */
 120 #if 0
 121 int
 122 detect_coding_emacs_mule (src, src_end, multibytep)
 123      unsigned char *src, *src_end;
 124      int multibytep;
 125 {
 126   ...
 127 }
 128 #endif
 129
 130 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 131
 132   These functions decode SRC_BYTES length of unibyte text at SOURCE
 133   encoded in CODING to Emacs' internal format.  The resulting
 134   multibyte text goes to a place pointed to by DESTINATION, the length
 135   of which should not exceed DST_BYTES.
 136
 137   These functions set the information about original and decoded texts
 138   in the members `produced', `produced_char', `consumed', and
 139   `consumed_char' of the structure *CODING.  They also set the member
 140   `result' to one of CODING_FINISH_XXX indicating how the decoding
 141   finished.
 142
 143   DST_BYTES zero means that the source area and destination area are
 144   overlapped, which means that we can produce a decoded text until it
 145   reaches the head of the not-yet-decoded source text.
 146
 147   Below is a template for these functions.  */
 148 #if 0
 149 static void
 150 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 151      struct coding_system *coding;
 152      const unsigned char *source;
 153      unsigned char *destination;
 154      int src_bytes, dst_bytes;
 155 {
 156   ...
 157 }
 158 #endif
 159
 160 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 161
 162   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 163   internal multibyte format to CODING.  The resulting unibyte text
 164   goes to a place pointed to by DESTINATION, the length of which
 165   should not exceed DST_BYTES.
 166
 167   These functions set the information about original and encoded texts
 168   in the members `produced', `produced_char', `consumed', and
 169   `consumed_char' of the structure *CODING.  They also set the member
 170   `result' to one of CODING_FINISH_XXX indicating how the encoding
 171   finished.
 172
 173   DST_BYTES zero means that the source area and destination area are
 174   overlapped, which means that we can produce encoded text until it
  175   reaches the head of the not-yet-encoded source text.
 176
 177   Below is a template for these functions.  */
 178 #if 0
 179 static void
 180 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 181      struct coding_system *coding;
 182      unsigned char *source, *destination;
 183      int src_bytes, dst_bytes;
 184 {
 185   ...
 186 }
 187 #endif
 188
 189 /*** COMMONLY USED MACROS ***/
 190
  191 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  192    get one and two bytes from the source text respectively.
 193    If there are not enough bytes in the source, they jump to
 194    `label_end_of_loop'.  The caller should set variables `coding',
 195    `src' and `src_end' to appropriate pointer in advance.  These
 196    macros are called from decoding routines `decode_coding_XXX', thus
 197    it is assumed that the source text is unibyte.  */
 198
 199 #define ONE_MORE_BYTE(c1)                                       \
 200   do {                                                          \
 201     if (src >= src_end)                                         \
 202       {                                                         \
 203         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 204         goto label_end_of_loop;                                 \
 205       }                                                         \
 206     c1 = *src++;                                                \
 207   } while (0)
 208
 209 #define TWO_MORE_BYTES(c1, c2)                                  \
 210   do {                                                          \
 211     if (src + 1 >= src_end)                                     \
 212       {                                                         \
 213         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 214         goto label_end_of_loop;                                 \
 215       }                                                         \
 216     c1 = *src++;                                                \
 217     c2 = *src++;                                                \
 218   } while (0)
 219
 220
 221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 222    form if MULTIBYTEP is nonzero.  */
 223
 224 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 225   do {                                                          \
 226     if (src >= src_end)                                         \
 227       {                                                         \
 228         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 229         goto label_end_of_loop;                                 \
 230       }                                                         \
 231     c1 = *src++;                                                \
 232     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 233       c1 = *src++ - 0x20;                                       \
 234   } while (0)
 235
  236 /* Set C to the next character at the source text pointed by `src'
  237    and advance `src' past it.  If no bytes are left in the source,
         set coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
  238    `label_end_of_loop'.  The caller should set variables `coding',
  239    `src', `src_end', and `translation_table' to appropriate values
  240    in advance.  This macro is used in encoding routines
  241    `encode_coding_XXX', thus it assumes that the source text is in
  242    multibyte form except for 8-bit characters.  8-bit characters are
  243    in multibyte form if coding->src_multibyte is nonzero, else they
  244    are represented by a single byte.  If `translation_table' is
         non-nil, C is translated through it with translate_char before
         `src' is advanced.  */
  245
  246 #define ONE_MORE_CHAR(c)                                        \
  247   do {                                                          \
  248     int len = src_end - src;                                    \
  249     int bytes;                                                  \
  250     if (len <= 0)                                               \
  251       {                                                         \
  252         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  253         goto label_end_of_loop;                                 \
  254       }                                                         \
  255     if (coding->src_multibyte                                   \
  256         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
  257       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
  258     else                                                        \
  259       c = *src, bytes = 1;                                      \
  260     if (!NILP (translation_table))                              \
  261       c = translate_char (translation_table, c, -1, 0, 0);      \
  262     src += bytes;                                               \
  263   } while (0)
 264
 265
  266 /* Produce a multibyte form of character C to `dst'.  Set
  267    coding->result to CODING_FINISH_INSUFFICIENT_DST and jump to
         `label_end_of_loop' if there's not enough space at `dst'.  The
         space ends at `dst_end' if `dst_bytes' is nonzero; otherwise (the
         source and destination areas overlap) it ends at `src'.
  268
  269    If we are now in the middle of a composition sequence, the decoded
  270    character may be ALTCHAR (for the current composition).  In that
  271    case, the character goes to coding->cmp_data->data instead of
  272    `dst'.  More precisely, C is written to `dst' and counted in
         coding->produced_char unless COMPOSING_P (coding) holds with a
         method other than COMPOSITION_RELATIVE and COMPOSITION_WITH_RULE.
         C is recorded as a composition component whenever
         COMPOSING_P (coding) holds with a method other than
         COMPOSITION_RELATIVE; coding->composition_rule_follows is then
         set to nonzero unless the method is COMPOSITION_WITH_ALTCHARS.
  273
  274    This macro is used in decoding routines.  */
  275
  276 #define EMIT_CHAR(c)                                                    \
  277   do {                                                                  \
  278     if (! COMPOSING_P (coding)                                          \
  279         || coding->composing == COMPOSITION_RELATIVE                    \
  280         || coding->composing == COMPOSITION_WITH_RULE)                  \
  281       {                                                                 \
  282         int bytes = CHAR_BYTES (c);                                     \
  283         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
  284           {                                                             \
  285             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
  286             goto label_end_of_loop;                                     \
  287           }                                                             \
  288         dst += CHAR_STRING (c, dst);                                    \
  289         coding->produced_char++;                                        \
  290       }                                                                 \
  291                                                                         \
  292     if (COMPOSING_P (coding)                                            \
  293         && coding->composing != COMPOSITION_RELATIVE)                   \
  294       {                                                                 \
  295         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
  296         coding->composition_rule_follows                                \
  297           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
  298       }                                                                 \
  299   } while (0)
 300
 301
 302 #define EMIT_ONE_BYTE(c)                                        \
 303   do {                                                          \
 304     if (dst >= (dst_bytes ? dst_end : src))                     \
 305       {                                                         \
 306         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 307         goto label_end_of_loop;                                 \
 308       }                                                         \
 309     *dst++ = c;                                                 \
 310   } while (0)
 311
 312 #define EMIT_TWO_BYTES(c1, c2)                                  \
 313   do {                                                          \
 314     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 315       {                                                         \
 316         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 317         goto label_end_of_loop;                                 \
 318       }                                                         \
 319     *dst++ = c1, *dst++ = c2;                                   \
 320   } while (0)
 321
 322 #define EMIT_BYTES(from, to)                                    \
 323   do {                                                          \
 324     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 325       {                                                         \
 326         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 327         goto label_end_of_loop;                                 \
 328       }                                                         \
 329     while (from < to)                                           \
 330       *dst++ = *from++;                                         \
 331   } while (0)
 332
 333 \f
 334 /*** 1. Preamble ***/
 335
 336 #ifdef emacs
 337 #include <config.h>
 338 #endif
 339
 340 #include <stdio.h>
 341
 342 #ifdef emacs
 343
 344 #include "lisp.h"
 345 #include "buffer.h"
 346 #include "charset.h"
 347 #include "composite.h"
 348 #include "ccl.h"
 349 #include "coding.h"
 350 #include "window.h"
 351 #include "intervals.h"
 352 #include "frame.h"
 353 #include "termhooks.h"
 354
 355 #else  /* not emacs */
 356
 357 #include "mulelib.h"
 358
 359 #endif /* not emacs */
 360
 361 Lisp_Object Qcoding_system, Qeol_type;
 362 Lisp_Object Qbuffer_file_coding_system;
 363 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 364 Lisp_Object Qno_conversion, Qundecided;
 365 Lisp_Object Qcoding_system_history;
 366 Lisp_Object Qsafe_chars;
 367 Lisp_Object Qvalid_codes;
 368
 369 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 370 Lisp_Object Qcall_process, Qcall_process_region;
 371 Lisp_Object Qstart_process, Qopen_network_stream;
 372 Lisp_Object Qtarget_idx;
 373
 374 /* If a symbol has this property, evaluate the value to define the
 375    symbol as a coding system.  */
 376 Lisp_Object Qcoding_system_define_form;
 377
 378 Lisp_Object Vselect_safe_coding_system_function;
 379
 380 int coding_system_require_warning;
 381
 382 /* Mnemonic string for each format of end-of-line.  */
 383 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 384 /* Mnemonic string to indicate format of end-of-line is not yet
 385    decided.  */
 386 Lisp_Object eol_mnemonic_undecided;
 387
 388 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 389    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 390 int system_eol_type;
 391
 392 #ifdef emacs
 393
 394 /* Information about which coding system is safe for which chars.
 395    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 396
 397    GENERIC-LIST is a list of generic coding systems which can encode
 398    any characters.
 399
 400    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 401    corresponding char table that contains safe chars.  */
 402 Lisp_Object Vcoding_system_safe_chars;
 403
 404 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 405
 406 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 407
 408 /* Coding system emacs-mule and raw-text are for converting only
 409    end-of-line format.  */
 410 Lisp_Object Qemacs_mule, Qraw_text;
 411
 412 Lisp_Object Qutf_8;
 413
 414 /* Coding-systems are handed between Emacs Lisp programs and C internal
 415    routines by the following three variables.  */
 416 /* Coding-system for reading files and receiving data from process.  */
 417 Lisp_Object Vcoding_system_for_read;
 418 /* Coding-system for writing files and sending data to process.  */
 419 Lisp_Object Vcoding_system_for_write;
 420 /* Coding-system actually used in the latest I/O.  */
 421 Lisp_Object Vlast_coding_system_used;
 422
 423 /* A vector of length 256 which contains information about special
 424    Latin codes (especially for dealing with Microsoft codes).  */
 425 Lisp_Object Vlatin_extra_code_table;
 426
 427 /* Flag to inhibit code conversion of end-of-line format.  */
 428 int inhibit_eol_conversion;
 429
 430 /* Flag to inhibit ISO2022 escape sequence detection.  */
 431 int inhibit_iso_escape_detection;
 432
 433 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 434 int inherit_process_coding_system;
 435
 436 /* Coding system to be used to encode text for terminal display when
 437    terminal coding system is nil.  */
 438 struct coding_system safe_terminal_coding;
 439
 440 /* Default coding system to be used to write a file.  */
 441 struct coding_system default_buffer_file_coding;
 442
 443 Lisp_Object Vfile_coding_system_alist;
 444 Lisp_Object Vprocess_coding_system_alist;
 445 Lisp_Object Vnetwork_coding_system_alist;
 446
 447 Lisp_Object Vlocale_coding_system;
 448
 449 #endif /* emacs */
 450
 451 Lisp_Object Qcoding_category, Qcoding_category_index;
 452
 453 /* List of symbols `coding-category-xxx' ordered by priority.  */
 454 Lisp_Object Vcoding_category_list;
 455
 456 /* Table of coding categories (Lisp symbols).  */
 457 Lisp_Object Vcoding_category_table;
 458
  459 /* Table of the symbol names of the coding categories.  The table has
         CODING_CATEGORY_IDX_MAX slots.  Presumably each name sits at the
         index of its category (the CODING_CATEGORY_IDX_XXX values, which
         are defined outside this chunk) -- confirm that before
         reordering or adding entries.  */
  460 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  461   "coding-category-emacs-mule",
  462   "coding-category-sjis",
  463   "coding-category-iso-7",
  464   "coding-category-iso-7-tight",
  465   "coding-category-iso-8-1",
  466   "coding-category-iso-8-2",
  467   "coding-category-iso-7-else",
  468   "coding-category-iso-8-else",
  469   "coding-category-ccl",
  470   "coding-category-big5",
  471   "coding-category-utf-8",
  472   "coding-category-utf-16-be",
  473   "coding-category-utf-16-le",
  474   "coding-category-raw-text",
  475   "coding-category-binary"
  476 };
 477
  478 /* Table of pointers to coding systems corresponding to each coding
  479    category.  */
 480 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 481
 482 /* Table of coding category masks.  Nth element is a mask for a coding
 483    category of which priority is Nth.  */
 484 static
 485 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 486
 487 /* Flag to tell if we look up translation table on character code
 488    conversion.  */
 489 Lisp_Object Venable_character_translation;
 490 /* Standard translation table to look up on decoding (reading).  */
 491 Lisp_Object Vstandard_translation_table_for_decode;
 492 /* Standard translation table to look up on encoding (writing).  */
 493 Lisp_Object Vstandard_translation_table_for_encode;
 494
 495 Lisp_Object Qtranslation_table;
 496 Lisp_Object Qtranslation_table_id;
 497 Lisp_Object Qtranslation_table_for_decode;
 498 Lisp_Object Qtranslation_table_for_encode;
 499
 500 /* Alist of charsets vs revision number.  */
 501 Lisp_Object Vcharset_revision_alist;
 502
 503 /* Default coding systems used for process I/O.  */
 504 Lisp_Object Vdefault_process_coding_system;
 505
 506 /* Char table for translating Quail and self-inserting input.  */
 507 Lisp_Object Vtranslation_table_for_input;
 508
 509 /* Global flag to tell that we can't call post-read-conversion and
 510    pre-write-conversion functions.  Usually the value is zero, but it
 511    is set to 1 temporarily while such functions are running.  This is
 512    to avoid infinite recursive call.  */
 513 static int inhibit_pre_post_conversion;
 514
 515 Lisp_Object Qchar_coding_system;
 516
 517 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 518    its validity.  */
 519
 520 Lisp_Object
 521 coding_safe_chars (coding_system)
 522      Lisp_Object coding_system;
 523 {
 524   Lisp_Object coding_spec, plist, safe_chars;
 525
 526   coding_spec = Fget (coding_system, Qcoding_system);
 527   plist = XVECTOR (coding_spec)->contents[3];
 528   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 529   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 530 }
 531
 532 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 533   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 534
 535 \f
 536 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 537
 538 /* Emacs' internal format for representation of multiple character
 539    sets is a kind of multi-byte encoding, i.e. characters are
 540    represented by variable-length sequences of one-byte codes.
 541
 542    ASCII characters and control characters (e.g. `tab', `newline') are
 543    represented by one-byte sequences which are their ASCII codes, in
 544    the range 0x00 through 0x7F.
 545
 546    8-bit characters of the range 0x80..0x9F are represented by
 547    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 548    code + 0x20).
 549
 550    8-bit characters of the range 0xA0..0xFF are represented by
 551    one-byte sequences which are their 8-bit code.
 552
 553    The other characters are represented by a sequence of `base
 554    leading-code', optional `extended leading-code', and one or two
 555    `position-code's.  The length of the sequence is determined by the
 556    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 557    whereas extended leading-code and position-code take the range 0xA0
 558    through 0xFF.  See `charset.h' for more details about leading-code
 559    and position-code.
 560
 561    --- CODE RANGE of Emacs' internal format ---
 562    character set        range
 563    -------------        -----
 564    ascii                0x00..0x7F
 565    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  566    eight-bit-graphic    0xA0..0xFF
 567    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 568    ---------------------------------------------
 569
 570    As this is the internal character representation, the format is
  571    usually not used externally (i.e. in a file or in data sent to a
  572    process).  But, it is possible to have a text externally in this
  573    format (i.e. by encoding it with the coding system `emacs-mule').
 574
 575    In that case, a sequence of one-byte codes has a slightly different
 576    form.
 577
 578    Firstly, all characters in eight-bit-control are represented by
 579    one-byte sequences which are their 8-bit code.
 580
 581    Next, character composition data are represented by the byte
 582    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 583    where,
 584         METHOD is 0xF0 plus one of composition method (enum
 585         composition_method),
 586
 587         BYTES is 0xA0 plus the byte length of these composition data,
 588
 589         CHARS is 0xA0 plus the number of characters composed by these
 590         data,
 591
  592         COMPONENTs are characters in multibyte form or composition
  593         rules encoded as two bytes of ASCII codes.
 594
 595    In addition, for backward compatibility, the following formats are
 596    also recognized as composition data on decoding.
 597
 598    0x80 MSEQ ...
 599    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 600
 601    Here,
  602         MSEQ is a multibyte form, but in one of these special formats:
 603           ASCII: 0xA0 ASCII_CODE+0x80,
 604           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 605         RULE is a one byte code of the range 0xA0..0xF0 that
 606         represents a composition rule.
 607   */
 608
 609 enum emacs_code_class_type emacs_code_class[256];
 610
  611 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  612    Check if a text is encoded in Emacs' internal format.  If it is,
  613    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.
         The bytes from SRC up to SRC_END are scanned in order.  The scan
         gives up and returns 0 as soon as it meets ESC, SI or SO, or a
         code in 0x81..0x9F that UNIBYTE_STR_AS_MULTIBYTE_P does not accept
         as the start of a multibyte sequence.  Running out of source
         without such a rejection counts as a match.  MULTIBYTEP is handed
         to ONE_MORE_BYTE_CHECK_MULTIBYTE when fetching each byte.  */
  614
  615 static int
  616 detect_coding_emacs_mule (src, src_end, multibytep)
  617       unsigned char *src, *src_end;
  618       int multibytep;
  619 {
  620   unsigned char c;
  621   int composing = 0;
  622   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE, which stores its status
           in coding->result.  That status is never read here.  */
  623   struct coding_system dummy_coding;
  624   struct coding_system *coding = &dummy_coding;
  625
  626   while (1)
  627     {
  628       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
  629
              /* After an old composition leading code (0x80), components
                 come in the special MSEQ form; convert C back to the code
                 it stands for before classifying it below.  */
  630       if (composing)
  631         {
  632           if (c < 0xA0)
                    /* C is below 0xA0, so it cannot start an MSEQ: stop
                       treating the input as composition data.  */
  633             composing = 0;
  634           else if (c == 0xA0)
  635             {
                      /* ASCII component: 0xA0 followed by ASCII_CODE+0x80.  */
  636               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
  637               c &= 0x7F;
  638             }
  639           else
                    /* Other component: the first byte is LEADING_CODE+0x20.  */
  640             c -= 0x20;
  641         }
  642
  643       if (c < 0x20)
  644         {
                  /* Any of these three control codes fails the check.  */
  645           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  646             return 0;
  647         }
  648       else if (c >= 0x80 && c < 0xA0)
  649         {
  650           if (c == 0x80)
  651             /* Old leading code for a composite character.  */
  652             composing = 1;
  653           else
  654             {
                      /* C is taken as a leading code.  Validate the sequence
                         starting at the byte just fetched, then continue
                         after the BYTES bytes reported by the macro.
                         NOTE(review): if MULTIBYTEP is nonzero and C came
                         from a two-byte LEADING_CODE_8_BIT_CONTROL sequence,
                         src - 1 is the second byte of that sequence, not a
                         leading code -- confirm this case is intended.  */
  655               unsigned char *src_base = src - 1;
  656               int bytes;
  657
  658               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
  659                                                bytes))
  660                 return 0;
  661               src = src_base + bytes;
  662             }
  663         }
  664     }
         /* ONE_MORE_BYTE_CHECK_MULTIBYTE jumps here when the source is
            exhausted.  */
  665  label_end_of_loop:
  666   return CODING_CATEGORY_MASK_EMACS_MULE;
  667 }
 668
 669
  670 /* Record the starting position START and METHOD of one composition.
         A four-slot header is appended to coding->cmp_data->data, and its
         index is saved in coding->cmp_data_start:
           data[0] is set to -1 here; CODING_ADD_COMPOSITION_END later
                   replaces it with the length of the composition data,
           data[1] is START plus cmp_data->char_offset,
           data[2] is left unset here; CODING_ADD_COMPOSITION_END stores
                   the ending position there,
           data[3] is METHOD converted to int.  */
  671
  672 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  673   do {                                                          \
  674     struct composition_data *cmp_data = coding->cmp_data;       \
  675     int *data = cmp_data->data + cmp_data->used;                \
  676     coding->cmp_data_start = cmp_data->used;                    \
  677     data[0] = -1;                                               \
  678     data[1] = cmp_data->char_offset + start;                    \
  679     data[3] = (int) method;                                     \
  680     cmp_data->used += 4;                                        \
  681   } while (0)
 682
  683 /* Record the ending position END of the current composition.  In the
         header located at index coding->cmp_data_start, data[0] becomes
         the number of slots used since that header began (the header
         itself included) and data[2] becomes END plus
         cmp_data->char_offset.  */
  684
  685 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
  686   do {                                                          \
  687     struct composition_data *cmp_data = coding->cmp_data;       \
  688     int *data = cmp_data->data + coding->cmp_data_start;        \
  689     data[0] = cmp_data->used - coding->cmp_data_start;          \
  690     data[2] = cmp_data->char_offset + end;                      \
  691   } while (0)
 692
  693 /* Record one COMPONENT (alternate character or composition rule).
         When the data recorded for the current composition reaches
         COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, the composition is
         closed at coding->produced_char and coding->composing is reset
         to COMPOSITION_NO.  */
  694
  695 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  696   do {                                                                  \
  697     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
  698     if (coding->cmp_data->used - coding->cmp_data_start                 \
  699         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
  700       {                                                                 \
  701         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
  702         coding->composing = COMPOSITION_NO;                             \
  703       }                                                                 \
  704   } while (0)
 705
 706
 707 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 708    is not less than SRC_END, return -1 without incrementing Src.  */
 709
 710 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 711
 712
/* Decode a character represented as a component of composition
   sequence of Emacs 20 style at SRC.  Set C to that character, store
   its multibyte form sequence at P, and set P to the end of that
   sequence.  If no valid character is found, set C to -1.

   The macro reads through SAFE_ONE_MORE_BYTE, so it uses the
   variables SRC and SRC_END of the expanding function; it also
   refers to CODING.  If the source is already exhausted when the
   macro starts, C is left negative and P is not touched.  Note that
   in the multibyte case P may already have been advanced over the
   bytes stored so far when the sequence turns out to be invalid.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
  do {                                                          \
    int bytes;                                                  \
                                                                \
    c = SAFE_ONE_MORE_BYTE ();                                  \
    if (c < 0)                                                  \
      break;                                                    \
    if (CHAR_HEAD_P (c))                                        \
      c = -1;                                                   \
    else if (c == 0xA0)                                         \
      {                                                         \
        /* 0xA0 is followed by a byte >= 0xA0; that byte */     \
        /* minus 0x80 is the character, stored as one byte.  */ \
        c = SAFE_ONE_MORE_BYTE ();                              \
        if (c < 0xA0)                                           \
          c = -1;                                               \
        else                                                    \
          {                                                     \
            c -= 0x80;                                          \
            *p++ = c;                                           \
          }                                                     \
      }                                                         \
    else if (BASE_LEADING_CODE_P (c - 0x20))                    \
      {                                                         \
        unsigned char *p0 = p;                                  \
                                                                \
        /* The leading code appears with 0x20 added.  Store */  \
        /* the real leading code, then copy the bytes that */   \
        /* BYTES_BY_CHAR_HEAD says should follow it.  */        \
        c -= 0x20;                                              \
        *p++ = c;                                               \
        bytes = BYTES_BY_CHAR_HEAD (c);                         \
        while (--bytes)                                         \
          {                                                     \
            c = SAFE_ONE_MORE_BYTE ();                          \
            if (c < 0)                                          \
              break;                                            \
            *p++ = c;                                           \
          }                                                     \
        /* Accept the bytes stored at P0 only if they form a */ \
        /* valid multibyte sequence (or the file-recovery */    \
        /* exception below applies).  */                        \
        if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
            || (coding->flags /* We are recovering a file.  */  \
                && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
                && ! CHAR_HEAD_P (p0[1])))                      \
          c = STRING_CHAR (p0, bytes);                          \
        else                                                    \
          c = -1;                                               \
      }                                                         \
    else                                                        \
      c = -1;                                                   \
  } while (0)
 763
 764
 765 /* Decode a composition rule represented as a component of composition
 766    sequence of Emacs 20 style at SRC.  Set C to the rule.  If not
 767    valid rule is found, set C to -1.  */
 768
 769 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 770   do {                                                  \
 771     c = SAFE_ONE_MORE_BYTE ();                          \
 772     c -= 0xA0;                                          \
 773     if (c < 0 || c >= 81)                               \
 774       c = -1;                                           \
 775     else                                                \
 776       {                                                 \
 777         gref = c / 9, nref = c % 9;                     \
 778         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 779       }                                                 \
 780   } while (0)
 781
 782
 783 /* Decode composition sequence encoded by `emacs-mule' at the source
 784    pointed by SRC.  SRC_END is the end of source.  Store information
 785    of the composition in CODING->cmp_data.
 786
 787    For backward compatibility, decode also a composition sequence of
 788    Emacs 20 style.  In that case, the composition sequence contains
 789    characters that should be extracted into a buffer or string.  Store
 790    those characters at *DESTINATION in multibyte form.
 791
 792    If we encounter an invalid byte sequence, return 0.
 793    If we encounter an insufficient source or destination, or
 794    insufficient space in CODING->cmp_data, return 1.
 795    Otherwise, return consumed bytes in the source.
 796
 797 */
 798 static INLINE int
 799 decode_composition_emacs_mule (coding, src, src_end,
 800                                destination, dst_end, dst_bytes)
 801      struct coding_system *coding;
 802      const unsigned char *src, *src_end;
 803      unsigned char **destination, *dst_end;
 804      int dst_bytes;
 805 {
 806   unsigned char *dst = *destination;
 807   int method, data_len, nchars;
 808   const unsigned char *src_base = src++;
 809   /* Store components of composition.  */
 810   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 811   int ncomponent;
 812   /* Store multibyte form of characters to be composed.  This is for
 813      Emacs 20 style composition sequence.  */
 814   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 815   unsigned char *bufp = buf;
 816   int c, i, gref, nref;
 817
 818   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 819       >= COMPOSITION_DATA_SIZE)
 820     {
 821       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 822       return -1;
 823     }
 824
 825   ONE_MORE_BYTE (c);
 826   if (c - 0xF0 >= COMPOSITION_RELATIVE
 827            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 828     {
 829       int with_rule;
 830
 831       method = c - 0xF0;
 832       with_rule = (method == COMPOSITION_WITH_RULE
 833                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 834       ONE_MORE_BYTE (c);
 835       data_len = c - 0xA0;
 836       if (data_len < 4
 837           || src_base + data_len > src_end)
 838         return 0;
 839       ONE_MORE_BYTE (c);
 840       nchars = c - 0xA0;
 841       if (c < 1)
 842         return 0;
 843       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 844         {
 845           /* If it is longer than this, it can't be valid.  */
 846           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 847             return 0;
 848
 849           if (ncomponent % 2 && with_rule)
 850             {
 851               ONE_MORE_BYTE (gref);
 852               gref -= 32;
 853               ONE_MORE_BYTE (nref);
 854               nref -= 32;
 855               c = COMPOSITION_ENCODE_RULE (gref, nref);
 856             }
 857           else
 858             {
 859               int bytes;
 860               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 861                   || (coding->flags /* We are recovering a file.  */
 862                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 863                       && ! CHAR_HEAD_P (src[1])))
 864                 c = STRING_CHAR (src, bytes);
 865               else
 866                 c = *src, bytes = 1;
 867               src += bytes;
 868             }
 869           component[ncomponent] = c;
 870         }
 871     }
 872   else if (c >= 0x80)
 873     {
 874       /* This may be an old Emacs 20 style format.  See the comment at
 875          the section 2 of this file.  */
 876       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 877       if (src == src_end
 878           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 879         goto label_end_of_loop;
 880
 881       src_end = src;
 882       src = src_base + 1;
 883       if (c < 0xC0)
 884         {
 885           method = COMPOSITION_RELATIVE;
 886           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 887             {
 888               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 889               if (c < 0)
 890                 break;
 891               component[ncomponent++] = c;
 892             }
 893           if (ncomponent < 2)
 894             return 0;
 895           nchars = ncomponent;
 896         }
 897       else if (c == 0xFF)
 898         {
 899           method = COMPOSITION_WITH_RULE;
 900           src++;
 901           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 902           if (c < 0)
 903             return 0;
 904           component[0] = c;
 905           for (ncomponent = 1;
 906                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 907             {
 908               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 909               if (c < 0)
 910                 break;
 911               component[ncomponent++] = c;
 912               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 913               if (c < 0)
 914                 break;
 915               component[ncomponent++] = c;
 916             }
 917           if (ncomponent < 3)
 918             return 0;
 919           nchars = (ncomponent + 1) / 2;
 920         }
 921       else
 922         return 0;
 923     }
 924   else
 925     return 0;
 926
 927   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 928     {
 929       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 930       for (i = 0; i < ncomponent; i++)
 931         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 932       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 933       if (buf < bufp)
 934         {
 935           unsigned char *p = buf;
 936           EMIT_BYTES (p, bufp);
 937           *destination += bufp - buf;
 938           coding->produced_char += nchars;
 939         }
 940       return (src - src_base);
 941     }
 942  label_end_of_loop:
 943   return -1;
 944 }
 945
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode SRC_BYTES bytes of emacs-mule text at SOURCE into
   DESTINATION.  End-of-line sequences are converted according to
   CODING->eol_type, composition data introduced by 0x80 is handed to
   decode_composition_emacs_mule (only when CODING->cmp_data is
   non-null), valid multibyte sequences are copied as they are, and
   any other byte is stored in the form produced by CHAR_STRING.

   On return, CODING->consumed and CODING->consumed_char are both set
   to the number of source bytes processed, CODING->produced to the
   number of bytes stored, and CODING->produced_char to the number of
   characters stored.  */

static void
decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  const unsigned char *src = source;
  const unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's not enough destination area to produce a
     character.  */
  const unsigned char *src_base;

  coding->produced_char = 0;
  while ((src_base = src) < src_end)
    {
      /* TMP holds the multibyte form made by CHAR_STRING for a byte
         that can't be copied verbatim; P points at the BYTES bytes to
         store for the current character.  */
      unsigned char tmp[MAX_MULTIBYTE_LENGTH];
      const unsigned char *p;
      int bytes;

      if (*src == '\r')
        {
          int c = *src++;

          if (coding->eol_type == CODING_EOL_CR)
            c = '\n';
          else if (coding->eol_type == CODING_EOL_CRLF)
            {
              /* CR LF becomes a newline; a CR followed by anything
                 else is kept, and the following byte is re-read in
                 the next iteration.  */
              ONE_MORE_BYTE (c);
              if (c != '\n')
                {
                  src--;
                  c = '\r';
                }
            }
          *dst++ = c;
          coding->produced_char++;
          continue;
        }
      else if (*src == '\n')
        {
          /* A bare LF in text whose eol type is CR or CRLF is an
             inconsistency; stop here if the caller asked for that.  */
          if ((coding->eol_type == CODING_EOL_CR
               || coding->eol_type == CODING_EOL_CRLF)
              && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
            {
              coding->result = CODING_FINISH_INCONSISTENT_EOL;
              goto label_end_of_loop;
            }
          *dst++ = *src++;
          coding->produced_char++;
          continue;
        }
      else if (*src == 0x80 && coding->cmp_data)
        {
          /* Start of composition data.  */
          int consumed  = decode_composition_emacs_mule (coding, src, src_end,
                                                         &dst, dst_end,
                                                         dst_bytes);
          /* Negative: not enough source, destination or composition
             data space.  Positive: number of source bytes used.
             Zero: invalid sequence, so treat the 0x80 as a plain
             byte.  */
          if (consumed < 0)
            goto label_end_of_loop;
          else if (consumed > 0)
            {
              src += consumed;
              continue;
            }
          bytes = CHAR_STRING (*src, tmp);
          p = tmp;
          src++;
        }
      else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
               || (coding->flags /* We are recovering a file.  */
                   && src[0] == LEADING_CODE_8_BIT_CONTROL
                   && ! CHAR_HEAD_P (src[1])))
        {
          /* Copy the BYTES bytes at SRC as they are.  */
          p = src;
          src += bytes;
        }
      else
        {
          int i, c;

          /* Check that the bytes following the leading byte are all
             non-head bytes.  */
          bytes = BYTES_BY_CHAR_HEAD (*src);
          src++;
          for (i = 1; i < bytes; i++)
            {
              ONE_MORE_BYTE (c);
              if (CHAR_HEAD_P (c))
                break;
            }
          if (i < bytes)
            {
              /* The sequence was cut short by a head byte: store only
                 the first byte, converted by CHAR_STRING, and resume
                 right after it.  */
              bytes = CHAR_STRING (*src_base, tmp);
              p = tmp;
              src = src_base + 1;
            }
          else
            {
              p = src_base;
            }
        }
      /* NOTE(review): when DST_BYTES is zero the limit is SRC itself;
         presumably that is the in-place conversion case -- confirm
         with the callers.  */
      if (dst + bytes >= (dst_bytes ? dst_end : src))
        {
          coding->result = CODING_FINISH_INSUFFICIENT_DST;
          break;
        }
      while (bytes--) *dst++ = *p++;
      coding->produced_char++;
    }
 label_end_of_loop:
  /* SRC_BASE is the start of the first character not yet decoded.  */
  coding->consumed = coding->consumed_char = src_base - source;
  coding->produced = dst - destination;
}
1064
1065
/* Encode composition data stored at DATA into a special byte sequence
   starting by 0x80.  Update CODING->cmp_data_start and maybe
   CODING->cmp_data for the next call.

   As read by this macro, DATA[0] is the length of the bunch, DATA[2]
   minus DATA[1] is the number of composed characters, DATA[3] is the
   composition method, and DATA[4] onward are the components.  The
   sequence produced is

	0x80 0xF0+METHOD 0xA0+BYTES 0xA0+CHARS COMPONENT ...

   where BYTES is the length of the whole sequence.  In rule-based
   methods every component after the first is preceded by its rule,
   written as the two bytes 0x20+GREF and 0x20+NREF.

   The macro uses the variables DST, DST_END, DST_BYTES and SRC of the
   expanding function, and jumps to `label_end_of_loop' there (after
   setting CODING->result) when the destination is too small.

   NOTE(review): BYTES and CHARS are stored in single bytes, so values
   above 0x5F would not fit -- confirm that callers can't reach that.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char buf[1024], *p0 = buf, *p;                             \
    int len = data[0];                                                  \
    int i;                                                              \
                                                                        \
    buf[0] = 0x80;                                                      \
    buf[1] = 0xF0 + data[3];    /* METHOD */                            \
    buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
    p = buf + 4;                                                        \
    if (data[3] == COMPOSITION_WITH_RULE                                \
        || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
      {                                                                 \
        p += CHAR_STRING (data[4], p);                                  \
        for (i = 5; i < len; i += 2)                                    \
          {                                                             \
            int gref, nref;                                             \
             COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
            *p++ = 0x20 + gref;                                         \
            *p++ = 0x20 + nref;                                         \
            p += CHAR_STRING (data[i + 1], p);                          \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        for (i = 4; i < len; i++)                                       \
          p += CHAR_STRING (data[i], p);                                \
      }                                                                 \
    buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
                                                                        \
    if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    while (p0 < p)                                                      \
      *dst++ = *p0++;                                                   \
    /* Step over this bunch; move on to the next block of */            \
    /* composition data once the current one is used up.  */            \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used                \
        && coding->cmp_data->next)                                      \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
1115
1116
1117 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1118                             unsigned char *, int, int));
1119
/* Encode the text of SRC_BYTES bytes at SOURCE into emacs-mule at
   DESTINATION, inserting the composition information recorded in
   CODING->cmp_data in front of each composed sequence and converting
   newlines according to CODING->eol_type.  When there is no
   composition data at all, the work is simply delegated to
   encode_eol.

   On return through the loop below, CODING->consumed is set to the
   number of source bytes processed, and CODING->produced and
   CODING->produced_char to the number of bytes stored.  */

static void
encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  const unsigned char *src = source;
  const unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Start of the character being processed in the current loop.  */
  const unsigned char *src_base;
  int c;
  /* Character offset of the current composition data block, and the
     composition bunch to be encoded next.  */
  int char_offset;
  int *data;

  /* NOTE(review): not referenced explicitly below; presumably needed
     by ONE_MORE_CHAR -- confirm against the macro definition.  */
  Lisp_Object translation_table;

  translation_table = Qnil;

  /* Optimization for the case that there's no composition.  */
  if (!coding->cmp_data || coding->cmp_data->used == 0)
    {
      encode_eol (coding, source, destination, src_bytes, dst_bytes);
      return;
    }

  char_offset = coding->cmp_data->char_offset;
  data = coding->cmp_data->data + coding->cmp_data_start;
  /* NOTE(review): the loop has no exit of its own; presumably
     ONE_MORE_CHAR and the EMIT_* macros jump to label_end_of_loop when
     the source or the destination runs out.  */
  while (1)
    {
      src_base = src;

      /* If SRC starts a composition, encode the information about the
         composition in advance.  */
      if (coding->cmp_data_start < coding->cmp_data->used
          && char_offset + coding->consumed_char == data[1])
        {
          ENCODE_COMPOSITION_EMACS_MULE (coding, data);
          /* The macro may have switched to the next block of
             composition data, so reload both values.  */
          char_offset = coding->cmp_data->char_offset;
          data = coding->cmp_data->data + coding->cmp_data_start;
        }

      ONE_MORE_CHAR (c);
      if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
                        || coding->eol_type == CODING_EOL_CR))
        {
          /* Newline becomes CR LF or a lone CR.  */
          if (coding->eol_type == CODING_EOL_CRLF)
            EMIT_TWO_BYTES ('\r', c);
          else
            EMIT_ONE_BYTE ('\r');
        }
      else if (SINGLE_BYTE_CHAR_P (c))
        {
          if (coding->flags && ! ASCII_BYTE_P (c))
            {
              /* As we are auto saving, retain the multibyte form for
                 8-bit chars.  */
              unsigned char buf[MAX_MULTIBYTE_LENGTH];
              int bytes = CHAR_STRING (c, buf);

              if (bytes == 1)
                EMIT_ONE_BYTE (buf[0]);
              else
                EMIT_TWO_BYTES (buf[0], buf[1]);
            }
          else
            EMIT_ONE_BYTE (c);
        }
      else
        /* A multibyte character is copied from the source as it is.  */
        EMIT_BYTES (src_base, src);
      coding->consumed_char++;
    }
 label_end_of_loop:
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
  return;
}
1198
1199 \f
1200 /*** 3. ISO2022 handlers ***/
1201
1202 /* The following note describes the coding system ISO2022 briefly.
1203    Since the intention of this note is to help understand the
1204    functions in this file, some parts are NOT ACCURATE or are OVERLY
1205    SIMPLIFIED.  For thorough understanding, please refer to the
1206    original document of ISO2022.  This is equivalent to the standard
1207    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1208
1209    ISO2022 provides many mechanisms to encode several character sets
1210    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1211    is encoded using bytes less than 128.  This may make the encoded
1212    text a little bit longer, but the text passes more easily through
1213    several types of gateway, some of which strip off the MSB (Most
1214    Significant Bit).
1215
1216    There are two kinds of character sets: control character sets and
1217    graphic character sets.  The former contain control characters such
1218    as `newline' and `escape' to provide control functions (control
1219    functions are also provided by escape sequences).  The latter
1220    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1221    two control character sets and many graphic character sets.
1222
1223    Graphic character sets are classified into one of the following
1224    four classes, according to the number of bytes (DIMENSION) and
1225    number of characters in one dimension (CHARS) of the set:
1226    - DIMENSION1_CHARS94
1227    - DIMENSION1_CHARS96
1228    - DIMENSION2_CHARS94
1229    - DIMENSION2_CHARS96
1230
1231    In addition, each character set is assigned an identification tag,
1232    unique for each set, called the "final character" (denoted as <F>
1233    hereafter).  The <F> of each character set is decided by ECMA(*)
1234    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1235    (0x30..0x3F are for private use only).
1236
1237    Note (*): ECMA = European Computer Manufacturers Association
1238
1239    Here are examples of graphic character sets [NAME(<F>)]:
1240         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1241         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1242         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1243         o DIMENSION2_CHARS96 -- none for the moment
1244
1245    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1246         C0 [0x00..0x1F] -- control character plane 0
1247         GL [0x20..0x7F] -- graphic character plane 0
1248         C1 [0x80..0x9F] -- control character plane 1
1249         GR [0xA0..0xFF] -- graphic character plane 1
1250
1251    A control character set is directly designated and invoked to C0 or
1252    C1 by an escape sequence.  The most common case is that:
1253    - ISO646's  control character set is designated/invoked to C0, and
1254    - ISO6429's control character set is designated/invoked to C1,
1255    and usually these designations/invocations are omitted in encoded
1256    text.  In a 7-bit environment, only C0 can be used, and a control
1257    character for C1 is encoded by an appropriate escape sequence to
1258    fit into the environment.  All control characters for C1 are
1259    defined to have corresponding escape sequences.
1260
1261    A graphic character set is at first designated to one of four
1262    graphic registers (G0 through G3), then these graphic registers are
1263    invoked to GL or GR.  These designations and invocations can be
1264    done independently.  The most common case is that G0 is invoked to
1265    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1266    these invocations and designations are omitted in encoded text.
1267    In a 7-bit environment, only GL can be used.
1268
1269    When a graphic character set of CHARS94 is invoked to GL, codes
1270    0x20 and 0x7F of the GL area work as control characters SPACE and
1271    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1272    be used.
1273
1274    There are two ways of invocation: locking-shift and single-shift.
1275    With locking-shift, the invocation lasts until the next different
1276    invocation, whereas with single-shift, the invocation affects the
1277    following character only and doesn't affect the locking-shift
1278    state.  Invocations are done by the following control characters or
1279    escape sequences:
1280
1281    ----------------------------------------------------------------------
1282    abbrev  function                  cntrl escape seq   description
1283    ----------------------------------------------------------------------
1284    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1285    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1286    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1287    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1288    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1289    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1290    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1291    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1292    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1293    ----------------------------------------------------------------------
1294    (*) These are not used by any known coding system.
1295
1296    Control characters for these functions are defined by macros
1297    ISO_CODE_XXX in `coding.h'.
1298
1299    Designations are done by the following escape sequences:
1300    ----------------------------------------------------------------------
1301    escape sequence      description
1302    ----------------------------------------------------------------------
1303    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1304    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1305    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1306    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1307    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1308    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1309    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1310    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1311    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1312    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1313    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1314    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1315    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1316    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1317    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1318    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1319    ----------------------------------------------------------------------
1320
1321    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1322    of dimension 1, chars 94, and final character <F>, etc...
1323
1324    Note (*): Although these designations are not allowed in ISO2022,
1325    Emacs accepts them on decoding, and produces them on encoding
1326    CHARS96 character sets in a coding system which is characterized as
1327    7-bit environment, non-locking-shift, and non-single-shift.
1328
1329    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1330    '(' can be omitted.  We refer to this as "short-form" hereafter.
1331
   Now you may notice that there are a lot of ways of encoding the
   same multilingual text in ISO2022.  Actually, there exist many
   coding systems such as Compound Text (used in X11's inter client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
   (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
   localized platforms), and all of these are variants of ISO2022.
1338
1339    In addition to the above, Emacs handles two more kinds of escape
1340    sequences: ISO6429's direction specification and Emacs' private
1341    sequence for specifying character composition.
1342
1343    ISO6429's direction specification takes the following form:
1344         o CSI ']'      -- end of the current direction
1345         o CSI '0' ']'  -- end of the current direction
1346         o CSI '1' ']'  -- start of left-to-right text
1347         o CSI '2' ']'  -- start of right-to-left text
1348    The control character CSI (0x9B: control sequence introducer) is
1349    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1350
1351    Character composition specification takes the following form:
1352         o ESC '0' -- start relative composition
1353         o ESC '1' -- end composition
1354         o ESC '2' -- start rule-base composition (*)
1355         o ESC '3' -- start relative composition with alternate chars  (**)
1356         o ESC '4' -- start rule-base composition with alternate chars  (**)
1357   Since these are not standard escape sequences of any ISO standard,
1358   the use of them with these meanings is restricted to Emacs only.
1359
1360   (*) This form is used only in Emacs 20.5 and older versions,
1361   but the newer versions can safely decode it.
1362   (**) This form is used only in Emacs 21.1 and newer versions,
1363   and the older versions can't decode it.
1364
1365   Here's a list of example usages of these composition escape
1366   sequences (categorized by `enum composition_method').
1367
1368   COMPOSITION_RELATIVE:
1369         ESC 0 CHAR [ CHAR ] ESC 1
1370   COMPOSITION_WITH_RULE:
1371         ESC 2 CHAR [ RULE CHAR ] ESC 1
1372   COMPOSITION_WITH_ALTCHARS:
1373         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1374   COMPOSITION_WITH_RULE_ALTCHARS:
1375         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1376
/* Table with one entry for each of the 256 possible byte values.
   NOTE(review): presumably it gives the ISO2022 code class of each
   byte and is filled in elsewhere in this file -- confirm.  */
enum iso_code_class_type iso_code_class[256];
1378
/* Return nonzero if the coding system registered at index IDX of
   coding_system_table exists, can handle character C of CHARSET
   (CHARSET is ASCII, or C is one of the safe characters of that
   coding system), and has some designation requested for CHARSET.

   The expanding function must have a variable `safe_chars'; it is
   assigned as a side effect when CHARSET is not ASCII.  CHARSET and
   C are parenthesized so that an argument containing a low
   precedence operator (e.g. a conditional expression) is compared
   and passed as a whole.  IDX and CHARSET are evaluated more than
   once.  */
#define CHARSET_OK(idx, charset, c)                                     \
  (coding_system_table[idx]                                             \
   && ((charset) == CHARSET_ASCII                                       \
       || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
           CODING_SAFE_CHAR_P (safe_chars, (c))))                       \
   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                              (charset))                \
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1387
/* Nonzero if the initial designation to graphic register 1 of the
   coding system at index IDX of coding_system_table is not
   negative.  */
#define SHIFT_OUT_OK(idx)                                               \
  (0 <= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1))
1390
/* Nonzero unless composition is disabled for the coding system at
   index IDX of coding_system_table.  */
#define COMPOSITION_OK(idx)                                             \
  (COMPOSITION_DISABLED != coding_system_table[idx]->composing)
1393
1394 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1395    Check if a text is encoded in ISO2022.  If it is, return an
1396    integer in which appropriate flag bits any of:
1397         CODING_CATEGORY_MASK_ISO_7
1398         CODING_CATEGORY_MASK_ISO_7_TIGHT
1399         CODING_CATEGORY_MASK_ISO_8_1
1400         CODING_CATEGORY_MASK_ISO_8_2
1401         CODING_CATEGORY_MASK_ISO_7_ELSE
1402         CODING_CATEGORY_MASK_ISO_8_ELSE
1403    are set.  If a code which should never appear in ISO2022 is found,
1404    returns 0.  */
1405
1406 static int
1407 detect_coding_iso2022 (src, src_end, multibytep)
1408      unsigned char *src, *src_end;
1409      int multibytep;
1410 {
1411   int mask = CODING_CATEGORY_MASK_ISO;
1412   int mask_found = 0;
1413   int reg[4], shift_out = 0, single_shifting = 0;
1414   int c, c1, charset;
1415   /* Dummy for ONE_MORE_BYTE.  */
1416   struct coding_system dummy_coding;
1417   struct coding_system *coding = &dummy_coding;
1418   Lisp_Object safe_chars;
1419
1420   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1421   while (mask && src < src_end)
1422     {
1423       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1424     retry:
1425       switch (c)
1426         {
1427         case ISO_CODE_ESC:
1428           if (inhibit_iso_escape_detection)
1429             break;
1430           single_shifting = 0;
1431           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1432           if (c >= '(' && c <= '/')
1433             {
1434               /* Designation sequence for a charset of dimension 1.  */
1435               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1436               if (c1 < ' ' || c1 >= 0x80
1437                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1438                 /* Invalid designation sequence.  Just ignore.  */
1439                 break;
1440               reg[(c - '(') % 4] = charset;
1441             }
1442           else if (c == '$')
1443             {
1444               /* Designation sequence for a charset of dimension 2.  */
1445               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1446               if (c >= '@' && c <= 'B')
1447                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1448                 reg[0] = charset = iso_charset_table[1][0][c];
1449               else if (c >= '(' && c <= '/')
1450                 {
1451                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1452                   if (c1 < ' ' || c1 >= 0x80
1453                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1454                     /* Invalid designation sequence.  Just ignore.  */
1455                     break;
1456                   reg[(c - '(') % 4] = charset;
1457                 }
1458               else
1459                 /* Invalid designation sequence.  Just ignore.  */
1460                 break;
1461             }
1462           else if (c == 'N' || c == 'O')
1463             {
1464               /* ESC <Fe> for SS2 or SS3.  */
1465               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1466               break;
1467             }
1468           else if (c >= '0' && c <= '4')
1469             {
1470               /* ESC <Fp> for start/end composition.  */
1471               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1472                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1473               else
1474                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1475               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1476                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1477               else
1478                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1479               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1480                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1481               else
1482                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1483               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1484                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1485               else
1486                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1487               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1488                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1489               else
1490                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1491               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1492                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1493               else
1494                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1495               break;
1496             }
1497           else
1498             /* Invalid escape sequence.  Just ignore.  */
1499             break;
1500
1501           /* We found a valid designation sequence for CHARSET.  */
1502           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1503           c = MAKE_CHAR (charset, 0, 0);
1504           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1505             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1506           else
1507             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1508           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1509             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1510           else
1511             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1512           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1513             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1514           else
1515             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1516           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1517             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1518           else
1519             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1520           break;
1521
1522         case ISO_CODE_SO:
1523           if (inhibit_iso_escape_detection)
1524             break;
1525           single_shifting = 0;
1526           if (shift_out == 0
1527               && (reg[1] >= 0
1528                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1529                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1530             {
1531               /* Locking shift out.  */
1532               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1533               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1534             }
1535           break;
1536
1537         case ISO_CODE_SI:
1538           if (inhibit_iso_escape_detection)
1539             break;
1540           single_shifting = 0;
1541           if (shift_out == 1)
1542             {
1543               /* Locking shift in.  */
1544               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1545               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1546             }
1547           break;
1548
1549         case ISO_CODE_CSI:
1550           single_shifting = 0;
1551         case ISO_CODE_SS2:
1552         case ISO_CODE_SS3:
1553           {
1554             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1555
1556             if (inhibit_iso_escape_detection)
1557               break;
1558             if (c != ISO_CODE_CSI)
1559               {
1560                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1561                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1562                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1563                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1564                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1565                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1566                 single_shifting = 1;
1567               }
1568             if (VECTORP (Vlatin_extra_code_table)
1569                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1570               {
1571                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572                     & CODING_FLAG_ISO_LATIN_EXTRA)
1573                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575                     & CODING_FLAG_ISO_LATIN_EXTRA)
1576                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577               }
1578             mask &= newmask;
1579             mask_found |= newmask;
1580           }
1581           break;
1582
1583         default:
1584           if (c < 0x80)
1585             {
1586               single_shifting = 0;
1587               break;
1588             }
1589           else if (c < 0xA0)
1590             {
1591               single_shifting = 0;
1592               if (VECTORP (Vlatin_extra_code_table)
1593                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1594                 {
1595                   int newmask = 0;
1596
1597                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1598                       & CODING_FLAG_ISO_LATIN_EXTRA)
1599                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1600                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1601                       & CODING_FLAG_ISO_LATIN_EXTRA)
1602                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1603                   mask &= newmask;
1604                   mask_found |= newmask;
1605                 }
1606               else
1607                 return 0;
1608             }
1609           else
1610             {
1611               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1612                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1613               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1614               /* Check the length of succeeding codes of the range
1615                  0xA0..0FF.  If the byte length is odd, we exclude
1616                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1617                  when we are not single shifting.  */
1618               if (!single_shifting
1619                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1620                 {
1621                   int i = 1;
1622
1623                   c = -1;
1624                   while (src < src_end)
1625                     {
1626                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1627                       if (c < 0xA0)
1628                         break;
1629                       i++;
1630                     }
1631
1632                   if (i & 1 && src < src_end)
1633                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1634                   else
1635                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1636                   if (c >= 0)
1637                     /* This means that we have read one extra byte.  */
1638                     goto retry;
1639                 }
1640             }
1641           break;
1642         }
1643     }
1644  label_end_of_loop:
1645   return (mask & mask_found);
1646 }
1647
/* Return the character code of the character in CHARSET whose 1st
   position code is C1 and whose 2nd position code is C2.  If the
   variable `translation_table' (which must be in scope where the
   macro is expanded) is non-nil, return the code translated through
   that table instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                           \
  (!NILP (translation_table)                                            \
   ? translate_char (translation_table, -1, (charset), (c1), (c2))      \
   : MAKE_CHAR ((charset), (c1), (c2)))
1657
/* Set designation state into CODING: designate to graphic register
   REG the charset looked up in ISO_CHARSET_TABLE by DIMENSION, CHARS
   and FINAL_CHAR.

   Control jumps to `label_invalid_code' when:
     - FINAL_CHAR is below '0' or not below 128,
     - the charset is undefined (negative), or is neither requested
       for REG by CODING nor among `safe_chars'; REG is then stored in
       last_invalid_designation_register,
     - ASCII is designated to register 0 while
       last_invalid_designation_register is 0, so that the sequence is
       kept as is.

   The expansion site must provide the variables `coding' and
   `safe_chars' and the label `label_invalid_code'.  REG and
   FINAL_CHAR are evaluated exactly once.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int desig_reg = (reg);                                                 \
    int desig_final = (final_char);                                        \
    int charset, c;                                                        \
                                                                           \
    if (desig_final < '0' || desig_final >= 128)                           \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (desig_final));               \
    c = MAKE_CHAR (charset, 0, 0);                                         \
    if (charset >= 0                                                       \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)        \
            == desig_reg                                                   \
            || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && desig_reg == 0                                              \
            && charset == CHARSET_ASCII)                                   \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        /* In the nonzero direction mode, use the reverse charset          \
           when one is defined.  */                                        \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
          charset = CHARSET_REVERSE_CHARSET (charset);                     \
        CODING_SPEC_ISO_DESIGNATION (coding, desig_reg) = charset;         \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register             \
          = desig_reg;                                                     \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1694
1695 /* Allocate a memory block for storing information about compositions.
1696    The block is chained to the already allocated blocks.  */
1697
1698 void
1699 coding_allocate_composition_data (coding, char_offset)
1700      struct coding_system *coding;
1701      int char_offset;
1702 {
1703   struct composition_data *cmp_data
1704     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1705
1706   cmp_data->char_offset = char_offset;
1707   cmp_data->used = 0;
1708   cmp_data->prev = coding->cmp_data;
1709   cmp_data->next = NULL;
1710   if (coding->cmp_data)
1711     coding->cmp_data->next = cmp_data;
1712   coding->cmp_data = cmp_data;
1713   coding->cmp_data_start = 0;
1714   coding->composing = COMPOSITION_NO;
1715 }
1716
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte following ESC.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   The expansion site must provide the variables `coding' and `dst'
   and the label `label_end_of_loop'.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition is not handled: emit the sequence as is.  */        \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE         \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           the following two, the codes following the current escape       \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1769
/* Handle composition end sequence ESC 1.  C1 is the byte following
   ESC.  While composing, the end of the composition is recorded in
   CODING and the composing state returns to COMPOSITION_NO;
   otherwise the two bytes ESC C1 are emitted to `dst' as is.  The
   expansion site must provide the variables `coding' and `dst'.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* No composition is in progress: pass the sequence through.  */ \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = (c1);                                                  \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1786
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into the variable `c2') and append the encoded rule
   to coding->cmp_data.  C1 must be an lvalue; it is decremented by 32
   in place.  A value outside both formats is stored as rule 0.  The
   expansion site must provide `coding', `c2' and whatever
   ONE_MORE_BYTE requires.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int encoded;                                                        \
                                                                        \
    (c1) -= 32;                                                         \
    if ((c1) >= 93)             /* neither format */                    \
      encoded = 0;                                                      \
    else if ((c1) >= 81)        /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        encoded = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);         \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        /* Reference point 4 of the old format is stored as 10.  */     \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        encoded = COMPOSITION_ENCODE_RULE (gref, nref);                 \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded);                 \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1811
1812
1813 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1814
1815 static void
1816 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1817      struct coding_system *coding;
1818      const unsigned char *source;
1819      unsigned char *destination;
1820      int src_bytes, dst_bytes;
1821 {
1822   const unsigned char *src = source;
1823   const unsigned char *src_end = source + src_bytes;
1824   unsigned char *dst = destination;
1825   unsigned char *dst_end = destination + dst_bytes;
1826   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1827   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1828   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1829   /* SRC_BASE remembers the start position in source in each loop.
1830      The loop will be exited when there's not enough source code
1831      (within macro ONE_MORE_BYTE), or when there's not enough
1832      destination area to produce a character (within macro
1833      EMIT_CHAR).  */
1834   const unsigned char *src_base;
1835   int c, charset;
1836   Lisp_Object translation_table;
1837   Lisp_Object safe_chars;
1838
1839   safe_chars = coding_safe_chars (coding->symbol);
1840
1841   if (NILP (Venable_character_translation))
1842     translation_table = Qnil;
1843   else
1844     {
1845       translation_table = coding->translation_table_for_decode;
1846       if (NILP (translation_table))
1847         translation_table = Vstandard_translation_table_for_decode;
1848     }
1849
1850   coding->result = CODING_FINISH_NORMAL;
1851
1852   while (1)
1853     {
1854       int c1, c2 = 0;
1855
1856       src_base = src;
1857       ONE_MORE_BYTE (c1);
1858
1859       /* We produce no character or one character.  */
1860       switch (iso_code_class [c1])
1861         {
1862         case ISO_0x20_or_0x7F:
1863           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1864             {
1865               DECODE_COMPOSITION_RULE (c1);
1866               continue;
1867             }
1868           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1869             {
1870               /* This is SPACE or DEL.  */
1871               charset = CHARSET_ASCII;
1872               break;
1873             }
1874           /* This is a graphic character, we fall down ...  */
1875
1876         case ISO_graphic_plane_0:
1877           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1878             {
1879               DECODE_COMPOSITION_RULE (c1);
1880               continue;
1881             }
1882           charset = charset0;
1883           break;
1884
1885         case ISO_0xA0_or_0xFF:
1886           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1887               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1888             goto label_invalid_code;
1889           /* This is a graphic character, we fall down ... */
1890
1891         case ISO_graphic_plane_1:
1892           if (charset1 < 0)
1893             goto label_invalid_code;
1894           charset = charset1;
1895           break;
1896
1897         case ISO_control_0:
1898           if (COMPOSING_P (coding))
1899             DECODE_COMPOSITION_END ('1');
1900
1901           /* All ISO2022 control characters in this class have the
1902              same representation in Emacs internal format.  */
1903           if (c1 == '\n'
1904               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1905               && (coding->eol_type == CODING_EOL_CR
1906                   || coding->eol_type == CODING_EOL_CRLF))
1907             {
1908               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1909               goto label_end_of_loop;
1910             }
1911           charset = CHARSET_ASCII;
1912           break;
1913
1914         case ISO_control_1:
1915           if (COMPOSING_P (coding))
1916             DECODE_COMPOSITION_END ('1');
1917           goto label_invalid_code;
1918
1919         case ISO_carriage_return:
1920           if (COMPOSING_P (coding))
1921             DECODE_COMPOSITION_END ('1');
1922
1923           if (coding->eol_type == CODING_EOL_CR)
1924             c1 = '\n';
1925           else if (coding->eol_type == CODING_EOL_CRLF)
1926             {
1927               ONE_MORE_BYTE (c1);
1928               if (c1 != ISO_CODE_LF)
1929                 {
1930                   src--;
1931                   c1 = '\r';
1932                 }
1933             }
1934           charset = CHARSET_ASCII;
1935           break;
1936
1937         case ISO_shift_out:
1938           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1939               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1940             goto label_invalid_code;
1941           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1942           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1943           continue;
1944
1945         case ISO_shift_in:
1946           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1947             goto label_invalid_code;
1948           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1949           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1950           continue;
1951
1952         case ISO_single_shift_2_7:
1953         case ISO_single_shift_2:
1954           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1955             goto label_invalid_code;
1956           /* SS2 is handled as an escape sequence of ESC 'N' */
1957           c1 = 'N';
1958           goto label_escape_sequence;
1959
1960         case ISO_single_shift_3:
1961           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1962             goto label_invalid_code;
1963           /* SS2 is handled as an escape sequence of ESC 'O' */
1964           c1 = 'O';
1965           goto label_escape_sequence;
1966
1967         case ISO_control_sequence_introducer:
1968           /* CSI is handled as an escape sequence of ESC '[' ...  */
1969           c1 = '[';
1970           goto label_escape_sequence;
1971
1972         case ISO_escape:
1973           ONE_MORE_BYTE (c1);
1974         label_escape_sequence:
1975           /* Escape sequences handled by Emacs are invocation,
1976              designation, direction specification, and character
1977              composition specification.  */
1978           switch (c1)
1979             {
1980             case '&':           /* revision of following character set */
1981               ONE_MORE_BYTE (c1);
1982               if (!(c1 >= '@' && c1 <= '~'))
1983                 goto label_invalid_code;
1984               ONE_MORE_BYTE (c1);
1985               if (c1 != ISO_CODE_ESC)
1986                 goto label_invalid_code;
1987               ONE_MORE_BYTE (c1);
1988               goto label_escape_sequence;
1989
1990             case '$':           /* designation of 2-byte character set */
1991               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1992                 goto label_invalid_code;
1993               ONE_MORE_BYTE (c1);
1994               if (c1 >= '@' && c1 <= 'B')
1995                 {       /* designation of JISX0208.1978, GB2312.1980,
1996                            or JISX0208.1980 */
1997                   DECODE_DESIGNATION (0, 2, 94, c1);
1998                 }
1999               else if (c1 >= 0x28 && c1 <= 0x2B)
2000                 {       /* designation of DIMENSION2_CHARS94 character set */
2001                   ONE_MORE_BYTE (c2);
2002                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2003                 }
2004               else if (c1 >= 0x2C && c1 <= 0x2F)
2005                 {       /* designation of DIMENSION2_CHARS96 character set */
2006                   ONE_MORE_BYTE (c2);
2007                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2008                 }
2009               else
2010                 goto label_invalid_code;
2011               /* We must update these variables now.  */
2012               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2013               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2014               continue;
2015
2016             case 'n':           /* invocation of locking-shift-2 */
2017               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2018                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2019                 goto label_invalid_code;
2020               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2021               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2022               continue;
2023
2024             case 'o':           /* invocation of locking-shift-3 */
2025               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2026                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2027                 goto label_invalid_code;
2028               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2029               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2030               continue;
2031
2032             case 'N':           /* invocation of single-shift-2 */
2033               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2034                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2035                 goto label_invalid_code;
2036               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2037               ONE_MORE_BYTE (c1);
2038               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2039                 goto label_invalid_code;
2040               break;
2041
2042             case 'O':           /* invocation of single-shift-3 */
2043               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2044                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2045                 goto label_invalid_code;
2046               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2047               ONE_MORE_BYTE (c1);
2048               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2049                 goto label_invalid_code;
2050               break;
2051
2052             case '0': case '2': case '3': case '4': /* start composition */
2053               DECODE_COMPOSITION_START (c1);
2054               continue;
2055
2056             case '1':           /* end composition */
2057               DECODE_COMPOSITION_END (c1);
2058               continue;
2059
2060             case '[':           /* specification of direction */
2061               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2062                 goto label_invalid_code;
2063               /* For the moment, nested direction is not supported.
2064                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2065                  left-to-right, and nonzero means right-to-left.  */
2066               ONE_MORE_BYTE (c1);
2067               switch (c1)
2068                 {
2069                 case ']':       /* end of the current direction */
2070                   coding->mode &= ~CODING_MODE_DIRECTION;
2071
2072                 case '0':       /* end of the current direction */
2073                 case '1':       /* start of left-to-right direction */
2074                   ONE_MORE_BYTE (c1);
2075                   if (c1 == ']')
2076                     coding->mode &= ~CODING_MODE_DIRECTION;
2077                   else
2078                     goto label_invalid_code;
2079                   break;
2080
2081                 case '2':       /* start of right-to-left direction */
2082                   ONE_MORE_BYTE (c1);
2083                   if (c1 == ']')
2084                     coding->mode |= CODING_MODE_DIRECTION;
2085                   else
2086                     goto label_invalid_code;
2087                   break;
2088
2089                 default:
2090                   goto label_invalid_code;
2091                 }
2092               continue;
2093
2094             case '%':
2095               if (COMPOSING_P (coding))
2096                 DECODE_COMPOSITION_END ('1');
2097               ONE_MORE_BYTE (c1);
2098               if (c1 == '/')
2099                 {
2100                   /* CTEXT extended segment:
2101                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2102                      We keep these bytes as is for the moment.
2103                      They may be decoded by post-read-conversion.  */
2104                   int dim, M, L;
2105                   int size, required;
2106                   int produced_chars;
2107
2108                   ONE_MORE_BYTE (dim);
2109                   ONE_MORE_BYTE (M);
2110                   ONE_MORE_BYTE (L);
2111                   size = ((M - 128) * 128) + (L - 128);
2112                   required = 8 + size * 2;
2113                   if (dst + required > (dst_bytes ? dst_end : src))
2114                     goto label_end_of_loop;
2115                   *dst++ = ISO_CODE_ESC;
2116                   *dst++ = '%';
2117                   *dst++ = '/';
2118                   *dst++ = dim;
2119                   produced_chars = 4;
2120                   dst += CHAR_STRING (M, dst), produced_chars++;
2121                   dst += CHAR_STRING (L, dst), produced_chars++;
2122                   while (size-- > 0)
2123                     {
2124                       ONE_MORE_BYTE (c1);
2125                       dst += CHAR_STRING (c1, dst), produced_chars++;
2126                     }
2127                   coding->produced_char += produced_chars;
2128                 }
2129               else if (c1 == 'G')
2130                 {
2131                   unsigned char *d = dst;
2132                   int produced_chars;
2133
2134                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2135                      ESC % G --UTF-8-BYTES-- ESC % @
2136                      We keep these bytes as is for the moment.
2137                      They may be decoded by post-read-conversion.  */
2138                   if (d + 6 > (dst_bytes ? dst_end : src))
2139                     goto label_end_of_loop;
2140                   *d++ = ISO_CODE_ESC;
2141                   *d++ = '%';
2142                   *d++ = 'G';
2143                   produced_chars = 3;
2144                   while (d + 1 < (dst_bytes ? dst_end : src))
2145                     {
2146                       ONE_MORE_BYTE (c1);
2147                       if (c1 == ISO_CODE_ESC
2148                           && src + 1 < src_end
2149                           && src[0] == '%'
2150                           && src[1] == '@')
2151                         {
2152                           src += 2;
2153                           break;
2154                         }
2155                       d += CHAR_STRING (c1, d), produced_chars++;
2156                     }
2157                   if (d + 3 > (dst_bytes ? dst_end : src))
2158                     goto label_end_of_loop;
2159                   *d++ = ISO_CODE_ESC;
2160                   *d++ = '%';
2161                   *d++ = '@';
2162                   dst = d;
2163                   coding->produced_char += produced_chars + 3;
2164                 }
2165               else
2166                 goto label_invalid_code;
2167               continue;
2168
2169             default:
2170               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2171                 goto label_invalid_code;
2172               if (c1 >= 0x28 && c1 <= 0x2B)
2173                 {       /* designation of DIMENSION1_CHARS94 character set */
2174                   ONE_MORE_BYTE (c2);
2175                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2176                 }
2177               else if (c1 >= 0x2C && c1 <= 0x2F)
2178                 {       /* designation of DIMENSION1_CHARS96 character set */
2179                   ONE_MORE_BYTE (c2);
2180                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2181                 }
2182               else
2183                 goto label_invalid_code;
2184               /* We must update these variables now.  */
2185               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2186               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2187               continue;
2188             }
2189         }
2190
2191       /* Now we know CHARSET and 1st position code C1 of a character.
2192          Produce a multibyte sequence for that character while getting
2193          2nd position code C2 if necessary.  */
2194       if (CHARSET_DIMENSION (charset) == 2)
2195         {
2196           ONE_MORE_BYTE (c2);
2197           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2198             /* C2 is not in a valid range.  */
2199             goto label_invalid_code;
2200         }
2201       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2202       EMIT_CHAR (c);
2203       continue;
2204
2205     label_invalid_code:
2206       coding->errors++;
2207       if (COMPOSING_P (coding))
2208         DECODE_COMPOSITION_END ('1');
2209       src = src_base;
2210       c = *src++;
2211       if (! NILP (translation_table))
2212         c = translate_char (translation_table, c, 0, 0, 0);
2213       EMIT_CHAR (c);
2214     }
2215
2216  label_end_of_loop:
2217   coding->consumed = coding->consumed_char = src_base - source;
2218   coding->produced = dst - destination;
2219   return;
2220 }
2221
2222
2223 /* ISO2022 encoding stuff.  */
2224
2225 /*
2226    Saying just "ISO2022" is not enough to specify an encoding; we
2227    have to specify more details.  In Emacs, each ISO2022 coding system
2228    variant has the following specifications:
2229         1. Initial designation to G0 through G3.
2230         2. Allows short-form designation?
2231         3. ASCII should be designated to G0 before control characters?
2232         4. ASCII should be designated to G0 at end of line?
2233         5. 7-bit environment or 8-bit environment?
2234         6. Use locking-shift?
2235         7. Use Single-shift?
2236    And the following two are only for Japanese:
2237         8. Use ASCII in place of JIS0201-1976-Roman?
2238         9. Use JISX0208-1983 in place of JISX0208-1978?
2239    These specifications are encoded in `coding->flags' as flag bits
2240    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2241    details.
2242 */
2243
2244 /* Produce codes (escape sequence) for designating CHARSET to graphic
2245    register REG at DST, and increment DST.  If <final-char> of CHARSET is
2246    '@', 'A', or 'B' and the coding system CODING allows, produce
2247    designation sequence of short-form.

        DST is not an argument: the macro stores through, and advances,
        the variable `dst' that is in scope where it is expanded.  No
        check for the end of the destination buffer is made here.

        The bytes are produced in this order:
          1. If the revision number of CHARSET in CODING is less than
             255, ESC '&' and the byte '@' + revision.
          2. ESC.
          3. For a one-dimensional CHARSET, the intermediate character
             selected by REG: one of "()*+" if CHARSET has 94
             characters, one of ",-./" otherwise.
             For any other CHARSET, '$' followed by the intermediate
             character selected in the same way, except that the
             intermediate character is omitted (short-form) when
             CHARSET has 94 characters, CODING_FLAG_ISO_SHORT_FORM is
             set, REG is 0, and <final-char> is in the range '@'..'B'.
          4. <final-char> of CHARSET.

        Finally CHARSET is recorded as the current designation of REG
        in CODING.  REG indexes four-character tables, so it is
        presumably expected to be in the range 0..3.  */
2248
2249 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2250   do {                                                                  \
2251     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2252     char *intermediate_char_94 = "()*+";                                \
2253     char *intermediate_char_96 = ",-./";                                \
2254     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2255                                                                         \
2256     if (revision < 255)                                                 \
2257       {                                                                 \
2258         *dst++ = ISO_CODE_ESC;                                          \
2259         *dst++ = '&';                                                   \
2260         *dst++ = '@' + revision;                                        \
2261       }                                                                 \
2262     *dst++ = ISO_CODE_ESC;                                              \
2263     if (CHARSET_DIMENSION (charset) == 1)                               \
2264       {                                                                 \
2265         if (CHARSET_CHARS (charset) == 94)                              \
2266           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2267         else                                                            \
2268           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2269       }                                                                 \
2270     else                                                                \
2271       {                                                                 \
2272         *dst++ = '$';                                                   \
2273         if (CHARSET_CHARS (charset) == 94)                              \
2274           {                                                             \
2275             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
2276                 || reg != 0                                             \
2277                 || final_char < '@' || final_char > 'B')                \
2278               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2279           }                                                             \
2280         else                                                            \
2281           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2282       }                                                                 \
2283     *dst++ = final_char;                                                \
2284     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
2285   } while (0)
2286
2287 /* The following two macros produce the ISO2022 single-shift-2 and
2288    single-shift-3 functions at `dst': the escape sequence ESC 'N' or
2289    ESC 'O' if CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags', and
        the control character ISO_CODE_SS2 or ISO_CODE_SS3 if not.  Both
        then set CODING_SPEC_ISO_SINGLE_SHIFTING (coding) to 1.  */
2290
2291 #define ENCODE_SINGLE_SHIFT_2                           \
2292   do {                                                  \
2293     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
2294       *dst++ = ISO_CODE_SS2;                            \
2295     else                                                \
2296       { *dst++ = ISO_CODE_ESC; *dst++ = 'N'; }          \
2297     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2298   } while (0)
2299
2300 #define ENCODE_SINGLE_SHIFT_3                           \
2301   do {                                                  \
2302     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
2303       *dst++ = ISO_CODE_SS3;                            \
2304     else                                                \
2305       { *dst++ = ISO_CODE_ESC; *dst++ = 'O'; }          \
2306     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2307   } while (0)
2308
2309 /* The following four macros produce the ISO2022 locking-shift
2310    functions at `dst' and record in `coding' the graphic register
2311    that is now invoked to graphic plane 0:
          ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
          ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
          ENCODE_LOCKING_SHIFT_2   ESC 'n'       register 2
          ENCODE_LOCKING_SHIFT_3   ESC 'o'       register 3  */
2312
2313 #define ENCODE_SHIFT_IN                         \
2314   do {                                          \
2315     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2316     *dst++ = ISO_CODE_SI;                       \
2317   } while (0)
2318
2319 #define ENCODE_SHIFT_OUT                        \
2320   do {                                          \
2321     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2322     *dst++ = ISO_CODE_SO;                       \
2323   } while (0)
2324
2325 #define ENCODE_LOCKING_SHIFT_2                  \
2326   do {                                          \
2327     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
2328     *dst++ = ISO_CODE_ESC; *dst++ = 'n';        \
2329   } while (0)
2330
2331 #define ENCODE_LOCKING_SHIFT_3                  \
2332   do {                                          \
2333     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
2334     *dst++ = ISO_CODE_ESC; *dst++ = 'o';        \
2335   } while (0)
2336
2337 /* Produce codes for a DIMENSION1 character whose character set is
2338    CHARSET and whose position-code is C1.  Designation and invocation
2339    sequences are also produced in advance if necessary.

        The macro uses the variables `coding' and `dst' that are in
        scope where it is expanded, and repeats the following steps
        until one byte has been produced for C1:
          - if CODING is in single-shifting state, produce C1 with the
            8th bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set
            (otherwise), and leave the single-shifting state;
          - else if CHARSET is the charset of graphic plane 0, produce
            C1 with the 8th bit cleared;
          - else if CHARSET is the charset of graphic plane 1, produce
            C1 with the 8th bit set;
          - else call encode_invocation_designation and try again.

        CHARSET and C1 are parenthesized wherever they are combined
        with an operator, so that any expression can be given.  They
        are evaluated more than once, so they must not have side
        effects.  */
2340
2341 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2342   do {                                                                  \
2343     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2344       {                                                                 \
2345         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2346           *dst++ = (c1) & 0x7F;                                         \
2347         else                                                            \
2348           *dst++ = (c1) | 0x80;                                         \
2349         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2350         break;                                                          \
2351       }                                                                 \
2352     else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
2353       {                                                                 \
2354         *dst++ = (c1) & 0x7F;                                           \
2355         break;                                                          \
2356       }                                                                 \
2357     else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
2358       {                                                                 \
2359         *dst++ = (c1) | 0x80;                                           \
2360         break;                                                          \
2361       }                                                                 \
2362     else                                                                \
2363       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2364          must invoke it, or, at first, designate it to some graphic     \
2365          register.  Then repeat the loop to actually produce the        \
2366          character.  */                                                 \
2367       dst = encode_invocation_designation (charset, coding, dst);       \
2368   } while (1)
2369
2370 /* Produce codes for a DIMENSION2 character whose character set is
2371    CHARSET and whose position-codes are C1 and C2.  Designation and
2372    invocation codes are also produced in advance if necessary.

        The macro uses the variables `coding' and `dst' that are in
        scope where it is expanded, and repeats the following steps
        until the two bytes for C1 and C2 have been produced:
          - if CODING is in single-shifting state, produce C1 and C2
            with the 8th bit cleared (CODING_FLAG_ISO_SEVEN_BITS set)
            or set (otherwise), and leave the single-shifting state;
          - else if CHARSET is the charset of graphic plane 0, produce
            C1 and C2 with the 8th bit cleared;
          - else if CHARSET is the charset of graphic plane 1, produce
            C1 and C2 with the 8th bit set;
          - else call encode_invocation_designation and try again.

        CHARSET, C1 and C2 are parenthesized wherever they are combined
        with an operator, so that any expression can be given.  They
        are evaluated more than once, so they must not have side
        effects.  */
2373
2374 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2375   do {                                                                  \
2376     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2377       {                                                                 \
2378         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2379           *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                   \
2380         else                                                            \
2381           *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                   \
2382         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2383         break;                                                          \
2384       }                                                                 \
2385     else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
2386       {                                                                 \
2387         *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                     \
2388         break;                                                          \
2389       }                                                                 \
2390     else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
2391       {                                                                 \
2392         *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                     \
2393         break;                                                          \
2394       }                                                                 \
2395     else                                                                \
2396       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2397          must invoke it, or, at first, designate it to some graphic     \
2398          register.  Then repeat the loop to actually produce the        \
2399          character.  */                                                 \
2400       dst = encode_invocation_designation (charset, coding, dst);       \
2401   } while (1)
2402
     /* Produce codes for character C at `dst'.

        C is split by SPLIT_CHAR into a charset and position codes C1
        and C2.  If the charset is defined, C is produced by
        ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
        according to the dimension of the charset, after these
        substitutions:
          - CHARSET_ASCII is replaced by charset_latin_jisx0201 if
            CODING_FLAG_ISO_USE_ROMAN is set in `coding->flags';
          - charset_jisx0208 is replaced by charset_jisx0208_1978 if
            CODING_FLAG_ISO_USE_OLDJIS is set in `coding->flags'.
        If the charset is not defined, C1 is stored as is, followed by
        C2 if C2 is not negative.

        The expansion declares its own variables `charset', `c1' and
        `c2', so the argument must not refer to variables of those
        names.  */
2403 #define ENCODE_ISO_CHARACTER(c)                                 \
2404   do {                                                          \
2405     int charset, c1, c2;                                        \
2406                                                                 \
2407     SPLIT_CHAR (c, charset, c1, c2);                            \
2408     if (CHARSET_DEFINED_P (charset))                            \
2409       {                                                         \
2410         if (CHARSET_DIMENSION (charset) == 1)                   \
2411           {                                                     \
2412             if (charset == CHARSET_ASCII                        \
2413                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
2414               charset = charset_latin_jisx0201;                 \
2415             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
2416           }                                                     \
2417         else                                                    \
2418           {                                                     \
2419             if (charset == charset_jisx0208                     \
2420                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
2421               charset = charset_jisx0208_1978;                  \
2422             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
2423           }                                                     \
2424       }                                                         \
2425     else                                                        \
2426       {                                                         \
2427         *dst++ = c1;                                            \
2428         if (c2 >= 0)                                            \
2429           *dst++ = c2;                                          \
2430       }                                                         \
2431   } while (0)
2432
2433
2434 /* Instead of encoding character C, produce
        CODING_REPLACEMENT_CHARACTER: twice if the width of the charset
        of C is more than 1, once otherwise.  */
2435
2436 #define ENCODE_UNSAFE_CHARACTER(c)                              \
2437   do {                                                          \
2438     int n_marks = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1; \
2439     while (n_marks-- > 0)                                       \
2440       ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);      \
2441   } while (0)
2442
2443
2444 /* Make CHARSET usable for the following output: produce at DST the
2445    designation and invocation codes that are still missing, and update
2446    the element `spec.iso2022' of *CODING accordingly.  No check for
2447    the end of the destination buffer is done here.  Return the new
2448    DST.  */
2449
2450 unsigned char *
2451 encode_invocation_designation (charset, coding, dst)
2452      int charset;
2453      struct coding_system *coding;
2454      unsigned char *dst;
2455 {
2456   int reg = 0;                  /* graphic register number */
2457
2458   /* Search for a graphic register CHARSET is already designated to;
2459      REG ends up as 4 if there is none.  */
2460   while (reg < 4
2461          && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
2462     reg++;
2463
2464   if (reg == 4)
2465     {
2466       /* CHARSET must be designated first.  Use the graphic register
2467          requested for CHARSET, or graphic register 0 if CHARSET
2468          requests no special designation.  */
2469       int requested
2470         = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2471
2472       reg = (requested == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
2473              ? 0 : requested);
2474       ENCODE_DESIGNATION (charset, reg, coding);
2475     }
2476
2477   /* Nothing more to produce if the graphic register REG is already
2478      invoked to graphic plane 0 or 1.  */
2479   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
2480       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
2481     return dst;
2482
2483   /* Otherwise invoke REG.  Registers 2 and 3 are invoked by a single
2484      shift if CODING allows it, and by a locking shift if not.  A REG
2485      outside the range 0..3 produces nothing.  */
2486   if (reg == 0)
2487     ENCODE_SHIFT_IN;
2488   else if (reg == 1)
2489     ENCODE_SHIFT_OUT;
2490   else if (reg == 2)
2491     {
2492       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2493         ENCODE_SINGLE_SHIFT_2;
2494       else
2495         ENCODE_LOCKING_SHIFT_2;
2496     }
2497   else if (reg == 3)
2498     {
2499       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2500         ENCODE_SINGLE_SHIFT_3;
2501       else
2502         ENCODE_LOCKING_SHIFT_3;
2503     }
2504
2505   return dst;
2506 }
2507
2508 /* Store at `dst' the two bytes that encode composition rule RULE:
        RULE is decoded by COMPOSITION_DECODE_RULE into GREF and NREF,
        which are stored as 32 + 81 + GREF and 32 + NREF.  */
2509
2510 #define ENCODE_COMPOSITION_RULE(rule)            \
2511   do {                                           \
2512     int gref, nref;                              \
2513     COMPOSITION_DECODE_RULE (rule, gref, nref);  \
2514     dst[0] = 32 + 81 + gref, dst[1] = 32 + nref; \
2515     dst += 2;                                    \
2516   } while (0)
2517
2518 /* Start a composition sequence.  DATA points to an array of integers
2519    which specify information about the composition (see the comment
2520    in coding.h for the format of DATA).  The composition method
2521    DATA[3] is recorded in CODING, and the following codes are
        produced at `dst': ESC 0 for COMPOSITION_RELATIVE, ESC 3 for
        COMPOSITION_WITH_ALTCHARS, ESC 4 for any other method.  Unless
        the method is COMPOSITION_RELATIVE, CODING is also set up so
        that the components of the composition are read from the 5th
        element of the composition data on, with no composition rule
        expected first.  */
2522
2523 #define ENCODE_COMPOSITION_START(coding, data)                          \
2524   do {                                                                  \
2525     coding->composing = data[3];                                        \
2526     *dst++ = ISO_CODE_ESC;                                              \
2527     if (coding->composing != COMPOSITION_RELATIVE)                      \
2528       {                                                                 \
2529         coding->cmp_data_index = coding->cmp_data_start + 4;            \
2530         coding->composition_rule_follows = 0;                           \
2531         *dst++ = (coding->composing != COMPOSITION_WITH_ALTCHARS        \
2532                   ? '4' : '3');                                         \
2533       }                                                                 \
2534     else                                                                \
2535       *dst++ = '0';                                                     \
2536   } while (0)
2537
2538 /* End the current composition: produce ESC 1 at `dst', mark CODING
        as not composing, and advance the start of the composition data
        by DATA[0].  If that exhausts the current block of composition
        data and another block follows, switch to the start of the next
        block.  */
2539
2540 #define ENCODE_COMPOSITION_END(coding, data)                    \
2541   do {                                                          \
2542     coding->composing = COMPOSITION_NO;                         \
2543     coding->cmp_data_start += data[0];                          \
2544     *dst++ = ISO_CODE_ESC;                                      \
2545     *dst++ = '1';                                               \
2546     if (coding->cmp_data_start == coding->cmp_data->used)       \
2547       if (coding->cmp_data->next)                               \
2548         {                                                       \
2549           coding->cmp_data = coding->cmp_data->next;            \
2550           coding->cmp_data_start = 0;                           \
2551         }                                                       \
2552   } while (0)
2553
2554 /* Produce the composition start sequence ESC 0 at `dst' and mark
2555    CODING as composing by COMPOSITION_RELATIVE.  Here, the sequence
2556    does not start a new composition: it tells that the components
2557    (alternate chars and composition rules) of the composition have
        just been produced and that the actual text follows in SRC.  */
2558
2559 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2560   do {                                          \
2561     coding->composing = COMPOSITION_RELATIVE;   \
2562     *dst++ = ISO_CODE_ESC;                      \
2563     *dst++ = '0';                               \
2564   } while (0)
2565
2566 /* The following three macros produce codes for indicating direction
2567    of text.  They store bytes through the variable `dst' and read the
        variable `coding', both taken from the place of expansion, and
        each of them expands to a single statement.

        ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC '[' if
        CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags', and
        ISO_CODE_CSI otherwise.  The flag is tested as a bit, because
        other bits of `coding->flags' may be set at the same time.

        ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R produce that
        introducer followed by the two bytes "2]" and "0]"
        respectively.  */
2568 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2569   do {                                                  \
2570     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2571       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
2572     else                                                \
2573       *dst++ = ISO_CODE_CSI;                            \
2574   } while (0)
2575
2576 #define ENCODE_DIRECTION_R2L                    \
2577   do {                                          \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
         *dst++ = '2', *dst++ = ']';                 \
       } while (0)
2578
2579 #define ENCODE_DIRECTION_L2R                    \
2580   do {                                          \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
         *dst++ = '0', *dst++ = ']';                 \
       } while (0)
2581
2582 /* Bring the graphic planes and registers back to their initial
2583    state: produce a shift-in unless graphic register 0 is invoked to
        graphic plane 0, then, for each graphic register that has an
        initial designation (a non-negative charset) different from its
        current one, produce the sequence designating the initial
        charset to it.  */
2584 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2585   do {                                                                      \
2586     int reg;                                                                \
2587     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2588       ENCODE_SHIFT_IN;                                                      \
2589     for (reg = 0; reg < 4; reg++)                                           \
2590       {                                                                     \
2591         int init = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);       \
2592         if (init >= 0 && init != CODING_SPEC_ISO_DESIGNATION (coding, reg)) \
2593           ENCODE_DESIGNATION (init, reg, coding);                           \
2594       }                                                                     \
2595   } while (0)
2596
2597 /* Produce at DST designation sequences for the charsets in the line
2598    started from SRC that request a specific graphic register, and
2599    return the updated DST.  If the current block ends before any
2600    end-of-line, we may fail to find all the necessary designations.  */
2601
2602 static unsigned char *
2603 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2604      struct coding_system *coding;
2605      Lisp_Object translation_table;
2606      const unsigned char *src, *src_end;
2607      unsigned char *dst;
2608 {
2609   /* WANTED[REG] is the charset to designate to graphic register REG,
2610      or -1 while no character of the line has claimed REG.  */
2611   int wanted[4];
2612   int charset, c, reg, n_found = 0;
2613
2614   wanted[0] = wanted[1] = wanted[2] = wanted[3] = -1;
2615
2616   /* Stop at the end of line, or once all four registers are claimed.
2617      (Presumably ONE_MORE_CHAR jumps to label_end_of_loop when the
2618      source text is exhausted; the macro is defined elsewhere.)  */
2619   while (n_found < 4)
2620     {
2621       ONE_MORE_CHAR (c);
2622       if (c == '\n')
2623         break;
2624
2625       charset = CHAR_CHARSET (c);
2626       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2627       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION
2628           || wanted[reg] >= 0)
2629         continue;
2630       wanted[reg] = charset;
2631       n_found++;
2632     }
2633
2634  label_end_of_loop:
2635   /* Skip the charsets that are already designated as wanted.  */
2636   for (reg = 0; n_found && reg < 4; reg++)
2637     if (wanted[reg] >= 0
2638         && CODING_SPEC_ISO_DESIGNATION (coding, reg) != wanted[reg])
2639       ENCODE_DESIGNATION (wanted[reg], reg, coding);
2640
2641   return dst;
2642 }
2643
2644 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2645
2646 static void
2647 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2648      struct coding_system *coding;
2649      const unsigned char *source;
2650      unsigned char *destination;
2651      int src_bytes, dst_bytes;
2652 {
2653   const unsigned char *src = source;
2654   const unsigned char *src_end = source + src_bytes;
2655   unsigned char *dst = destination;
2656   unsigned char *dst_end = destination + dst_bytes;
2657   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2658      from DST_END to assure overflow checking is necessary only at the
2659      head of loop.  */
2660   unsigned char *adjusted_dst_end = dst_end - 19;
2661   /* SRC_BASE remembers the start position in source in each loop.
2662      The loop will be exited when there's not enough source text to
2663      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2664      there's not enough destination area to produce encoded codes
2665      (within macro EMIT_BYTES).  */
2666   const unsigned char *src_base;
2667   int c;
2668   Lisp_Object translation_table;
2669   Lisp_Object safe_chars;
2670
2671   if (coding->flags & CODING_FLAG_ISO_SAFE)
2672     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2673
2674   safe_chars = coding_safe_chars (coding->symbol);
2675
2676   if (NILP (Venable_character_translation))
2677     translation_table = Qnil;
2678   else
2679     {
2680       translation_table = coding->translation_table_for_encode;
2681       if (NILP (translation_table))
2682         translation_table = Vstandard_translation_table_for_encode;
2683     }
2684
2685   coding->consumed_char = 0;
2686   coding->errors = 0;
2687   while (1)
2688     {
2689       src_base = src;
2690
2691       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2692         {
2693           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2694           break;
2695         }
2696
2697       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2698           && CODING_SPEC_ISO_BOL (coding))
2699         {
2700           /* We have to produce designation sequences if any now.  */
2701           dst = encode_designation_at_bol (coding, translation_table,
2702                                            src, src_end, dst);
2703           CODING_SPEC_ISO_BOL (coding) = 0;
2704         }
2705
2706       /* Check composition start and end.  */
2707       if (coding->composing != COMPOSITION_DISABLED
2708           && coding->cmp_data_start < coding->cmp_data->used)
2709         {
2710           struct composition_data *cmp_data = coding->cmp_data;
2711           int *data = cmp_data->data + coding->cmp_data_start;
2712           int this_pos = cmp_data->char_offset + coding->consumed_char;
2713
2714           if (coding->composing == COMPOSITION_RELATIVE)
2715             {
2716               if (this_pos == data[2])
2717                 {
2718                   ENCODE_COMPOSITION_END (coding, data);
2719                   cmp_data = coding->cmp_data;
2720                   data = cmp_data->data + coding->cmp_data_start;
2721                 }
2722             }
2723           else if (COMPOSING_P (coding))
2724             {
2725               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2726               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2727                 /* We have consumed components of the composition.
2728                    What follows in SRC is the composition's base
2729                    text.  */
2730                 ENCODE_COMPOSITION_FAKE_START (coding);
2731               else
2732                 {
2733                   int c = cmp_data->data[coding->cmp_data_index++];
2734                   if (coding->composition_rule_follows)
2735                     {
2736                       ENCODE_COMPOSITION_RULE (c);
2737                       coding->composition_rule_follows = 0;
2738                     }
2739                   else
2740                     {
2741                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2742                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2743                         ENCODE_UNSAFE_CHARACTER (c);
2744                       else
2745                         ENCODE_ISO_CHARACTER (c);
2746                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2747                         coding->composition_rule_follows = 1;
2748                     }
2749                   continue;
2750                 }
2751             }
2752           if (!COMPOSING_P (coding))
2753             {
2754               if (this_pos == data[1])
2755                 {
2756                   ENCODE_COMPOSITION_START (coding, data);
2757                   continue;
2758                 }
2759             }
2760         }
2761
2762       ONE_MORE_CHAR (c);
2763
2764       /* Now encode the character C.  */
2765       if (c < 0x20 || c == 0x7F)
2766         {
2767           if (c == '\r')
2768             {
2769               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2770                 {
2771                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2772                     ENCODE_RESET_PLANE_AND_REGISTER;
2773                   *dst++ = c;
2774                   continue;
2775                 }
2776               /* fall down to treat '\r' as '\n' ...  */
2777               c = '\n';
2778             }
2779           if (c == '\n')
2780             {
2781               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2782                 ENCODE_RESET_PLANE_AND_REGISTER;
2783               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2784                 bcopy (coding->spec.iso2022.initial_designation,
2785                        coding->spec.iso2022.current_designation,
2786                        sizeof coding->spec.iso2022.initial_designation);
2787               if (coding->eol_type == CODING_EOL_LF
2788                   || coding->eol_type == CODING_EOL_UNDECIDED)
2789                 *dst++ = ISO_CODE_LF;
2790               else if (coding->eol_type == CODING_EOL_CRLF)
2791                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2792               else
2793                 *dst++ = ISO_CODE_CR;
2794               CODING_SPEC_ISO_BOL (coding) = 1;
2795             }
2796           else
2797             {
2798               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2799                 ENCODE_RESET_PLANE_AND_REGISTER;
2800               *dst++ = c;
2801             }
2802         }
2803       else if (ASCII_BYTE_P (c))
2804         ENCODE_ISO_CHARACTER (c);
2805       else if (SINGLE_BYTE_CHAR_P (c))
2806         {
2807           *dst++ = c;
2808           coding->errors++;
2809         }
2810       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2811                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2812         ENCODE_UNSAFE_CHARACTER (c);
2813       else
2814         ENCODE_ISO_CHARACTER (c);
2815
2816       coding->consumed_char++;
2817     }
2818
2819  label_end_of_loop:
2820   coding->consumed = src_base - source;
2821   coding->produced = coding->produced_char = dst - destination;
2822 }
2823
2824 \f
2825 /*** 4. SJIS and BIG5 handlers ***/
2826
2827 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2828    quite widely.  So, for the moment, Emacs supports them in the bare
2829    C code.  But, in the future, they may be supported only by CCL.  */
2830
2831 /* SJIS is a coding system encoding three character sets: ASCII, right
2832    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2833    as is.  A character of charset katakana-jisx0201 is encoded by
2834    "position-code + 0x80".  A character of charset japanese-jisx0208
2835    is encoded in 2-byte but two position-codes are divided and shifted
2836    so that it fits in the range below.
2837
2838    --- CODE RANGE of SJIS ---
2839    (character set)      (range)
2840    ASCII                0x00 .. 0x7F
2841    KATAKANA-JISX0201    0xA1 .. 0xDF
2842    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2843             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2844    -------------------------------
2845
2846 */
2847
2848 /* BIG5 is a coding system encoding two character sets: ASCII and
2849    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2850    character set and is encoded in two bytes.
2851
2852    --- CODE RANGE of BIG5 ---
2853    (character set)      (range)
2854    ASCII                0x00 .. 0x7F
2855    Big5 (1st byte)      0xA1 .. 0xFE
2856         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2857    --------------------------
2858
2859    Since the number of characters in Big5 is larger than maximum
2860    characters in Emacs' charset (96x96), it can't be handled as one
2861    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2862    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2863    contains frequently used characters and the latter contains less
2864    frequently used characters.  */
2865
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros first fold their input position-codes into a linear
   index TEMP and read the inputs only before the first output
   position-code is stored, so an output argument may name the same
   variable as an input argument.  Input arguments are evaluated more
   than once, so they must be free of side effects.  Every argument
   is parenthesized in the expansion so that a compound expression
   such as `x & 0xFF' can be passed safely.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 bytes B1 and B2.  Codes whose
   first byte is below 0xC9 go to `charset_big5_1', the others to
   `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Set the BIG5 bytes B1 and B2 from CHARSET, C1 and C2.  This is the
   inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2898
2899 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2900    Check if a text is encoded in SJIS.  If it is, return
2901    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2902
2903 static int
2904 detect_coding_sjis (src, src_end, multibytep)
2905      unsigned char *src, *src_end;
2906      int multibytep;
2907 {
2908   int c;
2909   /* Dummy for ONE_MORE_BYTE.  */
2910   struct coding_system dummy_coding;
2911   struct coding_system *coding = &dummy_coding;
2912
2913   while (1)
2914     {
2915       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2916       if (c < 0x80)
2917         continue;
2918       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2919         return 0;
2920       if (c <= 0x9F || c >= 0xE0)
2921         {
2922           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2923           if (c < 0x40 || c == 0x7F || c > 0xFC)
2924             return 0;
2925         }
2926     }
2927  label_end_of_loop:
2928   return CODING_CATEGORY_MASK_SJIS;
2929 }
2930
2931 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2932    Check if a text is encoded in BIG5.  If it is, return
2933    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2934
2935 static int
2936 detect_coding_big5 (src, src_end, multibytep)
2937      unsigned char *src, *src_end;
2938      int multibytep;
2939 {
2940   int c;
2941   /* Dummy for ONE_MORE_BYTE.  */
2942   struct coding_system dummy_coding;
2943   struct coding_system *coding = &dummy_coding;
2944
2945   while (1)
2946     {
2947       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2948       if (c < 0x80)
2949         continue;
2950       if (c < 0xA1 || c > 0xFE)
2951         return 0;
2952       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2953       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2954         return 0;
2955     }
2956  label_end_of_loop:
2957   return CODING_CATEGORY_MASK_BIG5;
2958 }
2959
2960 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2961    Check if a text is encoded in UTF-8.  If it is, return
2962    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2963
/* Predicates classifying one octet C of a UTF-8 sequence by its
   high-order bits.  UTF_8_OCTET_BITS_P is nonzero if the bits of C
   selected by MASK are equal to BITS.  */
#define UTF_8_OCTET_BITS_P(c, mask, bits) (((c) & (mask)) == (bits))

/* 0xxxxxxx: a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_BITS_P ((c), 0xC0, 0x80)
/* 110xxxxx ... 1111110x: the leading octet of a sequence of two to
   six octets.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P ((c), 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P ((c), 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P ((c), 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P ((c), 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P ((c), 0xFE, 0xFC)
2971
2972 static int
2973 detect_coding_utf_8 (src, src_end, multibytep)
2974      unsigned char *src, *src_end;
2975      int multibytep;
2976 {
2977   unsigned char c;
2978   int seq_maybe_bytes;
2979   /* Dummy for ONE_MORE_BYTE.  */
2980   struct coding_system dummy_coding;
2981   struct coding_system *coding = &dummy_coding;
2982
2983   while (1)
2984     {
2985       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2986       if (UTF_8_1_OCTET_P (c))
2987         continue;
2988       else if (UTF_8_2_OCTET_LEADING_P (c))
2989         seq_maybe_bytes = 1;
2990       else if (UTF_8_3_OCTET_LEADING_P (c))
2991         seq_maybe_bytes = 2;
2992       else if (UTF_8_4_OCTET_LEADING_P (c))
2993         seq_maybe_bytes = 3;
2994       else if (UTF_8_5_OCTET_LEADING_P (c))
2995         seq_maybe_bytes = 4;
2996       else if (UTF_8_6_OCTET_LEADING_P (c))
2997         seq_maybe_bytes = 5;
2998       else
2999         return 0;
3000
3001       do
3002         {
3003           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3004           if (!UTF_8_EXTRA_OCTET_P (c))
3005             return 0;
3006           seq_maybe_bytes--;
3007         }
3008       while (seq_maybe_bytes > 0);
3009     }
3010
3011  label_end_of_loop:
3012   return CODING_CATEGORY_MASK_UTF_8;
3013 }
3014
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking at its first two
   bytes.  If they are the byte order mark FE FF, return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are FF FE, return
   CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
3020
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit value VAL is a high surrogate, i.e. lies in
   0xD800..0xDBFF.  The six high-order bits tell the two surrogate
   ranges apart, so all six of them have to be compared.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL is a low surrogate, i.e. lies in
   0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3030
3031 static int
3032 detect_coding_utf_16 (src, src_end, multibytep)
3033      unsigned char *src, *src_end;
3034      int multibytep;
3035 {
3036   unsigned char c1, c2;
3037   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3038   struct coding_system dummy_coding;
3039   struct coding_system *coding = &dummy_coding;
3040
3041   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3042   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3043
3044   if ((c1 == 0xFF) && (c2 == 0xFE))
3045     return CODING_CATEGORY_MASK_UTF_16_LE;
3046   else if ((c1 == 0xFE) && (c2 == 0xFF))
3047     return CODING_CATEGORY_MASK_UTF_16_BE;
3048
3049  label_end_of_loop:
3050   return 0;
3051 }
3052
3053 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3054    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3055
3056 static void
3057 decode_coding_sjis_big5 (coding, source, destination,
3058                          src_bytes, dst_bytes, sjis_p)
3059      struct coding_system *coding;
3060      const unsigned char *source;
3061      unsigned char  *destination;
3062      int src_bytes, dst_bytes;
3063      int sjis_p;
3064 {
3065   const unsigned char *src = source;
3066   const unsigned char *src_end = source + src_bytes;
3067   unsigned char *dst = destination;
3068   unsigned char *dst_end = destination + dst_bytes;
3069   /* SRC_BASE remembers the start position in source in each loop.
3070      The loop will be exited when there's not enough source code
3071      (within macro ONE_MORE_BYTE), or when there's not enough
3072      destination area to produce a character (within macro
3073      EMIT_CHAR).  */
3074   const unsigned char *src_base;
3075   Lisp_Object translation_table;
3076
3077   if (NILP (Venable_character_translation))
3078     translation_table = Qnil;
3079   else
3080     {
3081       translation_table = coding->translation_table_for_decode;
3082       if (NILP (translation_table))
3083         translation_table = Vstandard_translation_table_for_decode;
3084     }
3085
3086   coding->produced_char = 0;
3087   while (1)
3088     {
3089       int c, charset, c1, c2 = 0;
3090
3091       src_base = src;
3092       ONE_MORE_BYTE (c1);
3093
3094       if (c1 < 0x80)
3095         {
3096           charset = CHARSET_ASCII;
3097           if (c1 < 0x20)
3098             {
3099               if (c1 == '\r')
3100                 {
3101                   if (coding->eol_type == CODING_EOL_CRLF)
3102                     {
3103                       ONE_MORE_BYTE (c2);
3104                       if (c2 == '\n')
3105                         c1 = c2;
3106                       else
3107                         /* To process C2 again, SRC is subtracted by 1.  */
3108                         src--;
3109                     }
3110                   else if (coding->eol_type == CODING_EOL_CR)
3111                     c1 = '\n';
3112                 }
3113               else if (c1 == '\n'
3114                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3115                        && (coding->eol_type == CODING_EOL_CR
3116                            || coding->eol_type == CODING_EOL_CRLF))
3117                 {
3118                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3119                   goto label_end_of_loop;
3120                 }
3121             }
3122         }
3123       else
3124         {
3125           if (sjis_p)
3126             {
3127               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3128                 goto label_invalid_code;
3129               if (c1 <= 0x9F || c1 >= 0xE0)
3130                 {
3131                   /* SJIS -> JISX0208 */
3132                   ONE_MORE_BYTE (c2);
3133                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3134                     goto label_invalid_code;
3135                   DECODE_SJIS (c1, c2, c1, c2);
3136                   charset = charset_jisx0208;
3137                 }
3138               else
3139                 /* SJIS -> JISX0201-Kana */
3140                 charset = charset_katakana_jisx0201;
3141             }
3142           else
3143             {
3144               /* BIG5 -> Big5 */
3145               if (c1 < 0xA0 || c1 > 0xFE)
3146                 goto label_invalid_code;
3147               ONE_MORE_BYTE (c2);
3148               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3149                 goto label_invalid_code;
3150               DECODE_BIG5 (c1, c2, charset, c1, c2);
3151             }
3152         }
3153
3154       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3155       EMIT_CHAR (c);
3156       continue;
3157
3158     label_invalid_code:
3159       coding->errors++;
3160       src = src_base;
3161       c = *src++;
3162       EMIT_CHAR (c);
3163     }
3164
3165  label_end_of_loop:
3166   coding->consumed = coding->consumed_char = src_base - source;
3167   coding->produced = dst - destination;
3168   return;
3169 }
3170
3171 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3172    This function can encode charsets `ascii', `katakana-jisx0201',
3173    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3174    are sure that all these charsets are registered as official charset
3175    (i.e. do not have extended leading-codes).  Characters of other
3176    charsets are produced without any encoding.  If SJIS_P is 1, encode
3177    SJIS text, else encode BIG5 text.  */
3178
3179 static void
3180 encode_coding_sjis_big5 (coding, source, destination,
3181                          src_bytes, dst_bytes, sjis_p)
3182      struct coding_system *coding;
3183      unsigned char *source, *destination;
3184      int src_bytes, dst_bytes;
3185      int sjis_p;
3186 {
3187   unsigned char *src = source;
3188   unsigned char *src_end = source + src_bytes;
3189   unsigned char *dst = destination;
3190   unsigned char *dst_end = destination + dst_bytes;
3191   /* SRC_BASE remembers the start position in source in each loop.
3192      The loop will be exited when there's not enough source text to
3193      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3194      there's not enough destination area to produce encoded codes
3195      (within macro EMIT_BYTES).  */
3196   unsigned char *src_base;
3197   Lisp_Object translation_table;
3198
3199   if (NILP (Venable_character_translation))
3200     translation_table = Qnil;
3201   else
3202     {
3203       translation_table = coding->translation_table_for_encode;
3204       if (NILP (translation_table))
3205         translation_table = Vstandard_translation_table_for_encode;
3206     }
3207
3208   while (1)
3209     {
3210       int c, charset, c1, c2;
3211
3212       src_base = src;
3213       ONE_MORE_CHAR (c);
3214
3215       /* Now encode the character C.  */
3216       if (SINGLE_BYTE_CHAR_P (c))
3217         {
3218           switch (c)
3219             {
3220             case '\r':
3221               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3222                 {
3223                   EMIT_ONE_BYTE (c);
3224                   break;
3225                 }
3226               c = '\n';
3227             case '\n':
3228               if (coding->eol_type == CODING_EOL_CRLF)
3229                 {
3230                   EMIT_TWO_BYTES ('\r', c);
3231                   break;
3232                 }
3233               else if (coding->eol_type == CODING_EOL_CR)
3234                 c = '\r';
3235             default:
3236               EMIT_ONE_BYTE (c);
3237             }
3238         }
3239       else
3240         {
3241           SPLIT_CHAR (c, charset, c1, c2);
3242           if (sjis_p)
3243             {
3244               if (charset == charset_jisx0208
3245                   || charset == charset_jisx0208_1978)
3246                 {
3247                   ENCODE_SJIS (c1, c2, c1, c2);
3248                   EMIT_TWO_BYTES (c1, c2);
3249                 }
3250               else if (charset == charset_katakana_jisx0201)
3251                 EMIT_ONE_BYTE (c1 | 0x80);
3252               else if (charset == charset_latin_jisx0201)
3253                 EMIT_ONE_BYTE (c1);
3254               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3255                 {
3256                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3257                   if (CHARSET_WIDTH (charset) > 1)
3258                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3259                 }
3260               else
3261                 /* There's no way other than producing the internal
3262                    codes as is.  */
3263                 EMIT_BYTES (src_base, src);
3264             }
3265           else
3266             {
3267               if (charset == charset_big5_1 || charset == charset_big5_2)
3268                 {
3269                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3270                   EMIT_TWO_BYTES (c1, c2);
3271                 }
3272               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3273                 {
3274                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3275                   if (CHARSET_WIDTH (charset) > 1)
3276                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3277                 }
3278               else
3279                 /* There's no way other than producing the internal
3280                    codes as is.  */
3281                 EMIT_BYTES (src_base, src);
3282             }
3283         }
3284       coding->consumed_char++;
3285     }
3286
3287  label_end_of_loop:
3288   coding->consumed = src_base - source;
3289   coding->produced = coding->produced_char = dst - destination;
3290 }
3291
3292 \f
3293 /*** 5. CCL handlers ***/
3294
3295 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3296    Check if a text is encoded in a coding system of which
3297    encoder/decoder are written in CCL program.  If it is, return
3298    CODING_CATEGORY_MASK_CCL, else return 0.  */
3299
3300 static int
3301 detect_coding_ccl (src, src_end, multibytep)
3302      unsigned char *src, *src_end;
3303      int multibytep;
3304 {
3305   unsigned char *valid;
3306   int c;
3307   /* Dummy for ONE_MORE_BYTE.  */
3308   struct coding_system dummy_coding;
3309   struct coding_system *coding = &dummy_coding;
3310
3311   /* No coding system is assigned to coding-category-ccl.  */
3312   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3313     return 0;
3314
3315   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3316   while (1)
3317     {
3318       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3319       if (! valid[c])
3320         return 0;
3321     }
3322  label_end_of_loop:
3323   return CODING_CATEGORY_MASK_CCL;
3324 }
3325
3326 \f
3327 /*** 6. End-of-line handlers ***/
3328
3329 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3330
3331 static void
3332 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3333      struct coding_system *coding;
3334      const unsigned char *source;
3335      unsigned char *destination;
3336      int src_bytes, dst_bytes;
3337 {
3338   const unsigned char *src = source;
3339   unsigned char *dst = destination;
3340   const unsigned char *src_end = src + src_bytes;
3341   unsigned char *dst_end = dst + dst_bytes;
3342   Lisp_Object translation_table;
3343   /* SRC_BASE remembers the start position in source in each loop.
3344      The loop will be exited when there's not enough source code
3345      (within macro ONE_MORE_BYTE), or when there's not enough
3346      destination area to produce a character (within macro
3347      EMIT_CHAR).  */
3348   const unsigned char *src_base;
3349   int c;
3350
3351   translation_table = Qnil;
3352   switch (coding->eol_type)
3353     {
3354     case CODING_EOL_CRLF:
3355       while (1)
3356         {
3357           src_base = src;
3358           ONE_MORE_BYTE (c);
3359           if (c == '\r')
3360             {
3361               ONE_MORE_BYTE (c);
3362               if (c != '\n')
3363                 {
3364                   src--;
3365                   c = '\r';
3366                 }
3367             }
3368           else if (c == '\n'
3369                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3370             {
3371               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3372               goto label_end_of_loop;
3373             }
3374           EMIT_CHAR (c);
3375         }
3376       break;
3377
3378     case CODING_EOL_CR:
3379       while (1)
3380         {
3381           src_base = src;
3382           ONE_MORE_BYTE (c);
3383           if (c == '\n')
3384             {
3385               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3386                 {
3387                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3388                   goto label_end_of_loop;
3389                 }
3390             }
3391           else if (c == '\r')
3392             c = '\n';
3393           EMIT_CHAR (c);
3394         }
3395       break;
3396
3397     default:                    /* no need for EOL handling */
3398       while (1)
3399         {
3400           src_base = src;
3401           ONE_MORE_BYTE (c);
3402           EMIT_CHAR (c);
3403         }
3404     }
3405
3406  label_end_of_loop:
3407   coding->consumed = coding->consumed_char = src_base - source;
3408   coding->produced = dst - destination;
3409   return;
3410 }
3411
3412 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3413    format of end-of-line according to `coding->eol_type'.  It also
3414    convert multibyte form 8-bit characters to unibyte if
3415    CODING->src_multibyte is nonzero.  If `coding->mode &
3416    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3417    also means end-of-line.  */
3418
3419 static void
3420 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3421      struct coding_system *coding;
3422      const unsigned char *source;
3423      unsigned char *destination;
3424      int src_bytes, dst_bytes;
3425 {
3426   const unsigned char *src = source;
3427   unsigned char *dst = destination;
3428   const unsigned char *src_end = src + src_bytes;
3429   unsigned char *dst_end = dst + dst_bytes;
3430   Lisp_Object translation_table;
3431   /* SRC_BASE remembers the start position in source in each loop.
3432      The loop will be exited when there's not enough source text to
3433      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3434      there's not enough destination area to produce encoded codes
3435      (within macro EMIT_BYTES).  */
3436   const unsigned char *src_base;
3437   unsigned char *tmp;
3438   int c;
3439   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3440
3441   translation_table = Qnil;
3442   if (coding->src_multibyte
3443       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3444     {
3445       src_end--;
3446       src_bytes--;
3447       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3448     }
3449
3450   if (coding->eol_type == CODING_EOL_CRLF)
3451     {
3452       while (src < src_end)
3453         {
3454           src_base = src;
3455           c = *src++;
3456           if (c >= 0x20)
3457             EMIT_ONE_BYTE (c);
3458           else if (c == '\n' || (c == '\r' && selective_display))
3459             EMIT_TWO_BYTES ('\r', '\n');
3460           else
3461             EMIT_ONE_BYTE (c);
3462         }
3463       src_base = src;
3464     label_end_of_loop:
3465       ;
3466     }
3467   else
3468     {
3469       if (!dst_bytes || src_bytes <= dst_bytes)
3470         {
3471           safe_bcopy (src, dst, src_bytes);
3472           src_base = src_end;
3473           dst += src_bytes;
3474         }
3475       else
3476         {
3477           if (coding->src_multibyte
3478               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3479             dst_bytes--;
3480           safe_bcopy (src, dst, dst_bytes);
3481           src_base = src + dst_bytes;
3482           dst = destination + dst_bytes;
3483           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3484         }
3485       if (coding->eol_type == CODING_EOL_CR)
3486         {
3487           for (tmp = destination; tmp < dst; tmp++)
3488             if (*tmp == '\n') *tmp = '\r';
3489         }
3490       else if (selective_display)
3491         {
3492           for (tmp = destination; tmp < dst; tmp++)
3493             if (*tmp == '\r') *tmp = '\n';
3494         }
3495     }
3496   if (coding->src_multibyte)
3497     dst = destination + str_as_unibyte (destination, dst - destination);
3498
3499   coding->consumed = src_base - source;
3500   coding->produced = dst - destination;
3501   coding->produced_char = coding->produced;
3502 }
3503
3504 \f
3505 /*** 7. C library functions ***/
3506
3507 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3508    has a property `coding-system'.  The value of this property is a
3509    vector of length 5 (called the coding-vector).  Among elements of
3510    this vector, the first (element[0]) and the fifth (element[4])
3511    carry important information for decoding/encoding.  Before
3512    decoding/encoding, this information should be set in fields of a
3513    structure of type `coding_system'.
3514
3515    The value of the property `coding-system' can be a symbol of another
3516    subsidiary coding-system.  In that case, Emacs gets coding-vector
3517    from that symbol.
3518
3519    `element[0]' contains information to be set in `coding->type'.  The
3520    value and its meaning is as follows:
3521
3522    0 -- coding_type_emacs_mule
3523    1 -- coding_type_sjis
3524    2 -- coding_type_iso2022
3525    3 -- coding_type_big5
3526    4 -- coding_type_ccl encoder/decoder written in CCL
3527    nil -- coding_type_no_conversion
3528    t -- coding_type_undecided (automatic conversion on decoding,
3529                                no-conversion on encoding)
3530
3531    `element[4]' contains information to be set in `coding->flags' and
3532    `coding->spec'.  The meaning varies by `coding->type'.
3533
3534    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3535    of length 32 (of which the first 13 sub-elements are used now).
3536    Meanings of these sub-elements are:
3537
3538    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3539         If the value is an integer of valid charset, the charset is
3540         assumed to be designated to graphic register N initially.
3541
        If the value is negative, its absolute value is a charset
        which reserves graphic register N, which means that the
        charset is not designated initially but should be designated
        to graphic register N just before encoding a character in
        that charset.
3546
3547         If the value is nil, graphic register N is never used on
3548         encoding.
3549
3550    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3551         Each value takes t or nil.  See the section ISO2022 of
3552         `coding.h' for more information.
3553
3554    If `coding->type' is `coding_type_big5', element[4] is t to denote
3555    BIG5-ETen or nil to denote BIG5-HKU.
3556
3557    If `coding->type' takes the other value, element[4] is ignored.
3558
3559    Emacs Lisp's coding systems also carry information about format of
3560    end-of-line in a value of property `eol-type'.  If the value is
3561    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3562    means CODING_EOL_CR.  If it is not integer, it should be a vector
3563    of subsidiary coding systems of which property `eol-type' has one
3564    of the above values.
3565
3566 */
3567
3568 /* Extract information for decoding/encoding from the coding system
3569    symbol CODING_SYSTEM and set it in CODING.  CODING is zero-cleared
3570    first, so whatever it held before the call is discarded.
3571
        Return 0 on success.  If CODING_SYSTEM is nil, CODING is set up
        for no conversion with system_eol_type as its end-of-line format
        and 0 is returned.  If CODING_SYSTEM is invalid in any other
        way, CODING is set up for no conversion with CODING_EOL_LF and
        -1 is returned.  */
3572
3573 int
3574 setup_coding_system (coding_system, coding)
3575      Lisp_Object coding_system;
3576      struct coding_system *coding;
3577 {
3578   Lisp_Object coding_spec, coding_type, eol_type, plist;
3579   Lisp_Object val;
3580
3581   /* At first, zero clear all members.  */
3582   bzero (coding, sizeof (struct coding_system));
3583
3584   /* Initialize some fields required for all kinds of coding systems.  */
3585   coding->symbol = coding_system;
3586   coding->heading_ascii = -1;
3587   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3588   coding->composing = COMPOSITION_DISABLED;
3589   coding->cmp_data = NULL;
3590
3591   if (NILP (coding_system))
3592     goto label_invalid_coding_system;
3593
3594   coding_spec = Fget (coding_system, Qcoding_system);
3595
       /* The `coding-system' property must be a vector of exactly five
          elements whose element[3] (the property list) is a cons.  */
3596   if (!VECTORP (coding_spec)
3597       || XVECTOR (coding_spec)->size != 5
3598       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3599     goto label_invalid_coding_system;
3600
       /* Decide the end-of-line format.  A vector-valued `eol-type' means
          the format is still undecided; 1 selects CRLF, 2 selects CR and
          anything else (including the nil forced by
          inhibit_eol_conversion) selects LF.
          NOTE(review): XFASTINT is applied to eol_type without an
          INTEGERP check -- confirm that a non-integer value can never be
          read as 1 or 2.  */
3601   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3602   if (VECTORP (eol_type))
3603     {
3604       coding->eol_type = CODING_EOL_UNDECIDED;
3605       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3606       if (system_eol_type != CODING_EOL_LF)
3607         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3608     }
3609   else if (XFASTINT (eol_type) == 1)
3610     {
3611       coding->eol_type = CODING_EOL_CRLF;
3612       coding->common_flags
3613         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3614     }
3615   else if (XFASTINT (eol_type) == 2)
3616     {
3617       coding->eol_type = CODING_EOL_CR;
3618       coding->common_flags
3619         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3620     }
3621   else
3622     coding->eol_type = CODING_EOL_LF;
3623
3624   coding_type = XVECTOR (coding_spec)->contents[0];
3625   /* Try short cut: when element[0] is a symbol, t means undecided and
          any other symbol (e.g. nil) means no conversion.  Nothing else
          of the spec is consulted in that case.  */
3626   if (SYMBOLP (coding_type))
3627     {
3628       if (EQ (coding_type, Qt))
3629         {
3630           coding->type = coding_type_undecided;
3631           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3632         }
3633       else
3634         coding->type = coding_type_no_conversion;
3635       /* Initialize this member.  Any thing other than
3636          CODING_CATEGORY_IDX_UTF_16_BE and
3637          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3638          special treatment in detect_eol.  */
3639       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3640
3641       return 0;
3642     }
3643
3644   /* Get values of coding system properties:
3645      `post-read-conversion', `pre-write-conversion',
3646      `translation-table-for-decode', `translation-table-for-encode'.  */
3647   plist = XVECTOR (coding_spec)->contents[3];
3648   /* Pre & post conversion functions should be disabled if
3649      inhibit_pre_post_conversion is nonzero.  This is the case that a
3650      code conversion function is called while those functions are
          running.  */
3651   if (! inhibit_pre_post_conversion)
3652     {
3653       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3654       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3655     }
       /* A translation table may be given directly or as a symbol whose
          property of the same name holds it; anything that does not end
          up being a char-table is treated as nil.  */
3656   val = Fplist_get (plist, Qtranslation_table_for_decode);
3657   if (SYMBOLP (val))
3658     val = Fget (val, Qtranslation_table_for_decode);
3659   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3660   val = Fplist_get (plist, Qtranslation_table_for_encode);
3661   if (SYMBOLP (val))
3662     val = Fget (val, Qtranslation_table_for_encode);
3663   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
       /* The `coding-category' property is mandatory here, and the
          category it names must carry an integer category index;
          otherwise the coding system is rejected.  */
3664   val = Fplist_get (plist, Qcoding_category);
3665   if (!NILP (val))
3666     {
3667       val = Fget (val, Qcoding_category_index);
3668       if (INTEGERP (val))
3669         coding->category_idx = XINT (val);
3670       else
3671         goto label_invalid_coding_system;
3672     }
3673   else
3674     goto label_invalid_coding_system;
3675
3676   /* If the coding system has non-nil `composition' property, enable
3677      composition handling.  */
3678   val = Fplist_get (plist, Qcomposition);
3679   if (!NILP (val))
3680     coding->composing = COMPOSITION_NO;
3681
       /* Dispatch on the numeric coding type stored in element[0].
          NOTE(review): coding_type is known not to be a symbol here, but
          it is not checked with INTEGERP before XFASTINT -- confirm.  */
3682   switch (XFASTINT (coding_type))
3683     {
3684     case 0:
3685       coding->type = coding_type_emacs_mule;
3686       coding->common_flags
3687         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
           /* Both masks were just set above, so the two tests below do
              not change common_flags any further.  */
3688       if (!NILP (coding->post_read_conversion))
3689         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3690       if (!NILP (coding->pre_write_conversion))
3691         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3692       break;
3693
3694     case 1:
3695       coding->type = coding_type_sjis;
3696       coding->common_flags
3697         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3698       break;
3699
3700     case 2:
3701       coding->type = coding_type_iso2022;
3702       coding->common_flags
3703         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3704       {
             /* This VAL shadows the function-level VAL for the rest of
                this block.  */
3705         Lisp_Object val, temp;
3706         Lisp_Object *flags;
3707         int i, charset, reg_bits = 0;
3708
3709         val = XVECTOR (coding_spec)->contents[4];
3710
3711         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3712           goto label_invalid_coding_system;
3713
             /* Each non-nil sub-element 4 through 16 turns on the
                corresponding CODING_FLAG_ISO_* bit.  */
3714         flags = XVECTOR (val)->contents;
3715         coding->flags
3716           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3717              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3718              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3719              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3720              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3721              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3722              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3723              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3724              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3725              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3726              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3727              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3728              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3729              );
3730
3731         /* Invoke graphic register 0 to plane 0.  */
3732         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3733         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3734         CODING_SPEC_ISO_INVOCATION (coding, 1)
3735           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3736         /* Not single shifting at first.  */
3737         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3738         /* Beginning of buffer should also be regarded as bol. */
3739         CODING_SPEC_ISO_BOL (coding) = 1;
3740
             /* Start with revision number 255 for every charset, then
                override it from Vcharset_revision_alist for each entry
                whose car names a known charset and whose cdr is an
                integer I with 0 <= I and I + '@' < 128.  */
3741         for (charset = 0; charset <= MAX_CHARSET; charset++)
3742           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3743         val = Vcharset_revision_alist;
3744         while (CONSP (val))
3745           {
3746             charset = get_charset_id (Fcar_safe (XCAR (val)));
3747             if (charset >= 0
3748                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3749                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3750               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3751             val = XCDR (val);
3752           }
3753
3754         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3755            FLAGS[REG] can be one of below:
3756                 integer CHARSET: CHARSET occupies register I,
3757                 t: designate nothing to REG initially, but can be used
3758                   by any charsets,
3759                 list of integer, nil, or t: designate the first
3760                   element (if integer) to REG initially, the remaining
3761                   elements (if integer) is designated to REG on request,
3762                   if an element is t, REG can be used by any charsets,
3763                 nil: REG is never used.  */
3764         for (charset = 0; charset <= MAX_CHARSET; charset++)
3765           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3766             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
             /* REG_BITS collects, as bit I, every register I that was
                declared usable by any charset (a t entry).  */
3767         for (i = 0; i < 4; i++)
3768           {
3769             if ((INTEGERP (flags[i])
3770                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3771                 || (charset = get_charset_id (flags[i])) >= 0)
3772               {
3773                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3774                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3775               }
3776             else if (EQ (flags[i], Qt))
3777               {
3778                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3779                 reg_bits |= 1 << i;
3780                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3781               }
3782             else if (CONSP (flags[i]))
3783               {
3784                 Lisp_Object tail;
3785                 tail = flags[i];
3786
3787                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
                     /* The head of the list gives the initial
                        designation, if it names a charset.  */
3788                 if ((INTEGERP (XCAR (tail))
3789                      && (charset = XINT (XCAR (tail)),
3790                          CHARSET_VALID_P (charset)))
3791                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3792                   {
3793                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3794                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3795                   }
3796                 else
3797                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
                     /* The remaining elements only request register I for
                        their charset, or (for t) open it to any charset.  */
3798                 tail = XCDR (tail);
3799                 while (CONSP (tail))
3800                   {
3801                     if ((INTEGERP (XCAR (tail))
3802                          && (charset = XINT (XCAR (tail)),
3803                              CHARSET_VALID_P (charset)))
3804                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3805                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3806                         = i;
3807                     else if (EQ (XCAR (tail), Qt))
3808                       reg_bits |= 1 << i;
3809                     tail = XCDR (tail);
3810                   }
3811               }
3812             else
3813               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3814
3815             CODING_SPEC_ISO_DESIGNATION (coding, i)
3816               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3817           }
3818
3819         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3820           {
3821             /* REG 1 can be used only by locking shift in 7-bit env.  */
3822             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3823               reg_bits &= ~2;
3824             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3825               /* Without any shifting, only REG 0 and 1 can be used.  */
3826               reg_bits &= 3;
3827           }
3828
             /* Give every defined charset that still has no requested
                register a default one chosen from REG_BITS.  */
3829         if (reg_bits)
3830           for (charset = 0; charset <= MAX_CHARSET; charset++)
3831             {
3832               if (CHARSET_DEFINED_P (charset)
3833                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3834                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3835                 {
3836                   /* There exist some default graphic registers to be
3837                      used by CHARSET.  */
3838
3839                   /* We had better avoid designating a charset of
3840                      CHARS96 to REG 0 as far as possible.  */
3841                   if (CHARSET_CHARS (charset) == 96)
3842                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3843                       = (reg_bits & 2
3844                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3845                   else
3846                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3847                       = (reg_bits & 1
3848                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3849                 }
3850             }
3851       }
3852       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3853       coding->spec.iso2022.last_invalid_designation_register = -1;
3854       break;
3855
3856     case 3:
3857       coding->type = coding_type_big5;
3858       coding->common_flags
3859         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
           /* A nil element[4] selects the HKU variant, anything else the
              ETEN variant.  */
3860       coding->flags
3861         = (NILP (XVECTOR (coding_spec)->contents[4])
3862            ? CODING_FLAG_BIG5_HKU
3863            : CODING_FLAG_BIG5_ETEN);
3864       break;
3865
3866     case 4:
3867       coding->type = coding_type_ccl;
3868       coding->common_flags
3869         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3870       {
             /* element[4] must be a cons (DECODER . ENCODER) of CCL
                programs, both of which setup_ccl_program must accept.  */
3871         val = XVECTOR (coding_spec)->contents[4];
3872         if (! CONSP (val)
3873             || setup_ccl_program (&(coding->spec.ccl.decoder),
3874                                   XCAR (val)) < 0
3875             || setup_ccl_program (&(coding->spec.ccl.encoder),
3876                                   XCDR (val)) < 0)
3877           goto label_invalid_coding_system;
3878
             /* Build the table of valid byte values from the
                `valid-codes' property.  Each element is either a single
                code in 0..255 or a cons (START . END) giving an inclusive
                range within 0..255; other elements are silently
                ignored.  */
3879         bzero (coding->spec.ccl.valid_codes, 256);
3880         val = Fplist_get (plist, Qvalid_codes);
3881         if (CONSP (val))
3882           {
3883             Lisp_Object this;
3884
3885             for (; CONSP (val); val = XCDR (val))
3886               {
3887                 this = XCAR (val);
3888                 if (INTEGERP (this)
3889                     && XINT (this) >= 0 && XINT (this) < 256)
3890                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3891                 else if (CONSP (this)
3892                          && INTEGERP (XCAR (this))
3893                          && INTEGERP (XCDR (this)))
3894                   {
3895                     int start = XINT (XCAR (this));
3896                     int end = XINT (XCDR (this));
3897
3898                     if (start >= 0 && start <= end && end < 256)
3899                       while (start <= end)
3900                         coding->spec.ccl.valid_codes[start++] = 1;
3901                   }
3902               }
3903           }
3904       }
3905       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3906       coding->spec.ccl.cr_carryover = 0;
3907       coding->spec.ccl.eight_bit_carryover[0] = 0;
3908       break;
3909
3910     case 5:
           /* Raw text adds no flags beyond those chosen for the
              end-of-line format above.  */
3911       coding->type = coding_type_raw_text;
3912       break;
3913
3914     default:
3915       goto label_invalid_coding_system;
3916     }
3917   return 0;
3918
       /* Failure path, also taken for a nil CODING_SYSTEM.  It overrides
          the type, category, flags, end-of-line format and pre/post
          conversion functions; other members assigned before the jump
          keep their values.  */
3919  label_invalid_coding_system:
3920   coding->type = coding_type_no_conversion;
3921   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3922   coding->common_flags = 0;
3923   coding->eol_type = NILP (coding_system) ? system_eol_type : CODING_EOL_LF;
3924   if (coding->eol_type != CODING_EOL_LF)
3925     coding->common_flags
3926       |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3927   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3928   return NILP (coding_system) ? 0 : -1;
3929 }
3930
3931 /* Free memory blocks allocated for storing composition information.  */
3932
3933 void
3934 coding_free_composition_data (coding)
3935      struct coding_system *coding;
3936 {
3937   struct composition_data *cmp_data = coding->cmp_data, *next;
3938
3939   if (!cmp_data)
3940     return;
3941   /* Memory blocks are chained.  At first, rewind to the first, then,
3942      free blocks one by one.  */
3943   while (cmp_data->prev)
3944     cmp_data = cmp_data->prev;
3945   while (cmp_data)
3946     {
3947       next = cmp_data->next;
3948       xfree (cmp_data);
3949       cmp_data = next;
3950     }
3951   coding->cmp_data = NULL;
3952 }
3953
3954 /* Set `char_offset' member of all memory blocks pointed by
3955    coding->cmp_data to POS.  */
3956
3957 void
3958 coding_adjust_composition_offset (coding, pos)
3959      struct coding_system *coding;
3960      int pos;
3961 {
3962   struct composition_data *cmp_data;
3963
3964   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3965     cmp_data->char_offset = pos;
3966 }
3967
3968 /* Setup raw-text or one of its subsidiaries in the structure
3969    coding_system CODING according to the already setup value eol_type
3970    in CODING.  CODING should be setup for some coding system in
3971    advance.  */
3972
3973 void
3974 setup_raw_text_coding_system (coding)
3975      struct coding_system *coding;
3976 {
3977   if (coding->type != coding_type_raw_text)
3978     {
3979       coding->symbol = Qraw_text;
3980       coding->type = coding_type_raw_text;
3981       if (coding->eol_type != CODING_EOL_UNDECIDED)
3982         {
3983           Lisp_Object subsidiaries;
3984           subsidiaries = Fget (Qraw_text, Qeol_type);
3985
3986           if (VECTORP (subsidiaries)
3987               && XVECTOR (subsidiaries)->size == 3)
3988             coding->symbol
3989               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3990         }
3991       setup_coding_system (coding->symbol, coding);
3992     }
3993   return;
3994 }
3995
3996 /* Emacs has a mechanism to automatically detect a coding system if it
3997    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3998    it's impossible to distinguish some coding systems accurately
3999    because they use the same range of codes.  So, at first, coding
4000    systems are classified into the following categories:
4001
4002    o coding-category-emacs-mule
4003
4004         The category for a coding system which has the same code range
4005         as Emacs' internal format.  Assigned the coding-system (Lisp
4006         symbol) `emacs-mule' by default.
4007
4008    o coding-category-sjis
4009
4010         The category for a coding system which has the same code range
4011         as SJIS.  Assigned the coding-system (Lisp
4012         symbol) `japanese-shift-jis' by default.
4013
4014    o coding-category-iso-7
4015
4016         The category for a coding system which has the same code range
4017         as ISO2022 of 7-bit environment.  This doesn't use any locking
4018         shift and single shift functions.  This can encode/decode all
4019         charsets.  Assigned the coding-system (Lisp symbol)
4020         `iso-2022-7bit' by default.
4021
4022    o coding-category-iso-7-tight
4023
4024         Same as coding-category-iso-7 except that this can
4025         encode/decode only the specified charsets.
4026
4027    o coding-category-iso-8-1
4028
4029         The category for a coding system which has the same code range
4030         as ISO2022 of 8-bit environment and graphic plane 1 used only
4031         for DIMENSION1 charset.  This doesn't use any locking shift
4032         and single shift functions.  Assigned the coding-system (Lisp
4033         symbol) `iso-latin-1' by default.
4034
4035    o coding-category-iso-8-2
4036
4037         The category for a coding system which has the same code range
4038         as ISO2022 of 8-bit environment and graphic plane 1 used only
4039         for DIMENSION2 charset.  This doesn't use any locking shift
4040         and single shift functions.  Assigned the coding-system (Lisp
4041         symbol) `japanese-iso-8bit' by default.
4042
4043    o coding-category-iso-7-else
4044
4045         The category for a coding system which has the same code range
4046         as ISO2022 of 7-bit environment but uses locking shift or
4047         single shift functions.  Assigned the coding-system (Lisp
4048         symbol) `iso-2022-7bit-lock' by default.
4049
4050    o coding-category-iso-8-else
4051
4052         The category for a coding system which has the same code range
4053         as ISO2022 of 8-bit environment but uses locking shift or
4054         single shift functions.  Assigned the coding-system (Lisp
4055         symbol) `iso-2022-8bit-ss2' by default.
4056
4057    o coding-category-big5
4058
4059         The category for a coding system which has the same code range
4060         as BIG5.  Assigned the coding-system (Lisp symbol)
4061         `cn-big5' by default.
4062
4063    o coding-category-utf-8
4064
4065         The category for a coding system which has the same code range
4066         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4067         symbol) `utf-8' by default.
4068
4069    o coding-category-utf-16-be
4070
4071         The category for a coding system in which a text has an
4072         Unicode signature (cf. Unicode Standard) in the order of BIG
4073         endian at the head.  Assigned the coding-system (Lisp symbol)
4074         `utf-16-be' by default.
4075
4076    o coding-category-utf-16-le
4077
4078         The category for a coding system in which a text has an
4079         Unicode signature (cf. Unicode Standard) in the order of
4080         LITTLE endian at the head.  Assigned the coding-system (Lisp
4081         symbol) `utf-16-le' by default.
4082
4083    o coding-category-ccl
4084
4085         The category for a coding system of which encoder/decoder is
4086         written in CCL programs.  The default value is nil, i.e., no
4087         coding system is assigned.
4088
4089    o coding-category-binary
4090
4091         The category for a coding system not categorized in any of the
4092         above.  Assigned the coding-system (Lisp symbol)
4093         `no-conversion' by default.
4094
4095    Each of them is a Lisp symbol and the value is an actual
4096    `coding-system' (this is also a Lisp symbol) assigned by a user.
4097    What Emacs does actually is to detect a category of coding system.
4098    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4099    decide a single possible category, it selects a category of the
4100    highest priority.  Priorities of categories are also specified by a
4101    user in a Lisp variable `coding-category-list'.
4102
4103 */
4104
4105 static
4106 int ascii_skip_code[256];
4107
4108 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4109    If it detects possible coding systems, return an integer in which
4110    appropriate flag bits are set.  Flag bits are defined by macros
4111    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4112    it should point the table `coding_priorities'.  In that case, only
4113    the flag bit for a coding system of the highest priority is set in
4114    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4115    range 0x80..0x9F are in multibyte form.
4116
4117    How many ASCII characters are at the head is returned as *SKIP.  */
4118
4119 static int
4120 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4121      unsigned char *source;
4122      int src_bytes, *priorities, *skip;
4123      int multibytep;
4124 {
4125   register unsigned char c;
4126   unsigned char *src = source, *src_end = source + src_bytes;
4127   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4128   int i;
4129
4130   /* At first, skip all ASCII characters and control characters except
4131      for three ISO2022 specific control characters.  */
4132   ascii_skip_code[ISO_CODE_SO] = 0;
4133   ascii_skip_code[ISO_CODE_SI] = 0;
4134   ascii_skip_code[ISO_CODE_ESC] = 0;
4135
4136  label_loop_detect_coding:
4137   while (src < src_end && ascii_skip_code[*src]) src++;
4138   *skip = src - source;
4139
4140   if (src >= src_end)
4141     /* We found nothing other than ASCII.  There's nothing to do.  */
4142     return 0;
4143
4144   c = *src;
4145   /* The text seems to be encoded in some multilingual coding system.
4146      Now, try to find in which coding system the text is encoded.  */
4147   if (c < 0x80)
4148     {
4149       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4150       /* C is an ISO2022 specific control code of C0.  */
4151       mask = detect_coding_iso2022 (src, src_end, multibytep);
4152       if (mask == 0)
4153         {
4154           /* No valid ISO2022 code follows C.  Try again.  */
4155           src++;
4156           if (c == ISO_CODE_ESC)
4157             ascii_skip_code[ISO_CODE_ESC] = 1;
4158           else
4159             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4160           goto label_loop_detect_coding;
4161         }
4162       if (priorities)
4163         {
4164           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4165             {
4166               if (mask & priorities[i])
4167                 return priorities[i];
4168             }
4169           return CODING_CATEGORY_MASK_RAW_TEXT;
4170         }
4171     }
4172   else
4173     {
4174       int try;
4175
4176       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4177         c = src[1] - 0x20;
4178
4179       if (c < 0xA0)
4180         {
4181           /* C is the first byte of SJIS character code,
4182              or a leading-code of Emacs' internal format (emacs-mule),
4183              or the first byte of UTF-16.  */
4184           try = (CODING_CATEGORY_MASK_SJIS
4185                   | CODING_CATEGORY_MASK_EMACS_MULE
4186                   | CODING_CATEGORY_MASK_UTF_16_BE
4187                   | CODING_CATEGORY_MASK_UTF_16_LE);
4188
4189           /* Or, if C is a special latin extra code,
4190              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4191              or is an ISO2022 control-sequence-introducer (CSI),
4192              we should also consider the possibility of ISO2022 codings.  */
4193           if ((VECTORP (Vlatin_extra_code_table)
4194                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4195               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4196               || (c == ISO_CODE_CSI
4197                   && (src < src_end
4198                       && (*src == ']'
4199                           || ((*src == '0' || *src == '1' || *src == '2')
4200                               && src + 1 < src_end
4201                               && src[1] == ']')))))
4202             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4203                      | CODING_CATEGORY_MASK_ISO_8BIT);
4204         }
4205       else
4206         /* C is a character of ISO2022 in graphic plane right,
4207            or a SJIS's 1-byte character code (i.e. JISX0201),
4208            or the first byte of BIG5's 2-byte code,
4209            or the first byte of UTF-8/16.  */
4210         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4211                 | CODING_CATEGORY_MASK_ISO_8BIT
4212                 | CODING_CATEGORY_MASK_SJIS
4213                 | CODING_CATEGORY_MASK_BIG5
4214                 | CODING_CATEGORY_MASK_UTF_8
4215                 | CODING_CATEGORY_MASK_UTF_16_BE
4216                 | CODING_CATEGORY_MASK_UTF_16_LE);
4217
4218       /* Or, we may have to consider the possibility of CCL.  */
4219       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4220           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4221               ->spec.ccl.valid_codes)[c])
4222         try |= CODING_CATEGORY_MASK_CCL;
4223
4224       mask = 0;
4225       utf16_examined_p = iso2022_examined_p = 0;
4226       if (priorities)
4227         {
4228           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4229             {
4230               if (!iso2022_examined_p
4231                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4232                 {
4233                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4234                   iso2022_examined_p = 1;
4235                 }
4236               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4237                 mask |= detect_coding_sjis (src, src_end, multibytep);
4238               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4239                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4240               else if (!utf16_examined_p
4241                        && (priorities[i] & try &
4242                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4243                 {
4244                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4245                   utf16_examined_p = 1;
4246                 }
4247               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4248                 mask |= detect_coding_big5 (src, src_end, multibytep);
4249               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4250                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4251               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4252                 mask |= detect_coding_ccl (src, src_end, multibytep);
4253               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4254                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4255               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4256                 mask |= CODING_CATEGORY_MASK_BINARY;
4257               if (mask & priorities[i])
4258                 return priorities[i];
4259             }
4260           return CODING_CATEGORY_MASK_RAW_TEXT;
4261         }
4262       if (try & CODING_CATEGORY_MASK_ISO)
4263         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4264       if (try & CODING_CATEGORY_MASK_SJIS)
4265         mask |= detect_coding_sjis (src, src_end, multibytep);
4266       if (try & CODING_CATEGORY_MASK_BIG5)
4267         mask |= detect_coding_big5 (src, src_end, multibytep);
4268       if (try & CODING_CATEGORY_MASK_UTF_8)
4269         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4270       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4271         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4272       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4273         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4274       if (try & CODING_CATEGORY_MASK_CCL)
4275         mask |= detect_coding_ccl (src, src_end, multibytep);
4276     }
4277   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4278 }
4279
4280 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4281    The information of the detected coding system is set in CODING.  */
4282
4283 void
4284 detect_coding (coding, src, src_bytes)
4285      struct coding_system *coding;
4286      const unsigned char *src;
4287      int src_bytes;
4288 {
4289   unsigned int idx;
4290   int skip, mask;
4291   Lisp_Object val;
4292
4293   val = Vcoding_category_list;
4294   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4295                              coding->src_multibyte);
4296   coding->heading_ascii = skip;
4297
4298   if (!mask) return;
4299
4300   /* We found a single coding system of the highest priority in MASK.  */
4301   idx = 0;
4302   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4303   if (! mask)
4304     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4305
4306   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4307
4308   if (coding->eol_type != CODING_EOL_UNDECIDED)
4309     {
4310       Lisp_Object tmp;
4311
4312       tmp = Fget (val, Qeol_type);
4313       if (VECTORP (tmp))
4314         val = XVECTOR (tmp)->contents[coding->eol_type];
4315     }
4316
4317   /* Setup this new coding system while preserving some slots.  */
4318   {
4319     int src_multibyte = coding->src_multibyte;
4320     int dst_multibyte = coding->dst_multibyte;
4321
4322     setup_coding_system (val, coding);
4323     coding->src_multibyte = src_multibyte;
4324     coding->dst_multibyte = dst_multibyte;
4325     coding->heading_ascii = skip;
4326   }
4327 }
4328
/* Detect how end-of-line of a text of length SRC_BYTES pointed by
   SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
   CODING_EOL_CR, CODING_EOL_INCONSISTENT (two different formats were
   seen), and CODING_EOL_UNDECIDED (no end-of-line was seen).  At most
   MAX_EOL_CHECK_COUNT end-of-lines are examined, and the scan stops
   at the first inconsistency.

   The number of bytes preceding the first end-of-line is returned as
   *SKIP.  It is 0 when the text starts with an end-of-line, and
   SRC_BYTES when the text contains no end-of-line at all.  */

#define MAX_EOL_CHECK_COUNT 3

static int
detect_eol_type (source, src_bytes, skip)
     unsigned char *source;
     int src_bytes, *skip;
{
  unsigned char *src = source, *src_end = src + src_bytes;
  unsigned char c;
  int total = 0;                /* How many end-of-lines are found so far.  */
  int eol_type = CODING_EOL_UNDECIDED;
  int this_eol_type;

  *skip = 0;

  while (src < src_end && total < MAX_EOL_CHECK_COUNT)
    {
      c = *src++;
      if (c == '\n' || c == '\r')
        {
          /* Record the offset of the first end-of-line only.  TOTAL
             is the test here, not *SKIP, because an end-of-line at
             offset 0 legitimately yields *SKIP == 0.  */
          if (total == 0)
            *skip = src - 1 - source;
          total++;
          if (c == '\n')
            this_eol_type = CODING_EOL_LF;
          else if (src >= src_end || *src != '\n')
            this_eol_type = CODING_EOL_CR;
          else
            /* CR immediately followed by LF; consume the LF too.  */
            this_eol_type = CODING_EOL_CRLF, src++;

          if (eol_type == CODING_EOL_UNDECIDED)
            /* This is the first end-of-line.  */
            eol_type = this_eol_type;
          else if (eol_type != this_eol_type)
            {
              /* The found type is different from what was found
                 before.  */
              eol_type = CODING_EOL_INCONSISTENT;
              break;
            }
        }
    }

  /* No end-of-line at all: the whole text can be skipped.  */
  if (total == 0)
    *skip = src_end - source;
  return eol_type;
}
4381
/* Like detect_eol_type, but detect EOL type in 2-octet
   big-endian/little-endian format for coding systems utf-16-be and
   utf-16-le.  BIG_ENDIAN_P nonzero means the first octet of each
   pair is the most significant one.  A trailing odd octet is never
   examined.

   *SKIP is set to the byte offset of the first end-of-line (0 when
   the text starts with one), or to SRC_BYTES when no end-of-line was
   found.  */

static int
detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
     unsigned char *source;
     int src_bytes, *skip, big_endian_p;
{
  unsigned char *src = source, *src_end = src + src_bytes;
  unsigned int c1, c2;
  int total = 0;                /* How many end-of-lines are found so far.  */
  int eol_type = CODING_EOL_UNDECIDED;
  int this_eol_type;
  int msb, lsb;                 /* Indices of the high and low octet.  */

  if (big_endian_p)
    msb = 0, lsb = 1;
  else
    msb = 1, lsb = 0;

  *skip = 0;

  while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
    {
      c1 = (src[msb] << 8) | (src[lsb]);
      src += 2;

      if (c1 == '\n' || c1 == '\r')
        {
          /* Record the offset of the first end-of-line only.  TOTAL
             is the test here, not *SKIP, because an end-of-line at
             offset 0 legitimately yields *SKIP == 0.  */
          if (total == 0)
            *skip = src - 2 - source;
          total++;
          if (c1 == '\n')
            {
              this_eol_type = CODING_EOL_LF;
            }
          else
            {
              if ((src + 1) >= src_end)
                {
                  /* No complete 2-octet unit follows the CR.  */
                  this_eol_type = CODING_EOL_CR;
                }
              else
                {
                  c2 = (src[msb] << 8) | (src[lsb]);
                  if (c2 == '\n')
                    this_eol_type = CODING_EOL_CRLF, src += 2;
                  else
                    this_eol_type = CODING_EOL_CR;
                }
            }

          if (eol_type == CODING_EOL_UNDECIDED)
            /* This is the first end-of-line.  */
            eol_type = this_eol_type;
          else if (eol_type != this_eol_type)
            {
              /* The found type is different from what was found
                 before.  */
              eol_type = CODING_EOL_INCONSISTENT;
              break;
            }
        }
    }

  /* No end-of-line at all: the whole text can be skipped.  */
  if (total == 0)
    *skip = src_end - source;
  return eol_type;
}
4451
4452 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4453    is encoded.  If it detects an appropriate format of end-of-line, it
4454    sets the information in *CODING.  */
4455
4456 void
4457 detect_eol (coding, src, src_bytes)
4458      struct coding_system *coding;
4459      const unsigned char *src;
4460      int src_bytes;
4461 {
4462   Lisp_Object val;
4463   int skip;
4464   int eol_type;
4465
4466   switch (coding->category_idx)
4467     {
4468     case CODING_CATEGORY_IDX_UTF_16_BE:
4469       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4470       break;
4471     case CODING_CATEGORY_IDX_UTF_16_LE:
4472       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4473       break;
4474     default:
4475       eol_type = detect_eol_type (src, src_bytes, &skip);
4476       break;
4477     }
4478
4479   if (coding->heading_ascii > skip)
4480     coding->heading_ascii = skip;
4481   else
4482     skip = coding->heading_ascii;
4483
4484   if (eol_type == CODING_EOL_UNDECIDED)
4485     return;
4486   if (eol_type == CODING_EOL_INCONSISTENT)
4487     {
4488 #if 0
4489       /* This code is suppressed until we find a better way to
4490          distinguish raw text file and binary file.  */
4491
4492       /* If we have already detected that the coding is raw-text, the
4493          coding should actually be no-conversion.  */
4494       if (coding->type == coding_type_raw_text)
4495         {
4496           setup_coding_system (Qno_conversion, coding);
4497           return;
4498         }
4499       /* Else, let's decode only text code anyway.  */
4500 #endif /* 0 */
4501       eol_type = CODING_EOL_LF;
4502     }
4503
4504   val = Fget (coding->symbol, Qeol_type);
4505   if (VECTORP (val) && XVECTOR (val)->size == 3)
4506     {
4507       int src_multibyte = coding->src_multibyte;
4508       int dst_multibyte = coding->dst_multibyte;
4509       struct composition_data *cmp_data = coding->cmp_data;
4510
4511       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4512       coding->src_multibyte = src_multibyte;
4513       coding->dst_multibyte = dst_multibyte;
4514       coding->heading_ascii = skip;
4515       coding->cmp_data = cmp_data;
4516     }
4517 }
4518
4519 #define CONVERSION_BUFFER_EXTRA_ROOM 256
4520
4521 #define DECODING_BUFFER_MAG(coding)                     \
4522   (coding->type == coding_type_iso2022                  \
4523    ? 3                                                  \
4524    : (coding->type == coding_type_ccl                   \
4525       ? coding->spec.ccl.decoder.buf_magnification      \
4526       : 2))
4527
4528 /* Return maximum size (bytes) of a buffer enough for decoding
4529    SRC_BYTES of text encoded in CODING.  */
4530
4531 int
4532 decoding_buffer_size (coding, src_bytes)
4533      struct coding_system *coding;
4534      int src_bytes;
4535 {
4536   return (src_bytes * DECODING_BUFFER_MAG (coding)
4537           + CONVERSION_BUFFER_EXTRA_ROOM);
4538 }
4539
4540 /* Return maximum size (bytes) of a buffer enough for encoding
4541    SRC_BYTES of text to CODING.  */
4542
4543 int
4544 encoding_buffer_size (coding, src_bytes)
4545      struct coding_system *coding;
4546      int src_bytes;
4547 {
4548   int magnification;
4549
4550   if (coding->type == coding_type_ccl)
4551     {
4552       magnification = coding->spec.ccl.encoder.buf_magnification;
4553       if (coding->eol_type == CODING_EOL_CRLF)
4554         magnification *= 2;
4555     }
4556   else if (CODING_REQUIRE_ENCODING (coding))
4557     magnification = 3;
4558   else
4559     magnification = 1;
4560
4561   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4562 }
4563
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Size of DATA in bytes.  */
  int on_stack;                 /* 1 if DATA was allocated by alloca.  */
  unsigned char *data;          /* The buffer itself.  */
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer,
   not a pointer to one).  Requests smaller than MAX_ALLOCA are
   served by alloca, the others by xmalloc; BUF.on_stack records
   which.

   This has to be a macro because alloca is called here.  LEN is
   evaluated more than once, so it must not have side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4587
4588 /* Double the allocated memory for *BUF.  */
4589 static void
4590 extend_conversion_buffer (buf)
4591      struct conversion_buffer *buf;
4592 {
4593   if (buf->on_stack)
4594     {
4595       unsigned char *save = buf->data;
4596       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4597       bcopy (save, buf->data, buf->size);
4598       buf->on_stack = 0;
4599     }
4600   else
4601     {
4602       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4603     }
4604   buf->size *= 2;
4605 }
4606
4607 /* Free the allocated memory for BUF if it is not on stack.  */
4608 static void
4609 free_conversion_buffer (buf)
4610      struct conversion_buffer *buf;
4611 {
4612   if (!buf->on_stack)
4613     xfree (buf->data);
4614 }
4615
4616 int
4617 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4618      struct coding_system *coding;
4619      unsigned char *source, *destination;
4620      int src_bytes, dst_bytes, encodep;
4621 {
4622   struct ccl_program *ccl
4623     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4624   unsigned char *dst = destination;
4625
4626   ccl->suppress_error = coding->suppress_error;
4627   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4628   if (encodep)
4629     {
4630       /* On encoding, EOL format is converted within ccl_driver.  For
4631          that, setup proper information in the structure CCL.  */
4632       ccl->eol_type = coding->eol_type;
4633       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4634         ccl->eol_type = CODING_EOL_LF;
4635       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4636       ccl->eight_bit_control = coding->dst_multibyte;
4637     }
4638   else
4639     ccl->eight_bit_control = 1;
4640   ccl->multibyte = coding->src_multibyte;
4641   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4642     {
4643       /* Move carryover bytes to DESTINATION.  */
4644       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4645       while (*p)
4646         *dst++ = *p++;
4647       coding->spec.ccl.eight_bit_carryover[0] = 0;
4648       if (dst_bytes)
4649         dst_bytes -= dst - destination;
4650     }
4651
4652   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4653                                   &(coding->consumed))
4654                       + dst - destination);
4655
4656   if (encodep)
4657     {
4658       coding->produced_char = coding->produced;
4659       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4660     }
4661   else if (!ccl->eight_bit_control)
4662     {
4663       /* The produced bytes forms a valid multibyte sequence. */
4664       coding->produced_char
4665         = multibyte_chars_in_text (destination, coding->produced);
4666       coding->spec.ccl.eight_bit_carryover[0] = 0;
4667     }
4668   else
4669     {
4670       /* On decoding, the destination should always multibyte.  But,
4671          CCL program might have been generated an invalid multibyte
4672          sequence.  Here we make such a sequence valid as
4673          multibyte.  */
4674       int bytes
4675         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4676
4677       if ((coding->consumed < src_bytes
4678            || !ccl->last_block)
4679           && coding->produced >= 1
4680           && destination[coding->produced - 1] >= 0x80)
4681         {
4682           /* We should not convert the tailing 8-bit codes to
4683              multibyte form even if they doesn't form a valid
4684              multibyte sequence.  They may form a valid sequence in
4685              the next call.  */
4686           int carryover = 0;
4687
4688           if (destination[coding->produced - 1] < 0xA0)
4689             carryover = 1;
4690           else if (coding->produced >= 2)
4691             {
4692               if (destination[coding->produced - 2] >= 0x80)
4693                 {
4694                   if (destination[coding->produced - 2] < 0xA0)
4695                     carryover = 2;
4696                   else if (coding->produced >= 3
4697                            && destination[coding->produced - 3] >= 0x80
4698                            && destination[coding->produced - 3] < 0xA0)
4699                     carryover = 3;
4700                 }
4701             }
4702           if (carryover > 0)
4703             {
4704               BCOPY_SHORT (destination + coding->produced - carryover,
4705                            coding->spec.ccl.eight_bit_carryover,
4706                            carryover);
4707               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4708               coding->produced -= carryover;
4709             }
4710         }
4711       coding->produced = str_as_multibyte (destination, bytes,
4712                                            coding->produced,
4713                                            &(coding->produced_char));
4714     }
4715
4716   switch (ccl->status)
4717     {
4718     case CCL_STAT_SUSPEND_BY_SRC:
4719       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4720       break;
4721     case CCL_STAT_SUSPEND_BY_DST:
4722       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4723       break;
4724     case CCL_STAT_QUIT:
4725     case CCL_STAT_INVALID_CMD:
4726       coding->result = CODING_FINISH_INTERRUPT;
4727       break;
4728     default:
4729       coding->result = CODING_FINISH_NORMAL;
4730       break;
4731     }
4732   return coding->result;
4733 }
4734
4735 /* Decode EOL format of the text at PTR of BYTES length destructively
4736    according to CODING->eol_type.  This is called after the CCL
4737    program produced a decoded text at PTR.  If we do CRLF->LF
4738    conversion, update CODING->produced and CODING->produced_char.  */
4739
4740 static void
4741 decode_eol_post_ccl (coding, ptr, bytes)
4742      struct coding_system *coding;
4743      unsigned char *ptr;
4744      int bytes;
4745 {
4746   Lisp_Object val, saved_coding_symbol;
4747   unsigned char *pend = ptr + bytes;
4748   int dummy;
4749
4750   /* Remember the current coding system symbol.  We set it back when
4751      an inconsistent EOL is found so that `last-coding-system-used' is
4752      set to the coding system that doesn't specify EOL conversion.  */
4753   saved_coding_symbol = coding->symbol;
4754
4755   coding->spec.ccl.cr_carryover = 0;
4756   if (coding->eol_type == CODING_EOL_UNDECIDED)
4757     {
4758       /* Here, to avoid the call of setup_coding_system, we directly
4759          call detect_eol_type.  */
4760       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4761       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4762         coding->eol_type = CODING_EOL_LF;
4763       if (coding->eol_type != CODING_EOL_UNDECIDED)
4764         {
4765           val = Fget (coding->symbol, Qeol_type);
4766           if (VECTORP (val) && XVECTOR (val)->size == 3)
4767             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4768         }
4769       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4770     }
4771
4772   if (coding->eol_type == CODING_EOL_LF
4773       || coding->eol_type == CODING_EOL_UNDECIDED)
4774     {
4775       /* We have nothing to do.  */
4776       ptr = pend;
4777     }
4778   else if (coding->eol_type == CODING_EOL_CRLF)
4779     {
4780       unsigned char *pstart = ptr, *p = ptr;
4781
4782       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4783           && *(pend - 1) == '\r')
4784         {
4785           /* If the last character is CR, we can't handle it here
4786              because LF will be in the not-yet-decoded source text.
4787              Record that the CR is not yet processed.  */
4788           coding->spec.ccl.cr_carryover = 1;
4789           coding->produced--;
4790           coding->produced_char--;
4791           pend--;
4792         }
4793       while (ptr < pend)
4794         {
4795           if (*ptr == '\r')
4796             {
4797               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4798                 {
4799                   *p++ = '\n';
4800                   ptr += 2;
4801                 }
4802               else
4803                 {
4804                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4805                     goto undo_eol_conversion;
4806                   *p++ = *ptr++;
4807                 }
4808             }
4809           else if (*ptr == '\n'
4810                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4811             goto undo_eol_conversion;
4812           else
4813             *p++ = *ptr++;
4814           continue;
4815
4816         undo_eol_conversion:
4817           /* We have faced with inconsistent EOL format at PTR.
4818              Convert all LFs before PTR back to CRLFs.  */
4819           for (p--, ptr--; p >= pstart; p--)
4820             {
4821               if (*p == '\n')
4822                 *ptr-- = '\n', *ptr-- = '\r';
4823               else
4824                 *ptr-- = *p;
4825             }
4826           /*  If carryover is recorded, cancel it because we don't
4827               convert CRLF anymore.  */
4828           if (coding->spec.ccl.cr_carryover)
4829             {
4830               coding->spec.ccl.cr_carryover = 0;
4831               coding->produced++;
4832               coding->produced_char++;
4833               pend++;
4834             }
4835           p = ptr = pend;
4836           coding->eol_type = CODING_EOL_LF;
4837           coding->symbol = saved_coding_symbol;
4838         }
4839       if (p < pend)
4840         {
4841           /* As each two-byte sequence CRLF was converted to LF, (PEND
4842              - P) is the number of deleted characters.  */
4843           coding->produced -= pend - p;
4844           coding->produced_char -= pend - p;
4845         }
4846     }
4847   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4848     {
4849       unsigned char *p = ptr;
4850
4851       for (; ptr < pend; ptr++)
4852         {
4853           if (*ptr == '\r')
4854             *ptr = '\n';
4855           else if (*ptr == '\n'
4856                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4857             {
4858               for (; p < ptr; p++)
4859                 {
4860                   if (*p == '\n')
4861                     *p = '\r';
4862                 }
4863               ptr = pend;
4864               coding->eol_type = CODING_EOL_LF;
4865               coding->symbol = saved_coding_symbol;
4866             }
4867         }
4868     }
4869 }
4870
4871 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4872    decoding, it may detect coding system and format of end-of-line if
4873    those are not yet decided.  The source should be unibyte, the
4874    result is multibyte if CODING->dst_multibyte is nonzero, else
4875    unibyte.  */
4876
4877 int
4878 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4879      struct coding_system *coding;
4880      const unsigned char *source;
4881      unsigned char *destination;
4882      int src_bytes, dst_bytes;
4883 {
4884   int extra = 0;
4885
4886   if (coding->type == coding_type_undecided)
4887     detect_coding (coding, source, src_bytes);
4888
4889   if (coding->eol_type == CODING_EOL_UNDECIDED
4890       && coding->type != coding_type_ccl)
4891     {
4892       detect_eol (coding, source, src_bytes);
4893       /* We had better recover the original eol format if we
4894          encounter an inconsistent eol format while decoding.  */
4895       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4896     }
4897
4898   coding->produced = coding->produced_char = 0;
4899   coding->consumed = coding->consumed_char = 0;
4900   coding->errors = 0;
4901   coding->result = CODING_FINISH_NORMAL;
4902
4903   switch (coding->type)
4904     {
4905     case coding_type_sjis:
4906       decode_coding_sjis_big5 (coding, source, destination,
4907                                src_bytes, dst_bytes, 1);
4908       break;
4909
4910     case coding_type_iso2022:
4911       decode_coding_iso2022 (coding, source, destination,
4912                              src_bytes, dst_bytes);
4913       break;
4914
4915     case coding_type_big5:
4916       decode_coding_sjis_big5 (coding, source, destination,
4917                                src_bytes, dst_bytes, 0);
4918       break;
4919
4920     case coding_type_emacs_mule:
4921       decode_coding_emacs_mule (coding, source, destination,
4922                                 src_bytes, dst_bytes);
4923       break;
4924
4925     case coding_type_ccl:
4926       if (coding->spec.ccl.cr_carryover)
4927         {
4928           /* Put the CR which was not processed by the previous call
4929              of decode_eol_post_ccl in DESTINATION.  It will be
4930              decoded together with the following LF by the call to
4931              decode_eol_post_ccl below.  */
4932           *destination = '\r';
4933           coding->produced++;
4934           coding->produced_char++;
4935           dst_bytes--;
4936           extra = coding->spec.ccl.cr_carryover;
4937         }
4938       ccl_coding_driver (coding, source, destination + extra,
4939                          src_bytes, dst_bytes, 0);
4940       if (coding->eol_type != CODING_EOL_LF)
4941         {
4942           coding->produced += extra;
4943           coding->produced_char += extra;
4944           decode_eol_post_ccl (coding, destination, coding->produced);
4945         }
4946       break;
4947
4948     default:
4949       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4950     }
4951
4952   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4953       && coding->mode & CODING_MODE_LAST_BLOCK
4954       && coding->consumed == src_bytes)
4955     coding->result = CODING_FINISH_NORMAL;
4956
4957   if (coding->mode & CODING_MODE_LAST_BLOCK
4958       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4959     {
4960       const unsigned char *src = source + coding->consumed;
4961       unsigned char *dst = destination + coding->produced;
4962
4963       src_bytes -= coding->consumed;
4964       coding->errors++;
4965       if (COMPOSING_P (coding))
4966         DECODE_COMPOSITION_END ('1');
4967       while (src_bytes--)
4968         {
4969           int c = *src++;
4970           dst += CHAR_STRING (c, dst);
4971           coding->produced_char++;
4972         }
4973       coding->consumed = coding->consumed_char = src - source;
4974       coding->produced = dst - destination;
4975       coding->result = CODING_FINISH_NORMAL;
4976     }
4977
4978   if (!coding->dst_multibyte)
4979     {
4980       coding->produced = str_as_unibyte (destination, coding->produced);
4981       coding->produced_char = coding->produced;
4982     }
4983
4984   return coding->result;
4985 }
4986
4987 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4988    multibyteness of the source is CODING->src_multibyte, the
4989    multibyteness of the result is always unibyte.  */
4990
4991 int
4992 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4993      struct coding_system *coding;
4994      const unsigned char *source;
4995      unsigned char *destination;
4996      int src_bytes, dst_bytes;
4997 {
4998   coding->produced = coding->produced_char = 0;
4999   coding->consumed = coding->consumed_char = 0;
5000   coding->errors = 0;
5001   coding->result = CODING_FINISH_NORMAL;
5002   if (coding->eol_type == CODING_EOL_UNDECIDED)
5003     coding->eol_type = system_eol_type;
5004
5005   switch (coding->type)
5006     {
5007     case coding_type_sjis:
5008       encode_coding_sjis_big5 (coding, source, destination,
5009                                src_bytes, dst_bytes, 1);
5010       break;
5011
5012     case coding_type_iso2022:
5013       encode_coding_iso2022 (coding, source, destination,
5014                              src_bytes, dst_bytes);
5015       break;
5016
5017     case coding_type_big5:
5018       encode_coding_sjis_big5 (coding, source, destination,
5019                                src_bytes, dst_bytes, 0);
5020       break;
5021
5022     case coding_type_emacs_mule:
5023       encode_coding_emacs_mule (coding, source, destination,
5024                                 src_bytes, dst_bytes);
5025       break;
5026
5027     case coding_type_ccl:
5028       ccl_coding_driver (coding, source, destination,
5029                          src_bytes, dst_bytes, 1);
5030       break;
5031
5032     default:
5033       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5034     }
5035
5036   if (coding->mode & CODING_MODE_LAST_BLOCK
5037       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5038     {
5039       const unsigned char *src = source + coding->consumed;
5040       unsigned char *dst = destination + coding->produced;
5041
5042       if (coding->type == coding_type_iso2022)
5043         ENCODE_RESET_PLANE_AND_REGISTER;
5044       if (COMPOSING_P (coding))
5045         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5046       if (coding->consumed < src_bytes)
5047         {
5048           int len = src_bytes - coding->consumed;
5049
5050           BCOPY_SHORT (src, dst, len);
5051           if (coding->src_multibyte)
5052             len = str_as_unibyte (dst, len);
5053           dst += len;
5054           coding->consumed = src_bytes;
5055         }
5056       coding->produced = coding->produced_char = dst - destination;
5057       coding->result = CODING_FINISH_NORMAL;
5058     }
5059
5060   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5061       && coding->consumed == src_bytes)
5062     coding->result = CODING_FINISH_NORMAL;
5063
5064   return coding->result;
5065 }
5066
5067 /* Scan text in the region between *BEG and *END (byte positions),
5068    skip characters which we don't have to decode by coding system
5069    CODING at the head and tail, then set *BEG and *END to the region
5070    of the text we actually have to convert.  The caller should move
5071    the gap out of the region in advance if the region is from a
5072    buffer.
5073
5074    If STR is not NULL, *BEG and *END are indices into STR.  */
5075
5076 static void
5077 shrink_decoding_region (beg, end, coding, str)
5078      int *beg, *end;
5079      struct coding_system *coding;
5080      unsigned char *str;
5081 {
5082   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5083   int eol_conversion;
5084   Lisp_Object translation_table;
5085
5086   if (coding->type == coding_type_ccl
5087       || coding->type == coding_type_undecided
5088       || coding->eol_type != CODING_EOL_LF
5089       || !NILP (coding->post_read_conversion)
5090       || coding->composing != COMPOSITION_DISABLED)
5091     {
5092       /* We can't skip any data.  */
5093       return;
5094     }
5095   if (coding->type == coding_type_no_conversion
5096       || coding->type == coding_type_raw_text
5097       || coding->type == coding_type_emacs_mule)
5098     {
5099       /* We need no conversion, but don't have to skip any data here.
5100          Decoding routine handles them effectively anyway.  */
5101       return;
5102     }
5103
5104   translation_table = coding->translation_table_for_decode;
5105   if (NILP (translation_table) && !NILP (Venable_character_translation))
5106     translation_table = Vstandard_translation_table_for_decode;
5107   if (CHAR_TABLE_P (translation_table))
5108     {
5109       int i;
5110       for (i = 0; i < 128; i++)
5111         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5112           break;
5113       if (i < 128)
5114         /* Some ASCII character should be translated.  We give up
5115            shrinking.  */
5116         return;
5117     }
5118
5119   if (coding->heading_ascii >= 0)
5120     /* Detection routine has already found how much we can skip at the
5121        head.  */
5122     *beg += coding->heading_ascii;
5123
5124   if (str)
5125     {
5126       begp_orig = begp = str + *beg;
5127       endp_orig = endp = str + *end;
5128     }
5129   else
5130     {
5131       begp_orig = begp = BYTE_POS_ADDR (*beg);
5132       endp_orig = endp = begp + *end - *beg;
5133     }
5134
5135   eol_conversion = (coding->eol_type == CODING_EOL_CR
5136                     || coding->eol_type == CODING_EOL_CRLF);
5137
5138   switch (coding->type)
5139     {
5140     case coding_type_sjis:
5141     case coding_type_big5:
5142       /* We can skip all ASCII characters at the head.  */
5143       if (coding->heading_ascii < 0)
5144         {
5145           if (eol_conversion)
5146             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5147           else
5148             while (begp < endp && *begp < 0x80) begp++;
5149         }
5150       /* We can skip all ASCII characters at the tail except for the
5151          second byte of SJIS or BIG5 code.  */
5152       if (eol_conversion)
5153         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5154       else
5155         while (begp < endp && endp[-1] < 0x80) endp--;
5156       /* Do not consider LF as ascii if preceded by CR, since that
5157          confuses eol decoding. */
5158       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5159         endp++;
5160       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5161         endp++;
5162       break;
5163
5164     case coding_type_iso2022:
5165       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5166         /* We can't skip any data.  */
5167         break;
5168       if (coding->heading_ascii < 0)
5169         {
5170           /* We can skip all ASCII characters at the head except for a
5171              few control codes.  */
5172           while (begp < endp && (c = *begp) < 0x80
5173                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5174                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5175                  && (!eol_conversion || c != ISO_CODE_LF))
5176             begp++;
5177         }
5178       switch (coding->category_idx)
5179         {
5180         case CODING_CATEGORY_IDX_ISO_8_1:
5181         case CODING_CATEGORY_IDX_ISO_8_2:
5182           /* We can skip all ASCII characters at the tail.  */
5183           if (eol_conversion)
5184             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5185           else
5186             while (begp < endp && endp[-1] < 0x80) endp--;
5187           /* Do not consider LF as ascii if preceded by CR, since that
5188              confuses eol decoding. */
5189           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5190             endp++;
5191           break;
5192
5193         case CODING_CATEGORY_IDX_ISO_7:
5194         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5195           {
5196             /* We can skip all characters at the tail except for 8-bit
5197                codes and ESC and the following 2-byte at the tail.  */
5198             unsigned char *eight_bit = NULL;
5199
5200             if (eol_conversion)
5201               while (begp < endp
5202                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5203                 {
5204                   if (!eight_bit && c & 0x80) eight_bit = endp;
5205                   endp--;
5206                 }
5207             else
5208               while (begp < endp
5209                      && (c = endp[-1]) != ISO_CODE_ESC)
5210                 {
5211                   if (!eight_bit && c & 0x80) eight_bit = endp;
5212                   endp--;
5213                 }
5214             /* Do not consider LF as ascii if preceded by CR, since that
5215                confuses eol decoding. */
5216             if (begp < endp && endp < endp_orig
5217                 && endp[-1] == '\r' && endp[0] == '\n')
5218               endp++;
5219             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5220               {
5221                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5222                   /* This is an ASCII designation sequence.  We can
5223                      surely skip the tail.  But, if we have
5224                      encountered an 8-bit code, skip only the codes
5225                      after that.  */
5226                   endp = eight_bit ? eight_bit : endp + 2;
5227                 else
5228                   /* Hmmm, we can't skip the tail.  */
5229                   endp = endp_orig;
5230               }
5231             else if (eight_bit)
5232               endp = eight_bit;
5233           }
5234         }
5235       break;
5236
5237     default:
5238       abort ();
5239     }
5240   *beg += begp - begp_orig;
5241   *end += endp - endp_orig;
5242   return;
5243 }
5244
5245 /* Like shrink_decoding_region but for encoding.  */
5246
5247 static void
5248 shrink_encoding_region (beg, end, coding, str)
5249      int *beg, *end;
5250      struct coding_system *coding;
5251      unsigned char *str;
5252 {
5253   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5254   int eol_conversion;
5255   Lisp_Object translation_table;
5256
5257   if (coding->type == coding_type_ccl
5258       || coding->eol_type == CODING_EOL_CRLF
5259       || coding->eol_type == CODING_EOL_CR
5260       || (coding->eol_type == CODING_EOL_UNDECIDED
5261           && system_eol_type != CODING_EOL_LF)
5262       || (coding->cmp_data && coding->cmp_data->used > 0))
5263     {
5264       /* We can't skip any data.  */
5265       return;
5266     }
5267   if (coding->type == coding_type_no_conversion
5268       || coding->type == coding_type_raw_text
5269       || coding->type == coding_type_emacs_mule
5270       || coding->type == coding_type_undecided)
5271     {
5272       /* We need no conversion, but don't have to skip any data here.
5273          Encoding routine handles them effectively anyway.  */
5274       return;
5275     }
5276
5277   translation_table = coding->translation_table_for_encode;
5278   if (NILP (translation_table) && !NILP (Venable_character_translation))
5279     translation_table = Vstandard_translation_table_for_encode;
5280   if (CHAR_TABLE_P (translation_table))
5281     {
5282       int i;
5283       for (i = 0; i < 128; i++)
5284         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5285           break;
5286       if (i < 128)
5287         /* Some ASCII character should be translated.  We give up
5288            shrinking.  */
5289         return;
5290     }
5291
5292   if (str)
5293     {
5294       begp_orig = begp = str + *beg;
5295       endp_orig = endp = str + *end;
5296     }
5297   else
5298     {
5299       begp_orig = begp = BYTE_POS_ADDR (*beg);
5300       endp_orig = endp = begp + *end - *beg;
5301     }
5302
5303   eol_conversion = (coding->eol_type == CODING_EOL_CR
5304                     || coding->eol_type == CODING_EOL_CRLF);
5305
5306   /* Here, we don't have to check coding->pre_write_conversion because
5307      the caller is expected to have handled it already.  */
5308   switch (coding->type)
5309     {
5310     case coding_type_iso2022:
5311       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5312         /* We can't skip any data.  */
5313         break;
5314       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5315         {
5316           unsigned char *bol = begp;
5317           while (begp < endp && *begp < 0x80)
5318             {
5319               begp++;
5320               if (begp[-1] == '\n')
5321                 bol = begp;
5322             }
5323           begp = bol;
5324           goto label_skip_tail;
5325         }
5326       /* fall down ... */
5327
5328     case coding_type_sjis:
5329     case coding_type_big5:
5330       /* We can skip all ASCII characters at the head and tail.  */
5331       if (eol_conversion)
5332         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5333       else
5334         while (begp < endp && *begp < 0x80) begp++;
5335     label_skip_tail:
5336       if (eol_conversion)
5337         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5338       else
5339         while (begp < endp && *(endp - 1) < 0x80) endp--;
5340       break;
5341
5342     default:
5343       abort ();
5344     }
5345
5346   *beg += begp - begp_orig;
5347   *end += endp - endp_orig;
5348   return;
5349 }
5350
/* Shrinking the conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG ... *END for CODING (and STR) with the
   encoding variant if ENCODEP is nonzero, otherwise with the decoding
   variant.  Do nothing unless the region is longer than
   shrink_conversion_region_threshhold.  BEG and END are evaluated
   more than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
      {                                                                 \
        if (! (encodep))                                                \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5364
5365 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5366    Vlast_coding_system_used and the remaining elements are buffers to
5367    kill.  */
5368 static Lisp_Object
5369 code_convert_region_unwind (arg)
5370      Lisp_Object arg;
5371 {
5372   struct gcpro gcpro1;
5373   GCPRO1 (arg);
5374
5375   inhibit_pre_post_conversion = 0;
5376   Vlast_coding_system_used = XCAR (arg);
5377   for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5378     Fkill_buffer (XCAR (arg));
5379
5380   UNGCPRO;
5381   return Qnil;
5382 }
5383
5384 /* Store information about all compositions in the range FROM and TO
5385    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5386    buffer or a string, defaults to the current buffer.  */
5387
5388 void
5389 coding_save_composition (coding, from, to, obj)
5390      struct coding_system *coding;
5391      int from, to;
5392      Lisp_Object obj;
5393 {
5394   Lisp_Object prop;
5395   int start, end;
5396
5397   if (coding->composing == COMPOSITION_DISABLED)
5398     return;
5399   if (!coding->cmp_data)
5400     coding_allocate_composition_data (coding, from);
5401   if (!find_composition (from, to, &start, &end, &prop, obj)
5402       || end > to)
5403     return;
5404   if (start < from
5405       && (!find_composition (end, to, &start, &end, &prop, obj)
5406           || end > to))
5407     return;
5408   coding->composing = COMPOSITION_NO;
5409   do
5410     {
5411       if (COMPOSITION_VALID_P (start, end, prop))
5412         {
5413           enum composition_method method = COMPOSITION_METHOD (prop);
5414           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5415               >= COMPOSITION_DATA_SIZE)
5416             coding_allocate_composition_data (coding, from);
5417           /* For relative composition, we remember start and end
5418              positions, for the other compositions, we also remember
5419              components.  */
5420           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5421           if (method != COMPOSITION_RELATIVE)
5422             {
5423               /* We must store a*/
5424               Lisp_Object val, ch;
5425
5426               val = COMPOSITION_COMPONENTS (prop);
5427               if (CONSP (val))
5428                 while (CONSP (val))
5429                   {
5430                     ch = XCAR (val), val = XCDR (val);
5431                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5432                   }
5433               else if (VECTORP (val) || STRINGP (val))
5434                 {
5435                   int len = (VECTORP (val)
5436                              ? XVECTOR (val)->size : SCHARS (val));
5437                   int i;
5438                   for (i = 0; i < len; i++)
5439                     {
5440                       ch = (STRINGP (val)
5441                             ? Faref (val, make_number (i))
5442                             : XVECTOR (val)->contents[i]);
5443                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5444                     }
5445                 }
5446               else              /* INTEGERP (val) */
5447                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5448             }
5449           CODING_ADD_COMPOSITION_END (coding, end - from);
5450         }
5451       start = end;
5452     }
5453   while (start < to
5454          && find_composition (start, to, &start, &end, &prop, obj)
5455          && end <= to);
5456
5457   /* Make coding->cmp_data point to the first memory block.  */
5458   while (coding->cmp_data->prev)
5459     coding->cmp_data = coding->cmp_data->prev;
5460   coding->cmp_data_start = 0;
5461 }
5462
5463 /* Reflect the saved information about compositions to OBJ.
5464    CODING->cmp_data points to a memory block for the information.  OBJ
5465    is a buffer or a string, defaults to the current buffer.  */
5466
5467 void
5468 coding_restore_composition (coding, obj)
5469      struct coding_system *coding;
5470      Lisp_Object obj;
5471 {
5472   struct composition_data *cmp_data = coding->cmp_data;
5473
5474   if (!cmp_data)
5475     return;
5476
5477   while (cmp_data->prev)
5478     cmp_data = cmp_data->prev;
5479
5480   while (cmp_data)
5481     {
5482       int i;
5483
5484       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5485            i += cmp_data->data[i])
5486         {
5487           int *data = cmp_data->data + i;
5488           enum composition_method method = (enum composition_method) data[3];
5489           Lisp_Object components;
5490
5491           if (data[0] < 0 || i + data[0] > cmp_data->used)
5492             /* Invalid composition data.  */
5493             break;
5494
5495           if (method == COMPOSITION_RELATIVE)
5496             components = Qnil;
5497           else
5498             {
5499               int len = data[0] - 4, j;
5500               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5501
5502               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5503                   && len % 2 == 0)
5504                 len --;
5505               if (len < 1)
5506                 /* Invalid composition data.  */
5507                 break;
5508               for (j = 0; j < len; j++)
5509                 args[j] = make_number (data[4 + j]);
5510               components = (method == COMPOSITION_WITH_ALTCHARS
5511                             ? Fstring (len, args)
5512                             : Fvector (len, args));
5513             }
5514           compose_text (data[1], data[2], components, Qnil, obj);
5515         }
5516       cmp_data = cmp_data->next;
5517     }
5518 }
5519
5520 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5521    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5522    coding system CODING, and return the status code of code conversion
5523    (currently, this value has no meaning).
5524
5525    How many characters (and bytes) are converted to how many
5526    characters (and bytes) are recorded in members of the structure
5527    CODING.
5528
5529    If REPLACE is nonzero, we do various things as if the original text
5530    is deleted and a new text is inserted.  See the comments in
5531    replace_range (insdel.c) to know what we are doing.
5532
5533    If REPLACE is zero, it is assumed that the source text is unibyte.
5534    Otherwise, it is assumed that the source text is multibyte.

        The value returned is always 0.  On the normal exit path
        CODING->consumed, CODING->consumed_char, CODING->produced and
        CODING->produced_char are all set; on the early exit taken when
        the whole region could be skipped, only CODING->produced and
        CODING->produced_char are set here.  */
5535
5536 int
5537 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5538      int from, from_byte, to, to_byte, encodep, replace;
5539      struct coding_system *coding;
5540 {
5541   int len = to - from, len_byte = to_byte - from_byte;
5542   int nchars_del = 0, nbytes_del = 0;
5543   int require, inserted, inserted_byte;
       /* HEAD_SKIP and TAIL_SKIP are assigned only inside the shrinking
          block below; afterwards they are read only when TOTAL_SKIP is
          positive.  */
5544   int head_skip, tail_skip, total_skip = 0;
5545   Lisp_Object saved_coding_symbol;
5546   int first = 1;
5547   unsigned char *src, *dst;
5548   Lisp_Object deletion;
5549   int orig_point = PT, orig_len = len;
5550   int prev_Z;
5551   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5552
5553   deletion = Qnil;
5554   saved_coding_symbol = coding->symbol;
5555
       /* If point lies strictly inside the region, move it to FROM while
          we work, and use FROM as the point to restore at the end.  */
5556   if (from < PT && PT < to)
5557     {
5558       TEMP_SET_PT_BOTH (from, from_byte);
5559       orig_point = from;
5560     }
5561
5562   if (replace)
5563     {
5564       int saved_from = from;
5565       int saved_inhibit_modification_hooks;
5566
           /* prepare_to_modify_buffer is given &FROM; if FROM came back
              changed, recompute TO and the byte positions from LEN.  */
5567       prepare_to_modify_buffer (from, to, &from);
5568       if (saved_from != from)
5569         {
5570           to = from + len;
5571           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5572           len_byte = to_byte - from_byte;
5573         }
5574
5575       /* The code conversion routine can not preserve text properties
5576          for now.  So, we must remove all text properties in the
5577          region.  Here, we must suppress all modification hooks.  */
5578       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5579       inhibit_modification_hooks = 1;
5580       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5581       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5582     }
5583
5584   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5585     {
5586       /* We must detect encoding of text and eol format.  */
5587
5588       if (from < GPT && to > GPT)
5589         move_gap_both (from, from_byte);
5590       if (coding->type == coding_type_undecided)
5591         {
5592           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5593           if (coding->type == coding_type_undecided)
5594             {
5595               /* It seems that the text contains only ASCII, but we
5596                  should not leave it undecided because the deeper
5597                  decoding routine (decode_coding) tries to detect the
5598                  encodings again in vain.  */
5599               coding->type = coding_type_emacs_mule;
5600               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5601               /* As emacs-mule decoder will handle composition, we
5602                  need this setting to allocate coding->cmp_data
5603                  later.  */
5604               coding->composing = COMPOSITION_NO;
5605             }
5606         }
5607       if (coding->eol_type == CODING_EOL_UNDECIDED
5608           && coding->type != coding_type_ccl)
5609         {
5610           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5611           if (coding->eol_type == CODING_EOL_UNDECIDED)
5612             coding->eol_type = CODING_EOL_LF;
5613           /* We had better recover the original eol format if we
5614              encounter an inconsistent eol format while decoding.  */
5615           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5616         }
5617     }
5618
5619   /* Now we convert the text.  */
5620
5621   /* For encoding, we must process pre-write-conversion in advance.  */
5622   if (! inhibit_pre_post_conversion
5623       && encodep
5624       && SYMBOLP (coding->pre_write_conversion)
5625       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5626     {
5627       /* The function in pre-write-conversion may put a new text in a
5628          new buffer.  */
5629       struct buffer *prev = current_buffer;
5630       Lisp_Object new;
5631
5632       record_unwind_protect (code_convert_region_unwind,
5633                              Fcons (Vlast_coding_system_used, Qnil));
5634       /* We should not call any more pre-write/post-read-conversion
5635          functions while this pre-write-conversion is running.  */
5636       inhibit_pre_post_conversion = 1;
5637       call2 (coding->pre_write_conversion,
5638              make_number (from), make_number (to));
5639       inhibit_pre_post_conversion = 0;
5640       /* Discard the unwind protect.  */
5641       specpdl_ptr--;
5642
           /* The function left us in another buffer: replace the original
              region by LEN = ZV - BEGV characters of that buffer
              (inserted starting from its position 1), kill that buffer,
              and recompute all positions for the new text.  */
5643       if (current_buffer != prev)
5644         {
5645           len = ZV - BEGV;
5646           new = Fcurrent_buffer ();
5647           set_buffer_internal_1 (prev);
5648           del_range_2 (from, from_byte, to, to_byte, 0);
5649           TEMP_SET_PT_BOTH (from, from_byte);
5650           insert_from_buffer (XBUFFER (new), 1, len, 0);
5651           Fkill_buffer (new);
5652           if (orig_point >= to)
5653             orig_point += len - orig_len;
5654           else if (orig_point > from)
5655             orig_point = from;
5656           orig_len = len;
5657           to = from + len;
5658           from_byte = CHAR_TO_BYTE (from);
5659           to_byte = CHAR_TO_BYTE (to);
5660           len_byte = to_byte - from_byte;
5661           TEMP_SET_PT_BOTH (from, from_byte);
5662         }
5663     }
5664
       /* Keep what adjust_after_replace / adjust_after_replace_noundo
          will need at the end: a copy of the original text unless
          undo_list is t, otherwise just its character and byte
          counts.  */
5665   if (replace)
5666     {
5667       if (! EQ (current_buffer->undo_list, Qt))
5668         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5669       else
5670         {
5671           nchars_del = to - from;
5672           nbytes_del = to_byte - from_byte;
5673         }
5674     }
5675
5676   if (coding->composing != COMPOSITION_DISABLED)
5677     {
5678       if (encodep)
5679         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5680       else
5681         coding_allocate_composition_data (coding, from);
5682     }
5683
5684   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5685      if we must run CCL program or there are compositions to
5686      encode.  */
5687   if (coding->type != coding_type_ccl
5688       && (! coding->cmp_data || coding->cmp_data->used == 0))
5689     {
5690       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5691
5692       if (from < GPT && GPT < to)
5693         move_gap_both (from, from_byte);
5694       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* The whole region was skipped and there is neither a
              post-read-conversion to run (when decoding) nor anything
              to flush: report the text as produced unchanged and
              finish.  */
5695       if (from_byte == to_byte
5696           && (encodep || NILP (coding->post_read_conversion))
5697           && ! CODING_REQUIRE_FLUSHING (coding))
5698         {
5699           coding->produced = len_byte;
5700           coding->produced_char = len;
5701           if (!replace)
5702             /* We must record and adjust for this new text now.  */
5703             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5704           coding_free_composition_data (coding);
5705           return 0;
5706         }
5707
           /* The skip counts are byte counts, yet they are applied to
              the character positions and to LEN as well; this assumes
              one character per skipped byte.  */
5708       head_skip = from_byte - from_byte_orig;
5709       tail_skip = to_byte_orig - to_byte;
5710       total_skip = head_skip + tail_skip;
5711       from += head_skip;
5712       to -= tail_skip;
5713       len -= total_skip; len_byte -= total_skip;
5714     }
5715
5716   /* For conversion, we must put the gap before the text in addition to
5717      making the gap larger for efficient decoding.  The required gap
5718      size starts from 2000 which is the magic number used in make_gap.
5719      But, after one batch of conversion, it will be incremented if we
5720      find that it is not enough .  */
5721   require = 2000;
5722
5723   if (GAP_SIZE  < require)
5724     make_gap (require - GAP_SIZE);
5725   move_gap_both (from, from_byte);
5726
5727   inserted = inserted_byte = 0;
5728
       /* From here on the source text is accounted for as part of the
          gap (see the diagrams in the loop below): converted text is
          written from the gap start while the source is read from the
          gap end.  */
5729   GAP_SIZE += len_byte;
5730   ZV -= len;
5731   Z -= len;
5732   ZV_BYTE -= len_byte;
5733   Z_BYTE -= len_byte;
5734
5735   if (GPT - BEG < BEG_UNCHANGED)
5736     BEG_UNCHANGED = GPT - BEG;
5737   if (Z - GPT < END_UNCHANGED)
5738     END_UNCHANGED = Z - GPT;
5739
5740   if (!encodep && coding->src_multibyte)
5741     {
5742       /* Decoding routines expects that the source text is unibyte.
5743          We must convert 8-bit characters of multibyte form to
5744          unibyte.  */
5745       int len_byte_orig = len_byte;
5746       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5747       if (len_byte < len_byte_orig)
5748         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5749                     len_byte);
5750       coding->src_multibyte = 0;
5751     }
5752
5753   for (;;)
5754     {
5755       int result;
5756
5757       /* The buffer memory is now:
5758          +--------+converted-text+---------+-------original-text-------+---+
5759          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5760                   |<---------------------- GAP ----------------------->|  */
5761       src = GAP_END_ADDR - len_byte;
5762       dst = GPT_ADDR + inserted_byte;
5763
5764       if (encodep)
5765         result = encode_coding (coding, src, dst, len_byte, 0);
5766       else
5767         {
5768           if (coding->composing != COMPOSITION_DISABLED)
5769             coding->cmp_data->char_offset = from + inserted;
5770           result = decode_coding (coding, src, dst, len_byte, 0);
5771         }
5772
5773       /* The buffer memory is now:
5774          +--------+-------converted-text----+--+------original-text----+---+
5775          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5776                   |<---------------------- GAP ----------------------->|  */
5777
5778       inserted += coding->produced_char;
5779       inserted_byte += coding->produced;
5780       len_byte -= coding->consumed;
5781
           /* Judging by the result code, the converter needs more room
              for composition data: allocate it and resume.  */
5782       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5783         {
5784           coding_allocate_composition_data (coding, from + inserted);
5785           continue;
5786         }
5787
5788       src += coding->consumed;
5789       dst += coding->produced;
5790
5791       if (result == CODING_FINISH_NORMAL)
5792         {
5793           src += len_byte;
5794           break;
5795         }
5796       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5797         {
5798           unsigned char *pend = dst, *p = pend - inserted_byte;
5799           Lisp_Object eol_type;
5800
5801           /* Encode LFs back to the original eol format (CR or CRLF).  */
5802           if (coding->eol_type == CODING_EOL_CR)
5803             {
5804               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5805             }
5806           else
5807             {
5808               int count = 0;
5809
5810               while (p < pend) if (*p++ == '\n') count++;
5811               if (src - dst < count)
5812                 {
5813                   /* We don't have sufficient room for encoding LFs
5814                      back to CRLF.  We must record converted and
5815                      not-yet-converted text back to the buffer
5816                      content, enlarge the gap, then record them out of
5817                      the buffer contents again.  */
5818                   int add = len_byte + inserted_byte;
5819
5820                   GAP_SIZE -= add;
5821                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5822                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5823                   make_gap (count - GAP_SIZE);
5824                   GAP_SIZE += add;
5825                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5826                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5827                   /* Don't forget to update SRC, DST, and PEND.  */
5828                   src = GAP_END_ADDR - len_byte;
5829                   dst = GPT_ADDR + inserted_byte;
5830                   pend = dst;
5831                 }
5832               inserted += count;
5833               inserted_byte += count;
5834               coding->produced += count;
5835               p = dst = pend + count;
                   /* Copy the converted text backwards to its final place,
                      putting a CR in front of each LF.  Once all COUNT
                      LFs are handled, P has met PEND and the rest of
                      the text is already in place.  */
5836               while (count)
5837                 {
5838                   *--p = *--pend;
5839                   if (*p == '\n') count--, *--p = '\r';
5840                 }
5841             }
5842
5843           /* Suppress eol-format conversion in the further conversion.  */
5844           coding->eol_type = CODING_EOL_LF;
5845
5846           /* Set the coding system symbol to that for Unix-like EOL.  */
5847           eol_type = Fget (saved_coding_symbol, Qeol_type);
5848           if (VECTORP (eol_type)
5849               && XVECTOR (eol_type)->size == 3
5850               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5851             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5852           else
5853             coding->symbol = saved_coding_symbol;
5854
5855           continue;
5856         }
5857       if (len_byte <= 0)
5858         {
               /* No source is left.  For a CCL coding system, run one
                  more pass with CODING_MODE_LAST_BLOCK set before
                  stopping.  */
5859           if (coding->type != coding_type_ccl
5860               || coding->mode & CODING_MODE_LAST_BLOCK)
5861             break;
5862           coding->mode |= CODING_MODE_LAST_BLOCK;
5863           continue;
5864         }
5865       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5866         {
5867           /* The source text ends in invalid codes.  Let's just
5868              make them valid buffer contents, and finish conversion.  */
5869           if (multibyte_p)
5870             {
5871               unsigned char *start = dst;
5872
5873               inserted += len_byte;
5874               while (len_byte--)
5875                 {
5876                   int c = *src++;
5877                   dst += CHAR_STRING (c, dst);
5878                 }
5879
5880               inserted_byte += dst - start;
5881             }
5882           else
5883             {
5884               inserted += len_byte;
5885               inserted_byte += len_byte;
5886               while (len_byte--)
5887                 *dst++ = *src++;
5888             }
5889           break;
5890         }
5891       if (result == CODING_FINISH_INTERRUPT)
5892         {
5893           /* The conversion procedure was interrupted by a user.  */
5894           break;
5895         }
5896       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5897       if (coding->consumed < 1)
5898         {
5899           /* It's quite strange to require more memory without
5900              consuming any bytes.  Perhaps CCL program bug.  */
5901           break;
5902         }
5903       if (first)
5904         {
5905           /* We have just done the first batch of conversion which was
5906              stopped because of insufficient gap.  Let's reconsider the
5907              required gap size (i.e. SRC - DST) now.
5908
5909              We have converted ORIG bytes (== coding->consumed) into
5910              NEW bytes (coding->produced).  To convert the remaining
5911              LEN bytes, we may need REQUIRE bytes of gap, where:
5912                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5913                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5914              Here, we are sure that NEW >= ORIG.  */
5915
5916           if (coding->produced <= coding->consumed)
5917             {
5918               /* This happens because of CCL-based coding system with
5919                  eol-type CRLF.  */
5920               require = 0;
5921             }
5922           else
5923             {
5924               float ratio = coding->produced - coding->consumed;
5925               ratio /= coding->consumed;
5926               require = len_byte * ratio;
5927             }
5928           first = 0;
5929         }
5930       if ((src - dst) < (require + 2000))
5931         {
5932           /* See the comment above the previous call of make_gap.  */
5933           int add = len_byte + inserted_byte;
5934
5935           GAP_SIZE -= add;
5936           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5937           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5938           make_gap (require + 2000);
5939           GAP_SIZE += add;
5940           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5941           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5942         }
5943     }
5944   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5945
5946   if (encodep && coding->dst_multibyte)
5947     {
5948       /* The output is unibyte.  We must convert 8-bit characters to
5949          multibyte form.  */
5950       if (inserted_byte * 2 > GAP_SIZE)
5951         {
5952           GAP_SIZE -= inserted_byte;
5953           ZV += inserted_byte; Z += inserted_byte;
5954           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5955           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5956           make_gap (inserted_byte - GAP_SIZE);
5957           GAP_SIZE += inserted_byte;
5958           ZV -= inserted_byte; Z -= inserted_byte;
5959           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5960           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5961         }
5962       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5963     }
5964
5965   /* If we shrank the conversion area, adjust it now.  */
5966   if (total_skip > 0)
5967     {
           /* Move the skipped tail bytes up against the converted text,
              then widen the counters and positions back to the original
              (unshrunk) region.  */
5968       if (tail_skip > 0)
5969         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5970       inserted += total_skip; inserted_byte += total_skip;
5971       GAP_SIZE += total_skip;
5972       GPT -= head_skip; GPT_BYTE -= head_skip;
5973       ZV -= total_skip; ZV_BYTE -= total_skip;
5974       Z -= total_skip; Z_BYTE -= total_skip;
5975       from -= head_skip; from_byte -= head_skip;
5976       to += tail_skip; to_byte += tail_skip;
5977     }
5978
       /* INSERTED is recomputed from how much Z changes across the
          adjust call.  */
5979   prev_Z = Z;
5980   if (! EQ (current_buffer->undo_list, Qt))
5981     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5982   else
5983     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5984                                  inserted, inserted_byte);
5985   inserted = Z - prev_Z;
5986
5987   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5988     coding_restore_composition (coding, Fcurrent_buffer ());
5989   coding_free_composition_data (coding);
5990
5991   if (! inhibit_pre_post_conversion
5992       && ! encodep && ! NILP (coding->post_read_conversion))
5993     {
5994       Lisp_Object val;
5995       Lisp_Object saved_coding_system;
5996
5997       if (from != PT)
5998         TEMP_SET_PT_BOTH (from, from_byte);
5999       prev_Z = Z;
6000       record_unwind_protect (code_convert_region_unwind,
6001                              Fcons (Vlast_coding_system_used, Qnil));
           /* Let the function see CODING's symbol in
              Vlast_coding_system_used, and take back whatever it leaves
              there as the new CODING->symbol.  */
6002       saved_coding_system = Vlast_coding_system_used;
6003       Vlast_coding_system_used = coding->symbol;
6004       /* We should not call any more pre-write/post-read-conversion
6005          functions while this post-read-conversion is running.  */
6006       inhibit_pre_post_conversion = 1;
6007       val = call1 (coding->post_read_conversion, make_number (inserted));
6008       inhibit_pre_post_conversion = 0;
6009       coding->symbol = Vlast_coding_system_used;
6010       Vlast_coding_system_used = saved_coding_system;
6011       /* Discard the unwind protect.  */
6012       specpdl_ptr--;
6013       CHECK_NUMBER (val);
           /* The returned number is only type-checked; the new length
              is taken from the change of Z instead.  */
6014       inserted += Z - prev_Z;
6015     }
6016
       /* Restore point: shifted by the change in length if it was at or
          after the end of the original text, otherwise put at FROM.  */
6017   if (orig_point >= from)
6018     {
6019       if (orig_point >= from + orig_len)
6020         orig_point += inserted - orig_len;
6021       else
6022         orig_point = from;
6023       TEMP_SET_PT (orig_point);
6024     }
6025
6026   if (replace)
6027     {
6028       signal_after_change (from, to - from, inserted);
6029       update_compositions (from, from + inserted, CHECK_BORDER);
6030     }
6031
       /* Report the totals for the whole region, skipped parts
          included.  */
6032   {
6033     coding->consumed = to_byte - from_byte;
6034     coding->consumed_char = to - from;
6035     coding->produced = inserted_byte;
6036     coding->produced_char = inserted;
6037   }
6038
6039   return 0;
6040 }
6041
6042 /* Name (or base name) of the work buffer for code conversion; set_conversion_work_buffer finds or creates the buffer by this name.  */
6043 static Lisp_Object Vcode_conversion_workbuf_name;
6044
6045 /* Make the code-conversion work buffer current, after resetting its
6046    state and deleting its text.  MULTIBYTE non-zero makes the buffer
6047    multibyte, zero makes it unibyte.  Return the buffer if a fresh one
6048    had to be created (it must be killed after use); else return Qnil.  */
6049
6050 static Lisp_Object
6051 set_conversion_work_buffer (multibyte)
6052      int multibyte;
6053 {
6054   Lisp_Object buffer, buffer_to_kill;
6055   struct buffer *buf;
6056   /* Find or create the buffer named by Vcode_conversion_workbuf_name.  */
6057   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6058   buf = XBUFFER (buffer);
6059   if (buf == current_buffer)
6060     {
6061       /* As we are already in the work buffer, we must generate a new
6062          buffer for the work; it is reported back via BUFFER_TO_KILL.  */
6063       Lisp_Object name;
6064
6065       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6066       buffer = buffer_to_kill = Fget_buffer_create (name);
6067       buf = XBUFFER (buffer);
6068     }
6069   else
6070     buffer_to_kill = Qnil;
6071   /* Reset the per-buffer state before the buffer is (re)used.  */
6072   delete_all_overlays (buf);
6073   buf->directory = current_buffer->directory;
6074   buf->read_only = Qnil;
6075   buf->filename = Qnil;
6076   buf->undo_list = Qt;
6077   eassert (buf->overlays_before == NULL);
6078   eassert (buf->overlays_after == NULL);
6079   set_buffer_internal (buf);  /* BUF is the current buffer from here on.  */
6080   if (BEG != BEGV || Z != ZV)  /* Widen first so that all text is deleted.  */
6081     Fwiden ();
6082   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6083   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6084   return buffer_to_kill;
6085 }
6086 /* Insert STR into the conversion work buffer, call CODING's pre-write-conversion (ENCODEP non-zero) or post-read-conversion (ENCODEP zero) function there, and return the resulting buffer text as a string.  */
6087 Lisp_Object
6088 run_pre_post_conversion_on_str (str, coding, encodep)
6089      Lisp_Object str;
6090      struct coding_system *coding;
6091      int encodep;
6092 {
6093   int count = SPECPDL_INDEX ();  /* Unwound by unbind_to at the end.  */
6094   struct gcpro gcpro1, gcpro2;
6095   int multibyte = STRING_MULTIBYTE (str);
6096   Lisp_Object old_deactivate_mark;
6097   Lisp_Object buffer_to_kill;
6098   Lisp_Object unwind_arg;
6099   /* Register the caller's buffer so that unwinding makes it current again.  */
6100   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6101   /* It is not crucial to specbind this.  */
6102   old_deactivate_mark = Vdeactivate_mark;
6103   GCPRO2 (str, old_deactivate_mark);
6104
6105   /* We must insert the contents of STR as is without
6106      unibyte<->multibyte conversion.  For that, we adjust the
6107      multibyteness of the working buffer to that of STR.  */
6108   buffer_to_kill = set_conversion_work_buffer (multibyte);
6109   if (NILP (buffer_to_kill))
6110     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6111   else
6112     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6113   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6114   /* UNWIND_ARG is (saved Vlast_coding_system_used . buffers to kill).  */
6115   insert_from_string (str, 0, 0,
6116                       SCHARS (str), SBYTES (str), 0);
6117   UNGCPRO;
6118   inhibit_pre_post_conversion = 1;  /* Cleared again after the call below.  */
6119   if (encodep)
6120     {
6121       struct buffer *prev = current_buffer;
6122       /* The function is given the whole work buffer, BEG to Z.  */
6123       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6124       if (prev != current_buffer)
6125         /* We must kill the current buffer too.  */
6126         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6127     }
6128   else
6129     {
6130       Vlast_coding_system_used = coding->symbol;
6131       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6132       call1 (coding->post_read_conversion, make_number (Z - BEG));
6133       coding->symbol = Vlast_coding_system_used;  /* Keep what the call left there.  */
6134     }
6135   inhibit_pre_post_conversion = 0;
6136   Vdeactivate_mark = old_deactivate_mark;
6137   str = make_buffer_string (BEG, Z, 1);  /* Text of the now-current buffer.  */
6138   return unbind_to (count, str);
6139 }
6140
6141
6142 /* Run the pre-write-conversion function of CODING on the NCHARS/NBYTES
6143    text in *STR.  *SIZE is the number of bytes allocated for *STR.  As
6144    this function is intended to be called from encode_terminal_code,
6145    the pre-write-conversion function is run by safe_call and thus
6146    "Error during redisplay: ..." is logged when an error occurs.
6147
6148    Store the resulting text in *STR and set CODING->produced_char and
6149    CODING->produced to the number of characters and bytes
6150    respectively.  If *STR is too small for the result, enlarge it with
6151    xrealloc and update *STR and *SIZE.  CODING->src_multibyte is updated.  */
6152
6153 void
6154 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6155      unsigned char **str;
6156      int *size, nchars, nbytes;
6157      struct coding_system *coding;
6158 {
6159   struct gcpro gcpro1, gcpro2;
6160   struct buffer *cur = current_buffer;  /* Made current again before returning.  */
6161   struct buffer *prev;
6162   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6163   Lisp_Object args[3];
6164   Lisp_Object buffer_to_kill;
6165
6166   /* It is not crucial to specbind this.  */
6167   old_deactivate_mark = Vdeactivate_mark;
6168   old_last_coding_system_used = Vlast_coding_system_used;
6169   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6170
6171   /* We must insert the contents of *STR as is without
6172      unibyte<->multibyte conversion.  For that, the working buffer
6173      is given the multibyteness recorded in CODING->src_multibyte.  */
6174   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6175   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6176   UNGCPRO;
6177   inhibit_pre_post_conversion = 1;
6178   prev = current_buffer;  /* To notice if the call switches buffers.  */
6179   args[0] = coding->pre_write_conversion;
6180   args[1] = make_number (BEG);
6181   args[2] = make_number (Z);
6182   safe_call (3, args);
6183   inhibit_pre_post_conversion = 0;
6184   Vdeactivate_mark = old_deactivate_mark;
6185   Vlast_coding_system_used = old_last_coding_system_used;
6186   coding->produced_char = Z - BEG;
6187   coding->produced = Z_BYTE - BEG_BYTE;
6188   if (coding->produced > *size)
6189     {
6190       *size = coding->produced;
6191       *str = xrealloc (*str, *size);
6192     }
6193   if (BEG < GPT && GPT < Z)  /* Gap is inside the text; move it for the copy below.  */
6194     move_gap (BEG);
6195   bcopy (BEG_ADDR, *str, coding->produced);
6196   coding->src_multibyte
6197     = ! NILP (current_buffer->enable_multibyte_characters);
6198   if (prev != current_buffer)  /* The call left another buffer current; kill it.  */
6199     Fkill_buffer (Fcurrent_buffer ());
6200   set_buffer_internal (cur);
6201   if (! NILP (buffer_to_kill))
6202     Fkill_buffer (buffer_to_kill);
6203 }
6204
6205 /* Decode the bytes of string STR according to CODING and return the decoded string; CODING's consumed/produced counts are set.  If nothing needs decoding, return STR (possibly re-typed) itself when NOCOPY is non-zero, else a copy.  */
6206 Lisp_Object
6207 decode_coding_string (str, coding, nocopy)
6208      Lisp_Object str;
6209      struct coding_system *coding;
6210      int nocopy;
6211 {
6212   int len;
6213   struct conversion_buffer buf;
6214   int from, to_byte;  /* Byte range of STR that is actually decoded.  */
6215   Lisp_Object saved_coding_symbol;
6216   int result;
6217   int require_decoding;
6218   int shrinked_bytes = 0;  /* Bytes skipped at the head and tail of STR.  */
6219   Lisp_Object newstr;
6220   int consumed, consumed_char, produced, produced_char;
6221
6222   from = 0;
6223   to_byte = SBYTES (str);
6224
6225   saved_coding_symbol = coding->symbol;
6226   coding->src_multibyte = STRING_MULTIBYTE (str);
6227   coding->dst_multibyte = 1;
6228   if (CODING_REQUIRE_DETECTION (coding))
6229     {
6230       /* See the comments in code_convert_region.  */
6231       if (coding->type == coding_type_undecided)
6232         {
6233           detect_coding (coding, SDATA (str), to_byte);
6234           if (coding->type == coding_type_undecided)
6235             {
6236               coding->type = coding_type_emacs_mule;
6237               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6238               /* As emacs-mule decoder will handle composition, we
6239                  need this setting to allocate coding->cmp_data
6240                  later.  */
6241               coding->composing = COMPOSITION_NO;
6242             }
6243         }
6244       if (coding->eol_type == CODING_EOL_UNDECIDED
6245           && coding->type != coding_type_ccl)
6246         {
6247           saved_coding_symbol = coding->symbol;
6248           detect_eol (coding, SDATA (str), to_byte);
6249           if (coding->eol_type == CODING_EOL_UNDECIDED)
6250             coding->eol_type = CODING_EOL_LF;
6251           /* We had better recover the original eol format if we
6252              encounter an inconsistent eol format while decoding.  */
6253           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6254         }
6255     }
6256
6257   if (coding->type == coding_type_no_conversion
6258       || coding->type == coding_type_raw_text)
6259     coding->dst_multibyte = 0;
6260
6261   require_decoding = CODING_REQUIRE_DECODING (coding);
6262
6263   if (STRING_MULTIBYTE (str))
6264     {
6265       /* Decoding routines expect the source text to be unibyte.  */
6266       str = Fstring_as_unibyte (str);
6267       to_byte = SBYTES (str);
6268       nocopy = 1;
6269       coding->src_multibyte = 0;
6270     }
6271
6272   /* Try to skip the heading and tailing ASCIIs.  */
6273   if (require_decoding && coding->type != coding_type_ccl)
6274     {
6275       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6276                                 0);
6277       if (from == to_byte)  /* Nothing is left to decode.  */
6278         require_decoding = 0;
6279       shrinked_bytes = from + (SBYTES (str) - to_byte);
6280     }
6281   /* Return early only if there is no post-read-conversion function to run either.  */
6282   if (!require_decoding
6283       && !(SYMBOLP (coding->post_read_conversion)
6284            && !NILP (Ffboundp (coding->post_read_conversion))))
6285     {
6286       coding->consumed = SBYTES (str);
6287       coding->consumed_char = SCHARS (str);
6288       if (coding->dst_multibyte)
6289         {
6290           str = Fstring_as_multibyte (str);
6291           nocopy = 1;
6292         }
6293       coding->produced = SBYTES (str);
6294       coding->produced_char = SCHARS (str);
6295       return (nocopy ? str : Fcopy_sequence (str));
6296     }
6297
6298   if (coding->composing != COMPOSITION_DISABLED)
6299     coding_allocate_composition_data (coding, from);
6300   len = decoding_buffer_size (coding, to_byte - from);
6301   allocate_conversion_buffer (buf, len);
6302   /* Decode piecewise, accumulating totals, until decode_coding reports an end.  */
6303   consumed = consumed_char = produced = produced_char = 0;
6304   while (1)
6305     {
6306       result = decode_coding (coding, SDATA (str) + from + consumed,
6307                               buf.data + produced, to_byte - from - consumed,
6308                               buf.size - produced);
6309       consumed += coding->consumed;
6310       consumed_char += coding->consumed_char;
6311       produced += coding->produced;
6312       produced_char += coding->produced_char;
6313       if (result == CODING_FINISH_NORMAL
6314           || result == CODING_FINISH_INTERRUPT
6315           || (result == CODING_FINISH_INSUFFICIENT_SRC
6316               && coding->consumed == 0))
6317         break;
6318       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6319         coding_allocate_composition_data (coding, from + produced_char);
6320       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6321         extend_conversion_buffer (&buf);
6322       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6323         {
6324           Lisp_Object eol_type;
6325
6326           /* Recover the original EOL format in the text produced so far.  */
6327           if (coding->eol_type == CODING_EOL_CR)
6328             {
6329               unsigned char *p;
6330               for (p = buf.data; p < buf.data + produced; p++)
6331                 if (*p == '\n') *p = '\r';
6332             }
6333           else if (coding->eol_type == CODING_EOL_CRLF)
6334             {
6335               int num_eol = 0;
6336               unsigned char *p0, *p1;
6337               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6338                 if (*p0 == '\n') num_eol++;
6339               if (produced + num_eol >= buf.size)
6340                 extend_conversion_buffer (&buf);
6341               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6342                 {  /* Copy backward in place, putting '\r' before each '\n'.  */
6343                   *--p1 = *--p0;
6344                   if (*p0 == '\n') *--p1 = '\r';
6345                 }
6346               produced += num_eol;
6347               produced_char += num_eol;
6348             }
6349           /* Suppress eol-format conversion in the further conversion.  */
6350           coding->eol_type = CODING_EOL_LF;
6351
6352           /* Set the coding system symbol to that for Unix-like EOL.  */
6353           eol_type = Fget (saved_coding_symbol, Qeol_type);
6354           if (VECTORP (eol_type)
6355               && XVECTOR (eol_type)->size == 3
6356               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6357             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6358           else
6359             coding->symbol = saved_coding_symbol;
6360
6361
6362         }
6363     }
6364
6365   coding->consumed = consumed;
6366   coding->consumed_char = consumed_char;
6367   coding->produced = produced;
6368   coding->produced_char = produced_char;
6369   /* Build the result: skipped head + decoded text + skipped tail.  */
6370   if (coding->dst_multibyte)
6371     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6372                                            produced + shrinked_bytes);
6373   else
6374     newstr = make_uninit_string (produced + shrinked_bytes);
6375   if (from > 0)
6376     STRING_COPYIN (newstr, 0, SDATA (str), from);
6377   STRING_COPYIN (newstr, from, buf.data, produced);
6378   if (shrinked_bytes > from)
6379     STRING_COPYIN (newstr, from + produced,
6380                    SDATA (str) + to_byte,
6381                    shrinked_bytes - from);
6382   free_conversion_buffer (&buf);
6383   /* The skipped bytes count as both bytes and characters (they are ASCII).  */
6384   coding->consumed += shrinked_bytes;
6385   coding->consumed_char += shrinked_bytes;
6386   coding->produced += shrinked_bytes;
6387   coding->produced_char += shrinked_bytes;
6388
6389   if (coding->cmp_data && coding->cmp_data->used)
6390     coding_restore_composition (coding, newstr);
6391   coding_free_composition_data (coding);
6392
6393   if (SYMBOLP (coding->post_read_conversion)
6394       && !NILP (Ffboundp (coding->post_read_conversion)))
6395     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6396
6397   return newstr;
6398 }
6399 /* Encode string STR according to CODING and return the resulting string; CODING's consumed/produced counts are set.  If nothing needs encoding, return STR itself (made unibyte) when NOCOPY is non-zero, else a unibyte copy.  */
6400 Lisp_Object
6401 encode_coding_string (str, coding, nocopy)
6402      Lisp_Object str;
6403      struct coding_system *coding;
6404      int nocopy;
6405 {
6406   int len;
6407   struct conversion_buffer buf;
6408   int from, to, to_byte;
6409   int result;
6410   int shrinked_bytes = 0;  /* Bytes skipped at the head and tail of STR.  */
6411   Lisp_Object newstr;
6412   int consumed, consumed_char, produced, produced_char;
6413   /* Run the pre-write-conversion function first, if one is defined.  */
6414   if (SYMBOLP (coding->pre_write_conversion)
6415       && !NILP (Ffboundp (coding->pre_write_conversion)))
6416     {
6417       str = run_pre_post_conversion_on_str (str, coding, 1);
6418       /* As STR is just newly generated, we don't have to copy it
6419          anymore.  */
6420       nocopy = 1;
6421     }
6422
6423   from = 0;
6424   to = SCHARS (str);
6425   to_byte = SBYTES (str);
6426
6427   /* Encoding routines determine the multibyteness of the source text
6428      by coding->src_multibyte.  */
6429   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6430   coding->dst_multibyte = 0;
6431   if (! CODING_REQUIRE_ENCODING (coding))
6432     goto no_need_of_encoding;
6433
6434   if (coding->composing != COMPOSITION_DISABLED)
6435     coding_save_composition (coding, from, to, str);
6436
6437   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6438      if we must run CCL program or there are compositions to
6439      encode.  */
6440   if (coding->type != coding_type_ccl
6441       && (! coding->cmp_data || coding->cmp_data->used == 0))
6442     {
6443       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6444                                 1);
6445       if (from == to_byte)  /* Nothing is left to encode.  */
6446         {
6447           coding_free_composition_data (coding);
6448           goto no_need_of_encoding;
6449         }
6450       shrinked_bytes = from + (SBYTES (str) - to_byte);
6451     }
6452
6453   len = encoding_buffer_size (coding, to_byte - from);
6454   allocate_conversion_buffer (buf, len);
6455   /* Encode piecewise, accumulating totals, until encode_coding reports an end.  */
6456   consumed = consumed_char = produced = produced_char = 0;
6457   while (1)
6458     {
6459       result = encode_coding (coding, SDATA (str) + from + consumed,
6460                               buf.data + produced, to_byte - from - consumed,
6461                               buf.size - produced);
6462       consumed += coding->consumed;
6463       consumed_char += coding->consumed_char;
6464       produced += coding->produced;
6465       produced_char += coding->produced_char;
6466       if (result == CODING_FINISH_NORMAL
6467           || result == CODING_FINISH_INTERRUPT
6468           || (result == CODING_FINISH_INSUFFICIENT_SRC
6469               && coding->consumed == 0))
6470         break;
6471       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6472       extend_conversion_buffer (&buf);
6473     }
6474
6475   coding->consumed = consumed;
6476   coding->consumed_char = consumed_char;
6477   coding->produced = produced;
6478   coding->produced_char = produced_char;
6479   /* Build the result: skipped head + encoded text + skipped tail.  */
6480   newstr = make_uninit_string (produced + shrinked_bytes);
6481   if (from > 0)
6482     STRING_COPYIN (newstr, 0, SDATA (str), from);
6483   STRING_COPYIN (newstr, from, buf.data, produced);
6484   if (shrinked_bytes > from)
6485     STRING_COPYIN (newstr, from + produced,
6486                    SDATA (str) + to_byte,
6487                    shrinked_bytes - from);
6488
6489   free_conversion_buffer (&buf);
6490   coding_free_composition_data (coding);
6491
6492   return newstr;
6493   /* Reached when STR can be returned without running encode_coding.  */
6494  no_need_of_encoding:
6495   coding->consumed = SBYTES (str);
6496   coding->consumed_char = SCHARS (str);
6497   if (STRING_MULTIBYTE (str))
6498     {
6499       if (nocopy)
6500         /* We are sure that STR doesn't contain a multibyte
6501            character.  */
6502         STRING_SET_UNIBYTE (str);
6503       else
6504         {
6505           str = Fstring_as_unibyte (str);
6506           nocopy = 1;
6507         }
6508     }
6509   coding->produced = SBYTES (str);
6510   coding->produced_char = SCHARS (str);
6511   return (nocopy ? str : Fcopy_sequence (str));
6512 }
6513
6514 \f
6515 #ifdef emacs
6516 /*** 8. Emacs Lisp library functions ***/
6517
6518 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6519        doc: /* Return t if OBJECT is nil or a coding-system.
6520 See the documentation of `make-coding-system' for information
6521 about coding-system objects.  */)
6522      (obj)
6523      Lisp_Object obj;
6524 {
6525   if (NILP (obj))
6526     return Qt;
6527   if (!SYMBOLP (obj))
6528     return Qnil;
6529   if (! NILP (Fget (obj, Qcoding_system_define_form)))  /* Accepted as is.  */
6530     return Qt;
6531   /* Otherwise the Qcoding_system property of OBJ must be a vector of length 5.  */
6532   obj = Fget (obj, Qcoding_system);
6533   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6534           ? Qt : Qnil);
6535 }
6536
6537 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6538        Sread_non_nil_coding_system, 1, 1, 0,
6539        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6540      (prompt)
6541      Lisp_Object prompt;
6542 {
6543   Lisp_Object val;
6544   do
6545     {
6546       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6547                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6548     }
6549   while (SCHARS (val) == 0);  /* Ask again as long as the input is empty.  */
6550   return (Fintern (val, Qnil));  /* The name read, interned.  */
6551 }
6552
6553 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6554        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6555 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6556      (prompt, default_coding_system)
6557      Lisp_Object prompt, default_coding_system;
6558 {
6559   Lisp_Object val;
6560   if (SYMBOLP (default_coding_system))  /* Pass the default on as its name string.  */
6561     default_coding_system = SYMBOL_NAME (default_coding_system);
6562   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6563                           Qt, Qnil, Qcoding_system_history,
6564                           default_coding_system, Qnil);
6565   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));  /* Empty result yields nil.  */
6566 }
6567
6568 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6569        1, 1, 0,
6570        doc: /* Check validity of CODING-SYSTEM.
6571 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6572 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6573 The value of this property should be a vector of length 5.  */)
6574      (coding_system)
6575      Lisp_Object coding_system;
6576 {
6577   Lisp_Object define_form;
6578   /* If a definition form is pending, clear the property and evaluate it first.  */
6579   define_form = Fget (coding_system, Qcoding_system_define_form);
6580   if (! NILP (define_form))
6581     {
6582       Fput (coding_system, Qcoding_system_define_form, Qnil);
6583       safe_eval (define_form);
6584     }
6585   if (!NILP (Fcoding_system_p (coding_system)))
6586     return coding_system;
6587   while (1)  /* Signal again should Fsignal ever return.  */
6588     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6589 }
6590 \f
6591 Lisp_Object
6592 detect_coding_system (src, src_bytes, highest, multibytep)
6593      const unsigned char *src;  /* Bytes to examine.  */
6594      int src_bytes, highest;  /* Length of SRC; non-zero HIGHEST asks for one result.  */
6595      int multibytep;  /* Passed on to detect_coding_mask.  */
6596 {
6597   int coding_mask, eol_type;
6598   Lisp_Object val, tmp;
6599   int dummy;
6600   /* Return the coding systems detected for SRC: a list, or just the first candidate if HIGHEST.  */
6601   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6602   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6603   if (eol_type == CODING_EOL_INCONSISTENT)
6604     eol_type = CODING_EOL_UNDECIDED;
6605   /* No category detected: answer `undecided' or its subsidiary for EOL_TYPE.  */
6606   if (!coding_mask)
6607     {
6608       val = Qundecided;
6609       if (eol_type != CODING_EOL_UNDECIDED)
6610         {
6611           Lisp_Object val2;
6612           val2 = Fget (Qundecided, Qeol_type);
6613           if (VECTORP (val2))
6614             val = XVECTOR (val2)->contents[eol_type];
6615         }
6616       return (highest ? val : Fcons (val, Qnil));
6617     }
6618
6619   /* At first, gather possible coding systems in VAL.  */
6620   val = Qnil;
6621   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6622     {
6623       Lisp_Object category_val, category_index;
6624
6625       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6626       category_val = Fsymbol_value (XCAR (tmp));
6627       if (!NILP (category_val)
6628           && NATNUMP (category_index)
6629           && (coding_mask & (1 << XFASTINT (category_index))))
6630         {
6631           val = Fcons (category_val, val);
6632           if (highest)  /* The first match in list order is enough.  */
6633             break;
6634         }
6635     }
6636   if (!highest)  /* Consing reversed the order; restore it.  */
6637     val = Fnreverse (val);
6638
6639   /* Then, replace the elements with subsidiary coding systems.  */
6640   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6641     {
6642       if (eol_type != CODING_EOL_UNDECIDED
6643           && eol_type != CODING_EOL_INCONSISTENT)
6644         {
6645           Lisp_Object eol;
6646           eol = Fget (XCAR (tmp), Qeol_type);
6647           if (VECTORP (eol))
6648             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6649         }
6650     }  /* VAL is still nil if no listed category qualified; do not take XCAR of it.  */
6651   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
6652 }
6653
6654 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6655        2, 3, 0,
6656        doc: /* Detect how the byte sequence in the region is encoded.
6657 Return a list of possible coding systems used on decoding a byte
6658 sequence containing the bytes in the region between START and END when
6659 the coding system `undecided' is specified.  The list is ordered by
6660 priority decided in the current language environment.
6661
6662 If only ASCII characters are found, it returns a list of single element
6663 `undecided' or its subsidiary coding system according to a detected
6664 end-of-line format.
6665
6666 If optional argument HIGHEST is non-nil, return the coding system of
6667 highest priority.  */)
6668      (start, end, highest)
6669      Lisp_Object start, end, highest;
6670 {
6671   int from, to;
6672   int from_byte, to_byte;
6673   int include_anchor_byte = 0;
6674
6675   CHECK_NUMBER_COERCE_MARKER (start);
6676   CHECK_NUMBER_COERCE_MARKER (end);
6677
6678   validate_region (&start, &end);
6679   from = XINT (start), to = XINT (end);
6680   from_byte = CHAR_TO_BYTE (from);
6681   to_byte = CHAR_TO_BYTE (to);
6682   /* If the gap lies within the region, move it to the end of the region.  */
6683   if (from < GPT && to >= GPT)
6684     move_gap_both (to, to_byte);
6685   /* If an anchor byte `\0' follows the region, we include it in
6686      the detecting source.  Then code detectors can handle the tailing
6687      byte sequence more accurately.
6688
6689      Fix me: This is not a perfect solution.  It is better that we
6690      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6691   */
6692   if (to == Z || (to == GPT && GAP_SIZE > 0))
6693     include_anchor_byte = 1;
6694   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6695                                to_byte - from_byte + include_anchor_byte,
6696                                !NILP (highest),
6697                                !NILP (current_buffer
6698                                       ->enable_multibyte_characters));
6699 }
6700
6701 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6702        1, 2, 0,
6703        doc: /* Detect how the byte sequence in STRING is encoded.
6704 Return a list of possible coding systems used on decoding a byte
6705 sequence containing the bytes in STRING when the coding system
6706 `undecided' is specified.  The list is ordered by priority decided in
6707 the current language environment.
6708
6709 If only ASCII characters are found, it returns a list of single element
6710 `undecided' or its subsidiary coding system according to a detected
6711 end-of-line format.
6712
6713 If optional argument HIGHEST is non-nil, return the coding system of
6714 highest priority.  */)
6715      (string, highest)
6716      Lisp_Object string, highest;
6717 {
6718   CHECK_STRING (string);
6719   /* Hand the string's bytes, plus one anchor byte, to detect_coding_system.  */
6720   return detect_coding_system (SDATA (string),
6721                                /* "+ 1" is to include the anchor byte
6722                                   `\0'.  With this, code detectors can
6723                                   handle the tailing bytes more
6724                                   accurately.  */
6725                                SBYTES (string) + 1,
6726                                !NILP (highest),
6727                                STRING_MULTIBYTE (string));
6728 }
6729
6730 /*  Subroutine for Ffind_coding_systems_region_internal.
6731
6732     Return the list of coding systems that can safely encode the
6733     multibyte text between P and PEND.  SAFE_CODINGS is an alist of
6734     candidate coding systems; unsafe ones are removed from it.  If it
6735     is nil, only the single-byte check described below is performed.
6736
6737     WORK_TABLE is a char-table whose element for a character is set
6738     to t once that character has been checked.
6739
6740     If a non-ASCII single byte char is found, set
6741     *single_byte_char_found to 1.  */
6742
6743 static Lisp_Object
6744 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6745      unsigned char *p, *pend;
6746      Lisp_Object safe_codings, work_table;
6747      int *single_byte_char_found;
6748 {
6749   int c, len;
6750   Lisp_Object val, ch;
6751   Lisp_Object prev, tail;
6752
6753   if (NILP (safe_codings))
6754     goto done_safe_codings;
6755   while (p < pend)
6756     {
6757       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6758       p += len;
6759       if (ASCII_BYTE_P (c))
6760         /* We can ignore ASCII characters here.  */
6761         continue;
6762       if (SINGLE_BYTE_CHAR_P (c))
6763         *single_byte_char_found = 1;
6764       /* Check the safe coding systems for C.  */
6765       ch = make_number (c);
6766       val = Faref (work_table, ch);
6767       if (EQ (val, Qt))
6768         /* This element was already checked.  Ignore it.  */
6769         continue;
6770       /* Remember that we checked this element.  */
6771       Faset (work_table, ch, Qt);
6772       /* PREV trails TAIL: it is the last element kept so far.  */
6773       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6774         {
6775           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6776           int encodable;
6777           /* The three tables are set, and read, only when !ENCODABLE.  */
6778           elt = XCAR (tail);
6779           if (CONSP (XCDR (elt)))
6780             {
6781               /* This entry has this format now:
6782                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6783                           ACCEPT-LATIN-EXTRA ) */
6784               val = XCDR (elt);
6785               encodable = ! NILP (Faref (XCAR (val), ch));
6786               if (! encodable)
6787                 {
6788                   val = XCDR (val);
6789                   translation_table = XCAR (val);
6790                   hash_table = XCAR (XCDR (val));
6791                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6792                 }
6793             }
6794           else
6795             {
6796               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6797               encodable = ! NILP (Faref (XCDR (elt), ch));
6798               if (! encodable)
6799                 {
6800                   /* Transform the format to:
6801                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6802                        ACCEPT-LATIN-EXTRA )  */
6803                   val = Fget (XCAR (elt), Qcoding_system);
6804                   translation_table
6805                     = Fplist_get (AREF (val, 3),
6806                                   Qtranslation_table_for_encode);
6807                   if (SYMBOLP (translation_table))
6808                     translation_table = Fget (translation_table,
6809                                               Qtranslation_table);
6810                   hash_table
6811                     = (CHAR_TABLE_P (translation_table)
6812                        ? XCHAR_TABLE (translation_table)->extras[1]
6813                        : Qnil);
6814                   accept_latin_extra
6815                     = ((EQ (AREF (val, 0), make_number (2))
6816                         && VECTORP (AREF (val, 4)))
6817                        ? AREF (AREF (val, 4), 16)
6818                        : Qnil);
6819                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6820                                         translation_table, hash_table,
6821                                         accept_latin_extra));
6822                 }
6823             }
6824           /* Not in SAFE-CHARS: C may still be accepted via one of the tables.  */
6825           if (! encodable
6826               && ((CHAR_TABLE_P (translation_table)
6827                    && ! NILP (Faref (translation_table, ch)))
6828                   || (HASH_TABLE_P (hash_table)
6829                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6830                   || (SINGLE_BYTE_CHAR_P (c)
6831                       && ! NILP (accept_latin_extra)
6832                       && VECTORP (Vlatin_extra_code_table)
6833                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6834             encodable = 1;
6835           if (encodable)
6836             prev = tail;
6837           else
6838             {
6839               /* Exclude this coding system from SAFE_CODINGS.  */
6840               if (EQ (tail, safe_codings))
6841                 {
6842                   safe_codings = XCDR (safe_codings);
6843                   if (NILP (safe_codings))
6844                     goto done_safe_codings;
6845                 }
6846               else
6847                 XSETCDR (prev, XCDR (tail));
6848             }
6849         }
6850     }
6851
6852  done_safe_codings:
6853   /* If the above loop was terminated before P reaches PEND, it means
6854      SAFE_CODINGS was set to nil.  If we have not yet found a
6855      non-ASCII single-byte char, scan the rest of the text for one.  */
6856   if (! *single_byte_char_found)
6857     while (p < pend)
6858       {
6859         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6860         p += len;
6861         if (! ASCII_BYTE_P (c)
6862             && SINGLE_BYTE_CHAR_P (c))
6863           {
6864             *single_byte_char_found = 1;
6865             break;
6866           }
6867       }
6868   return safe_codings;
6869 }
6870
6871 DEFUN ("find-coding-systems-region-internal",
6872        Ffind_coding_systems_region_internal,
6873        Sfind_coding_systems_region_internal, 2, 2, 0,
6874        doc: /* Internal use only.  */)
6875      (start, end)
6876      Lisp_Object start, end;
6877 {
6878   Lisp_Object work_table, safe_codings;
6879   int non_ascii_p = 0;
6880   int single_byte_char_found = 0;
6881   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6882
6883   if (STRINGP (start))
6884     {
6885       if (!STRING_MULTIBYTE (start))
6886         return Qt;
6887       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6888       p2 = p2end = p1end;
6889       if (SCHARS (start) != SBYTES (start))
6890         non_ascii_p = 1;
6891     }
6892   else
6893     {
6894       int from, to, stop;
6895
6896       CHECK_NUMBER_COERCE_MARKER (start);
6897       CHECK_NUMBER_COERCE_MARKER (end);
6898       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6899         args_out_of_range (start, end);
6900       if (NILP (current_buffer->enable_multibyte_characters))
6901         return Qt;
6902       from = CHAR_TO_BYTE (XINT (start));
6903       to = CHAR_TO_BYTE (XINT (end));
6904       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6905       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6906       if (stop == to)
6907         p2 = p2end = p1end;
6908       else
6909         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6910       if (XINT (end) - XINT (start) != to - from)
6911         non_ascii_p = 1;
6912     }
6913
6914   if (!non_ascii_p)
6915     {
6916       /* We are sure that the text contains no multibyte character.
6917          Check if it contains eight-bit-graphic.  */
6918       p = p1;
6919       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6920       if (p == p1end)
6921         {
6922           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6923           if (p == p2end)
6924             return Qt;
6925         }
6926     }
6927
6928   /* The text contains non-ASCII characters.  */
6929
6930   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6931   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6932
6933   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6934                                     &single_byte_char_found);
6935   if (p2 < p2end)
6936     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6937                                       &single_byte_char_found);
6938   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6939     safe_codings = Qt;
6940   else
6941     {
6942       /* Turn safe_codings to a list of coding systems... */
6943       Lisp_Object val;
6944
6945       if (single_byte_char_found)
6946         /* ... and append these for eight-bit chars.  */
6947         val = Fcons (Qraw_text,
6948                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6949       else
6950         /* ... and append generic coding systems.  */
6951         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6952
6953       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6954         val = Fcons (XCAR (XCAR (safe_codings)), val);
6955       safe_codings = val;
6956     }
6957
6958   return safe_codings;
6959 }
6960
6961
6962 /* Search from position POS for such characters that are unencodable
6963    accoding to SAFE_CHARS, and return a list of their positions.  P
6964    points where in the memory the character at POS exists.  Limit the
6965    search at PEND or when Nth unencodable characters are found.
6966
6967    If SAFE_CHARS is a char table, an element for an unencodable
6968    character is nil.
6969
6970    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6971
6972    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6973    eight-bit-graphic characters are unencodable.  */
6974
6975 static Lisp_Object
6976 unencodable_char_position (safe_chars, pos, p, pend, n)
6977      Lisp_Object safe_chars;
6978      int pos;
6979      unsigned char *p, *pend;
6980      int n;
6981 {
6982   Lisp_Object pos_list;
6983
6984   pos_list = Qnil;
6985   while (p < pend)
6986     {
6987       int len;
6988       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6989
6990       if (c >= 128
6991           && (CHAR_TABLE_P (safe_chars)
6992               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6993               : (NILP (safe_chars) || c < 256)))
6994         {
6995           pos_list = Fcons (make_number (pos), pos_list);
6996           if (--n <= 0)
6997             break;
6998         }
6999       pos++;
7000       p += len;
7001     }
7002   return Fnreverse (pos_list);
7003 }
7004
7005
7006 DEFUN ("unencodable-char-position", Funencodable_char_position,
7007        Sunencodable_char_position, 3, 5, 0,
7008        doc: /*
7009 Return position of first un-encodable character in a region.
7010 START and END specfiy the region and CODING-SYSTEM specifies the
7011 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7012
7013 If optional 4th argument COUNT is non-nil, it specifies at most how
7014 many un-encodable characters to search.  In this case, the value is a
7015 list of positions.
7016
7017 If optional 5th argument STRING is non-nil, it is a string to search
7018 for un-encodable characters.  In that case, START and END are indexes
7019 to the string.  */)
7020      (start, end, coding_system, count, string)
7021      Lisp_Object start, end, coding_system, count, string;
7022 {
7023   int n;
7024   Lisp_Object safe_chars;
7025   struct coding_system coding;
7026   Lisp_Object positions;
7027   int from, to;
7028   unsigned char *p, *pend;
7029
7030   if (NILP (string))
7031     {
7032       validate_region (&start, &end);
7033       from = XINT (start);
7034       to = XINT (end);
7035       if (NILP (current_buffer->enable_multibyte_characters))
7036         return Qnil;
7037       p = CHAR_POS_ADDR (from);
7038       if (to == GPT)
7039         pend = GPT_ADDR;
7040       else
7041         pend = CHAR_POS_ADDR (to);
7042     }
7043   else
7044     {
7045       CHECK_STRING (string);
7046       CHECK_NATNUM (start);
7047       CHECK_NATNUM (end);
7048       from = XINT (start);
7049       to = XINT (end);
7050       if (from > to
7051           || to > SCHARS (string))
7052         args_out_of_range_3 (string, start, end);
7053       if (! STRING_MULTIBYTE (string))
7054         return Qnil;
7055       p = SDATA (string) + string_char_to_byte (string, from);
7056       pend = SDATA (string) + string_char_to_byte (string, to);
7057     }
7058
7059   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7060
7061   if (NILP (count))
7062     n = 1;
7063   else
7064     {
7065       CHECK_NATNUM (count);
7066       n = XINT (count);
7067     }
7068
7069   if (coding.type == coding_type_no_conversion
7070       || coding.type == coding_type_raw_text)
7071     return Qnil;
7072
7073   if (coding.type == coding_type_undecided)
7074     safe_chars = Qnil;
7075   else
7076     safe_chars = coding_safe_chars (coding_system);
7077
7078   if (STRINGP (string)
7079       || from >= GPT || to <= GPT)
7080     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7081   else
7082     {
7083       Lisp_Object args[2];
7084
7085       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7086       n -= XINT (Flength (args[0]));
7087       if (n <= 0)
7088         positions = args[0];
7089       else
7090         {
7091           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7092                                                pend, n);
7093           positions = Fappend (2, args);
7094         }
7095     }
7096
7097   return  (NILP (count) ? Fcar (positions) : positions);
7098 }
7099
7100
7101 Lisp_Object
7102 code_convert_region1 (start, end, coding_system, encodep)
7103      Lisp_Object start, end, coding_system;
7104      int encodep;
7105 {
7106   struct coding_system coding;
7107   int from, to;
7108
7109   CHECK_NUMBER_COERCE_MARKER (start);
7110   CHECK_NUMBER_COERCE_MARKER (end);
7111   CHECK_SYMBOL (coding_system);
7112
7113   validate_region (&start, &end);
7114   from = XFASTINT (start);
7115   to = XFASTINT (end);
7116
7117   if (NILP (coding_system) && system_eol_type == CODING_EOL_LF)
7118     return make_number (to - from);
7119
7120   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7121     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7122
7123   coding.mode |= CODING_MODE_LAST_BLOCK;
7124   coding.src_multibyte = coding.dst_multibyte
7125     = !NILP (current_buffer->enable_multibyte_characters);
7126   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7127                        &coding, encodep, 1);
7128   Vlast_coding_system_used = coding.symbol;
7129   return make_number (coding.produced_char);
7130 }
7131
7132 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7133        3, 3, "r\nzCoding system: ",
7134        doc: /* Decode the current region from the specified coding system.
7135 When called from a program, takes three arguments:
7136 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7137 This function sets `last-coding-system-used' to the precise coding system
7138 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7139 not fully specified.)
7140 It returns the length of the decoded text.  */)
7141      (start, end, coding_system)
7142      Lisp_Object start, end, coding_system;
7143 {
7144   return code_convert_region1 (start, end, coding_system, 0);
7145 }
7146
7147 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7148        3, 3, "r\nzCoding system: ",
7149        doc: /* Encode the current region into the specified coding system.
7150 When called from a program, takes three arguments:
7151 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7152 This function sets `last-coding-system-used' to the precise coding system
7153 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7154 not fully specified.)
7155 It returns the length of the encoded text.  */)
7156      (start, end, coding_system)
7157      Lisp_Object start, end, coding_system;
7158 {
7159   return code_convert_region1 (start, end, coding_system, 1);
7160 }
7161
7162 Lisp_Object
7163 code_convert_string1 (string, coding_system, nocopy, encodep)
7164      Lisp_Object string, coding_system, nocopy;
7165      int encodep;
7166 {
7167   struct coding_system coding;
7168
7169   CHECK_STRING (string);
7170   CHECK_SYMBOL (coding_system);
7171
7172   if (NILP (coding_system) && system_eol_type == CODING_EOL_LF)
7173     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7174
7175   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7176     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7177
7178   coding.mode |= CODING_MODE_LAST_BLOCK;
7179   string = (encodep
7180             ? encode_coding_string (string, &coding, !NILP (nocopy))
7181             : decode_coding_string (string, &coding, !NILP (nocopy)));
7182   Vlast_coding_system_used = coding.symbol;
7183
7184   return string;
7185 }
7186
7187 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7188        2, 3, 0,
7189        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7190 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7191 if the decoding operation is trivial.
7192 This function sets `last-coding-system-used' to the precise coding system
7193 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7194 not fully specified.)  */)
7195      (string, coding_system, nocopy)
7196      Lisp_Object string, coding_system, nocopy;
7197 {
7198   return code_convert_string1 (string, coding_system, nocopy, 0);
7199 }
7200
7201 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7202        2, 3, 0,
7203        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7204 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7205 if the encoding operation is trivial.
7206 This function sets `last-coding-system-used' to the precise coding system
7207 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7208 not fully specified.)  */)
7209      (string, coding_system, nocopy)
7210      Lisp_Object string, coding_system, nocopy;
7211 {
7212   return code_convert_string1 (string, coding_system, nocopy, 1);
7213 }
7214
7215 /* Encode or decode STRING according to CODING_SYSTEM.
7216    Do not set Vlast_coding_system_used.
7217
7218    This function is called only from macros DECODE_FILE and
7219    ENCODE_FILE, thus we ignore character composition.  */
7220
7221 Lisp_Object
7222 code_convert_string_norecord (string, coding_system, encodep)
7223      Lisp_Object string, coding_system;
7224      int encodep;
7225 {
7226   struct coding_system coding;
7227
7228   CHECK_STRING (string);
7229   CHECK_SYMBOL (coding_system);
7230
7231   if (NILP (coding_system) && system_eol_type == CODING_EOL_LF)
7232     return string;
7233
7234   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7235     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7236
7237   coding.composing = COMPOSITION_DISABLED;
7238   coding.mode |= CODING_MODE_LAST_BLOCK;
7239   return (encodep
7240           ? encode_coding_string (string, &coding, 1)
7241           : decode_coding_string (string, &coding, 1));
7242 }
7243 \f
7244 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7245        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7246 Return the corresponding character.  */)
7247      (code)
7248      Lisp_Object code;
7249 {
7250   unsigned char c1, c2, s1, s2;
7251   Lisp_Object val;
7252
7253   CHECK_NUMBER (code);
7254   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7255   if (s1 == 0)
7256     {
7257       if (s2 < 0x80)
7258         XSETFASTINT (val, s2);
7259       else if (s2 >= 0xA0 || s2 <= 0xDF)
7260         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7261       else
7262         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7263     }
7264   else
7265     {
7266       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7267           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7268         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7269       DECODE_SJIS (s1, s2, c1, c2);
7270       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7271     }
7272   return val;
7273 }
7274
7275 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7276        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7277 Return the corresponding code in SJIS.  */)
7278      (ch)
7279      Lisp_Object ch;
7280 {
7281   int charset, c1, c2, s1, s2;
7282   Lisp_Object val;
7283
7284   CHECK_NUMBER (ch);
7285   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7286   if (charset == CHARSET_ASCII)
7287     {
7288       val = ch;
7289     }
7290   else if (charset == charset_jisx0208
7291            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7292     {
7293       ENCODE_SJIS (c1, c2, s1, s2);
7294       XSETFASTINT (val, (s1 << 8) | s2);
7295     }
7296   else if (charset == charset_katakana_jisx0201
7297            && c1 > 0x20 && c2 < 0xE0)
7298     {
7299       XSETFASTINT (val, c1 | 0x80);
7300     }
7301   else
7302     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7303   return val;
7304 }
7305
7306 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7307        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7308 Return the corresponding character.  */)
7309      (code)
7310      Lisp_Object code;
7311 {
7312   int charset;
7313   unsigned char b1, b2, c1, c2;
7314   Lisp_Object val;
7315
7316   CHECK_NUMBER (code);
7317   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7318   if (b1 == 0)
7319     {
7320       if (b2 >= 0x80)
7321         error ("Invalid BIG5 code: %x", XFASTINT (code));
7322       val = code;
7323     }
7324   else
7325     {
7326       if ((b1 < 0xA1 || b1 > 0xFE)
7327           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7328         error ("Invalid BIG5 code: %x", XFASTINT (code));
7329       DECODE_BIG5 (b1, b2, charset, c1, c2);
7330       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7331     }
7332   return val;
7333 }
7334
7335 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7336        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7337 Return the corresponding character code in Big5.  */)
7338      (ch)
7339      Lisp_Object ch;
7340 {
7341   int charset, c1, c2, b1, b2;
7342   Lisp_Object val;
7343
7344   CHECK_NUMBER (ch);
7345   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7346   if (charset == CHARSET_ASCII)
7347     {
7348       val = ch;
7349     }
7350   else if ((charset == charset_big5_1
7351             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7352            || (charset == charset_big5_2
7353                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7354     {
7355       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7356       XSETFASTINT (val, (b1 << 8) | b2);
7357     }
7358   else
7359     error ("Can't encode to Big5: %d", XFASTINT (ch));
7360   return val;
7361 }
7362 \f
7363 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7364        Sset_terminal_coding_system_internal, 1, 2, 0,
7365        doc: /* Internal use only.  */)
7366      (coding_system, terminal)
7367      Lisp_Object coding_system;
7368      Lisp_Object terminal;
7369 {
7370   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
7371   CHECK_SYMBOL (coding_system);
7372   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7373   /* We had better not send unsafe characters to terminal.  */
7374   terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7375   /* Character composition should be disabled.  */
7376   terminal_coding->composing = COMPOSITION_DISABLED;
7377   /* Error notification should be suppressed.  */
7378   terminal_coding->suppress_error = 1;
7379   terminal_coding->src_multibyte = 1;
7380   terminal_coding->dst_multibyte = 0;
7381   return Qnil;
7382 }
7383
7384 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7385        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7386        doc: /* Internal use only.  */)
7387      (coding_system)
7388      Lisp_Object coding_system;
7389 {
7390   CHECK_SYMBOL (coding_system);
7391   setup_coding_system (Fcheck_coding_system (coding_system),
7392                        &safe_terminal_coding);
7393   /* Character composition should be disabled.  */
7394   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7395   /* Error notification should be suppressed.  */
7396   safe_terminal_coding.suppress_error = 1;
7397   safe_terminal_coding.src_multibyte = 1;
7398   safe_terminal_coding.dst_multibyte = 0;
7399   return Qnil;
7400 }
7401
7402 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7403        Sterminal_coding_system, 0, 1, 0,
7404        doc: /* Return coding system specified for terminal output on the given terminal.
7405 TERMINAL may be a terminal id, a frame, or nil for the selected
7406 frame's terminal device.  */)
7407      (terminal)
7408      Lisp_Object terminal;
7409 {
7410   return TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1))->symbol;
7411 }
7412
7413 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7414        Sset_keyboard_coding_system_internal, 1, 2, 0,
7415        doc: /* Internal use only.  */)
7416      (coding_system, terminal)
7417      Lisp_Object coding_system;
7418      Lisp_Object terminal;
7419 {
7420   struct terminal *t = get_terminal (terminal, 1);
7421   CHECK_SYMBOL (coding_system);
7422
7423   setup_coding_system (Fcheck_coding_system (coding_system),
7424                        TERMINAL_KEYBOARD_CODING (t));
7425   /* Character composition should be disabled.  */
7426   TERMINAL_KEYBOARD_CODING (t)->composing = COMPOSITION_DISABLED;
7427   return Qnil;
7428 }
7429
7430 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7431        Skeyboard_coding_system, 0, 1, 0,
7432        doc: /* Return coding system for decoding keyboard input on TERMINAL.
7433 TERMINAL may be a terminal id, a frame, or nil for the selected
7434 frame's terminal device.  */)
7435      (terminal)
7436      Lisp_Object terminal;
7437 {
7438   return TERMINAL_KEYBOARD_CODING (get_terminal (terminal, 1))->symbol;
7439 }
7440
7441 \f
7442 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7443        Sfind_operation_coding_system,  1, MANY, 0,
7444        doc: /* Choose a coding system for an operation based on the target name.
7445 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7446 DECODING-SYSTEM is the coding system to use for decoding
7447 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7448 for encoding (in case OPERATION does encoding).
7449
7450 The first argument OPERATION specifies an I/O primitive:
7451   For file I/O, `insert-file-contents' or `write-region'.
7452   For process I/O, `call-process', `call-process-region', or `start-process'.
7453   For network I/O, `open-network-stream'.
7454
7455 The remaining arguments should be the same arguments that were passed
7456 to the primitive.  Depending on which primitive, one of those arguments
7457 is selected as the TARGET.  For example, if OPERATION does file I/O,
7458 whichever argument specifies the file name is TARGET.
7459
7460 TARGET has a meaning which depends on OPERATION:
7461   For file I/O, TARGET is a file name.
7462   For process I/O, TARGET is a process name.
7463   For network I/O, TARGET is a service name or a port number
7464
7465 This function looks up what specified for TARGET in,
7466 `file-coding-system-alist', `process-coding-system-alist',
7467 or `network-coding-system-alist' depending on OPERATION.
7468 They may specify a coding system, a cons of coding systems,
7469 or a function symbol to call.
7470 In the last case, we call the function with one argument,
7471 which is a list of all the arguments given to this function.
7472
7473 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7474      (nargs, args)
7475      int nargs;
7476      Lisp_Object *args;
7477 {
7478   Lisp_Object operation, target_idx, target, val;
7479   register Lisp_Object chain;
7480
7481   if (nargs < 2)
7482     error ("Too few arguments");
7483   operation = args[0];
7484   if (!SYMBOLP (operation)
7485       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7486     error ("Invalid first argument");
7487   if (nargs < 1 + XINT (target_idx))
7488     error ("Too few arguments for operation: %s",
7489            SDATA (SYMBOL_NAME (operation)));
7490   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7491      argument to write-region) is string, it must be treated as a
7492      target file name.  */
7493   if (EQ (operation, Qwrite_region)
7494       && nargs > 5
7495       && STRINGP (args[5]))
7496     target_idx = make_number (4);
7497   target = args[XINT (target_idx) + 1];
7498   if (!(STRINGP (target)
7499         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7500     error ("Invalid argument %d", XINT (target_idx) + 1);
7501
7502   chain = ((EQ (operation, Qinsert_file_contents)
7503             || EQ (operation, Qwrite_region))
7504            ? Vfile_coding_system_alist
7505            : (EQ (operation, Qopen_network_stream)
7506               ? Vnetwork_coding_system_alist
7507               : Vprocess_coding_system_alist));
7508   if (NILP (chain))
7509     return Qnil;
7510
7511   for (; CONSP (chain); chain = XCDR (chain))
7512     {
7513       Lisp_Object elt;
7514       elt = XCAR (chain);
7515
7516       if (CONSP (elt)
7517           && ((STRINGP (target)
7518                && STRINGP (XCAR (elt))
7519                && fast_string_match (XCAR (elt), target) >= 0)
7520               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7521         {
7522           val = XCDR (elt);
7523           /* Here, if VAL is both a valid coding system and a valid
7524              function symbol, we return VAL as a coding system.  */
7525           if (CONSP (val))
7526             return val;
7527           if (! SYMBOLP (val))
7528             return Qnil;
7529           if (! NILP (Fcoding_system_p (val)))
7530             return Fcons (val, val);
7531           if (! NILP (Ffboundp (val)))
7532             {
7533               val = call1 (val, Flist (nargs, args));
7534               if (CONSP (val))
7535                 return val;
7536               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7537                 return Fcons (val, val);
7538             }
7539           return Qnil;
7540         }
7541     }
7542   return Qnil;
7543 }
7544
7545 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7546        Supdate_coding_systems_internal, 0, 0, 0,
7547        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7548 When values of any coding categories are changed, you must
7549 call this function.  */)
7550      ()
7551 {
7552   int i;
7553
7554   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7555     {
7556       Lisp_Object val;
7557
7558       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7559       if (!NILP (val))
7560         {
7561           if (! coding_system_table[i])
7562             coding_system_table[i] = ((struct coding_system *)
7563                                       xmalloc (sizeof (struct coding_system)));
7564           setup_coding_system (val, coding_system_table[i]);
7565         }
7566       else if (coding_system_table[i])
7567         {
7568           xfree (coding_system_table[i]);
7569           coding_system_table[i] = NULL;
7570         }
7571     }
7572
7573   return Qnil;
7574 }
7575
7576 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7577        Sset_coding_priority_internal, 0, 0, 0,
7578        doc: /* Update internal database for the current value of `coding-category-list'.
7579 This function is internal use only.  */)
7580      ()
7581 {
7582   int i = 0, idx;
7583   Lisp_Object val;
7584
7585   val = Vcoding_category_list;
7586
7587   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7588     {
7589       if (! SYMBOLP (XCAR (val)))
7590         break;
7591       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7592       if (idx >= CODING_CATEGORY_IDX_MAX)
7593         break;
7594       coding_priorities[i++] = (1 << idx);
7595       val = XCDR (val);
7596     }
7597   /* If coding-category-list is valid and contains all coding
7598      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7599      the following code saves Emacs from crashing.  */
7600   while (i < CODING_CATEGORY_IDX_MAX)
7601     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7602
7603   return Qnil;
7604 }
7605
7606 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7607        Sdefine_coding_system_internal, 1, 1, 0,
7608        doc: /* Register CODING-SYSTEM as a base coding system.
7609 This function is internal use only.  */)
7610      (coding_system)
7611      Lisp_Object coding_system;
7612 {
7613   Lisp_Object safe_chars, slot;
7614
7615   if (NILP (Fcheck_coding_system (coding_system)))
7616     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7617   safe_chars = coding_safe_chars (coding_system);
7618   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7619     error ("No valid safe-chars property for %s",
7620            SDATA (SYMBOL_NAME (coding_system)));
7621   if (EQ (safe_chars, Qt))
7622     {
7623       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7624         XSETCAR (Vcoding_system_safe_chars,
7625                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7626     }
7627   else
7628     {
7629       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7630       if (NILP (slot))
7631         XSETCDR (Vcoding_system_safe_chars,
7632                  nconc2 (XCDR (Vcoding_system_safe_chars),
7633                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7634       else
7635         XSETCDR (slot, safe_chars);
7636     }
7637   return Qnil;
7638 }
7639
7640 #endif /* emacs */
7641
7642 \f
7643 /*** 9. Post-amble ***/
7644
7645 void
7646 init_coding_once ()
7647 {
7648   int i;
7649
7650   /* Emacs' internal format specific initialize routine.  */
7651   for (i = 0; i <= 0x20; i++)
7652     emacs_code_class[i] = EMACS_control_code;
7653   emacs_code_class[0x0A] = EMACS_linefeed_code;
7654   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7655   for (i = 0x21 ; i < 0x7F; i++)
7656     emacs_code_class[i] = EMACS_ascii_code;
7657   emacs_code_class[0x7F] = EMACS_control_code;
7658   for (i = 0x80; i < 0xFF; i++)
7659     emacs_code_class[i] = EMACS_invalid_code;
7660   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7661   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7662   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7663   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7664
7665   /* ISO2022 specific initialize routine.  */
7666   for (i = 0; i < 0x20; i++)
7667     iso_code_class[i] = ISO_control_0;
7668   for (i = 0x21; i < 0x7F; i++)
7669     iso_code_class[i] = ISO_graphic_plane_0;
7670   for (i = 0x80; i < 0xA0; i++)
7671     iso_code_class[i] = ISO_control_1;
7672   for (i = 0xA1; i < 0xFF; i++)
7673     iso_code_class[i] = ISO_graphic_plane_1;
7674   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7675   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7676   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7677   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7678   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7679   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7680   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7681   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7682   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7683   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7684
7685   setup_coding_system (Qnil, &safe_terminal_coding);
7686   setup_coding_system (Qnil, &default_buffer_file_coding);
7687
7688   bzero (coding_system_table, sizeof coding_system_table);
7689
7690   bzero (ascii_skip_code, sizeof ascii_skip_code);
7691   for (i = 0; i < 128; i++)
7692     ascii_skip_code[i] = 1;
7693
7694 #if defined (MSDOS) || defined (WINDOWSNT)
7695   system_eol_type = CODING_EOL_CRLF;
7696 #else
7697   system_eol_type = CODING_EOL_LF;
7698 #endif
7699
7700   inhibit_pre_post_conversion = 0;
7701 }
7702
7703 #ifdef emacs
7704
7705 void
7706 syms_of_coding ()
7707 {
7708   staticpro (&Vcode_conversion_workbuf_name);
7709   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7710
7711   Qtarget_idx = intern ("target-idx");
7712   staticpro (&Qtarget_idx);
7713
7714   Qcoding_system_history = intern ("coding-system-history");
7715   staticpro (&Qcoding_system_history);
7716   Fset (Qcoding_system_history, Qnil);
7717
7718   /* Target FILENAME is the first argument.  */
7719   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7720   /* Target FILENAME is the third argument.  */
7721   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7722
7723   Qcall_process = intern ("call-process");
7724   staticpro (&Qcall_process);
7725   /* Target PROGRAM is the first argument.  */
7726   Fput (Qcall_process, Qtarget_idx, make_number (0));
7727
7728   Qcall_process_region = intern ("call-process-region");
7729   staticpro (&Qcall_process_region);
7730   /* Target PROGRAM is the third argument.  */
7731   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7732
7733   Qstart_process = intern ("start-process");
7734   staticpro (&Qstart_process);
7735   /* Target PROGRAM is the third argument.  */
7736   Fput (Qstart_process, Qtarget_idx, make_number (2));
7737
7738   Qopen_network_stream = intern ("open-network-stream");
7739   staticpro (&Qopen_network_stream);
7740   /* Target SERVICE is the fourth argument.  */
7741   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7742
7743   Qcoding_system = intern ("coding-system");
7744   staticpro (&Qcoding_system);
7745
7746   Qeol_type = intern ("eol-type");
7747   staticpro (&Qeol_type);
7748
7749   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7750   staticpro (&Qbuffer_file_coding_system);
7751
7752   Qpost_read_conversion = intern ("post-read-conversion");
7753   staticpro (&Qpost_read_conversion);
7754
7755   Qpre_write_conversion = intern ("pre-write-conversion");
7756   staticpro (&Qpre_write_conversion);
7757
7758   Qno_conversion = intern ("no-conversion");
7759   staticpro (&Qno_conversion);
7760
7761   Qundecided = intern ("undecided");
7762   staticpro (&Qundecided);
7763
7764   Qcoding_system_p = intern ("coding-system-p");
7765   staticpro (&Qcoding_system_p);
7766
7767   Qcoding_system_error = intern ("coding-system-error");
7768   staticpro (&Qcoding_system_error);
7769
7770   Fput (Qcoding_system_error, Qerror_conditions,
7771         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7772   Fput (Qcoding_system_error, Qerror_message,
7773         build_string ("Invalid coding system"));
7774
7775   Qcoding_category = intern ("coding-category");
7776   staticpro (&Qcoding_category);
7777   Qcoding_category_index = intern ("coding-category-index");
7778   staticpro (&Qcoding_category_index);
7779
7780   Vcoding_category_table
7781     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7782   staticpro (&Vcoding_category_table);
7783   {
7784     int i;
7785     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7786       {
7787         XVECTOR (Vcoding_category_table)->contents[i]
7788           = intern (coding_category_name[i]);
7789         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7790               Qcoding_category_index, make_number (i));
7791       }
7792   }
7793
7794   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7795   staticpro (&Vcoding_system_safe_chars);
7796
7797   Qtranslation_table = intern ("translation-table");
7798   staticpro (&Qtranslation_table);
7799   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7800
7801   Qtranslation_table_id = intern ("translation-table-id");
7802   staticpro (&Qtranslation_table_id);
7803
7804   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7805   staticpro (&Qtranslation_table_for_decode);
7806
7807   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7808   staticpro (&Qtranslation_table_for_encode);
7809
7810   Qsafe_chars = intern ("safe-chars");
7811   staticpro (&Qsafe_chars);
7812
7813   Qchar_coding_system = intern ("char-coding-system");
7814   staticpro (&Qchar_coding_system);
7815
7816   /* Intern this now in case it isn't already done.
7817      Setting this variable twice is harmless.
7818      But don't staticpro it here--that is done in alloc.c.  */
7819   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7820   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7821   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7822
7823   Qvalid_codes = intern ("valid-codes");
7824   staticpro (&Qvalid_codes);
7825
7826   Qemacs_mule = intern ("emacs-mule");
7827   staticpro (&Qemacs_mule);
7828
7829   Qraw_text = intern ("raw-text");
7830   staticpro (&Qraw_text);
7831
7832   Qutf_8 = intern ("utf-8");
7833   staticpro (&Qutf_8);
7834
7835   Qcoding_system_define_form = intern ("coding-system-define-form");
7836   staticpro (&Qcoding_system_define_form);
7837
7838   defsubr (&Scoding_system_p);
7839   defsubr (&Sread_coding_system);
7840   defsubr (&Sread_non_nil_coding_system);
7841   defsubr (&Scheck_coding_system);
7842   defsubr (&Sdetect_coding_region);
7843   defsubr (&Sdetect_coding_string);
7844   defsubr (&Sfind_coding_systems_region_internal);
7845   defsubr (&Sunencodable_char_position);
7846   defsubr (&Sdecode_coding_region);
7847   defsubr (&Sencode_coding_region);
7848   defsubr (&Sdecode_coding_string);
7849   defsubr (&Sencode_coding_string);
7850   defsubr (&Sdecode_sjis_char);
7851   defsubr (&Sencode_sjis_char);
7852   defsubr (&Sdecode_big5_char);
7853   defsubr (&Sencode_big5_char);
7854   defsubr (&Sset_terminal_coding_system_internal);
7855   defsubr (&Sset_safe_terminal_coding_system_internal);
7856   defsubr (&Sterminal_coding_system);
7857   defsubr (&Sset_keyboard_coding_system_internal);
7858   defsubr (&Skeyboard_coding_system);
7859   defsubr (&Sfind_operation_coding_system);
7860   defsubr (&Supdate_coding_systems_internal);
7861   defsubr (&Sset_coding_priority_internal);
7862   defsubr (&Sdefine_coding_system_internal);
7863
7864   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7865                doc: /* List of coding systems.
7866
7867 Do not alter the value of this variable manually.  This variable should be
7868 updated by the functions `make-coding-system' and
7869 `define-coding-system-alias'.  */);
7870   Vcoding_system_list = Qnil;
7871
7872   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7873                doc: /* Alist of coding system names.
7874 Each element is one element list of coding system name.
7875 This variable is given to `completing-read' as TABLE argument.
7876
7877 Do not alter the value of this variable manually.  This variable should be
7878 updated by the functions `make-coding-system' and
7879 `define-coding-system-alias'.  */);
7880   Vcoding_system_alist = Qnil;
7881
7882   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7883                doc: /* List of coding-categories (symbols) ordered by priority.
7884
7885 On detecting a coding system, Emacs tries code detection algorithms
7886 associated with each coding-category one by one in this order.  When
7887 one algorithm agrees with a byte sequence of source text, the coding
7888 system bound to the corresponding coding-category is selected.
7889
7890 Don't modify this variable directly, but use `set-coding-priority'.  */);
7891   {
7892     int i;
7893
7894     Vcoding_category_list = Qnil;
7895     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7896       Vcoding_category_list
7897         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7898                  Vcoding_category_list);
7899   }
7900
7901   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7902                doc: /* Specify the coding system for read operations.
7903 It is useful to bind this variable with `let', but do not set it globally.
7904 If the value is a coding system, it is used for decoding on read operation.
7905 If not, an appropriate element is used from one of the coding system alists:
7906 There are three such tables, `file-coding-system-alist',
7907 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7908   Vcoding_system_for_read = Qnil;
7909
7910   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7911                doc: /* Specify the coding system for write operations.
7912 Programs bind this variable with `let', but you should not set it globally.
7913 If the value is a coding system, it is used for encoding of output,
7914 when writing it to a file and when sending it to a file or subprocess.
7915
7916 If this does not specify a coding system, an appropriate element
7917 is used from one of the coding system alists:
7918 There are three such tables, `file-coding-system-alist',
7919 `process-coding-system-alist', and `network-coding-system-alist'.
7920 For output to files, if the above procedure does not specify a coding system,
7921 the value of `buffer-file-coding-system' is used.  */);
7922   Vcoding_system_for_write = Qnil;
7923
7924   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7925                doc: /* Coding system used in the latest file or process I/O.
7926 Also set by `encode-coding-region', `decode-coding-region',
7927 `encode-coding-string' and `decode-coding-string'.  */);
7928   Vlast_coding_system_used = Qnil;
7929
7930   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7931                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7932 See info node `Coding Systems' and info node `Text and Binary' concerning
7933 such conversion.  */);
7934   inhibit_eol_conversion = 0;
7935
7936   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7937                doc: /* Non-nil means process buffer inherits coding system of process output.
7938 Bind it to t if the process output is to be treated as if it were a file
7939 read from some filesystem.  */);
7940   inherit_process_coding_system = 0;
7941
7942   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7943                doc: /* Alist to decide a coding system to use for a file I/O operation.
7944 The format is ((PATTERN . VAL) ...),
7945 where PATTERN is a regular expression matching a file name,
7946 VAL is a coding system, a cons of coding systems, or a function symbol.
7947 If VAL is a coding system, it is used for both decoding and encoding
7948 the file contents.
7949 If VAL is a cons of coding systems, the car part is used for decoding,
7950 and the cdr part is used for encoding.
7951 If VAL is a function symbol, the function must return a coding system
7952 or a cons of coding systems which are used as above.  The function gets
7953 the arguments with which `find-operation-coding-system' was called.
7954
7955 See also the function `find-operation-coding-system'
7956 and the variable `auto-coding-alist'.  */);
7957   Vfile_coding_system_alist = Qnil;
7958
7959   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7960     doc: /* Alist to decide a coding system to use for a process I/O operation.
7961 The format is ((PATTERN . VAL) ...),
7962 where PATTERN is a regular expression matching a program name,
7963 VAL is a coding system, a cons of coding systems, or a function symbol.
7964 If VAL is a coding system, it is used for both decoding what received
7965 from the program and encoding what sent to the program.
7966 If VAL is a cons of coding systems, the car part is used for decoding,
7967 and the cdr part is used for encoding.
7968 If VAL is a function symbol, the function must return a coding system
7969 or a cons of coding systems which are used as above.
7970
7971 See also the function `find-operation-coding-system'.  */);
7972   Vprocess_coding_system_alist = Qnil;
7973
7974   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7975     doc: /* Alist to decide a coding system to use for a network I/O operation.
7976 The format is ((PATTERN . VAL) ...),
7977 where PATTERN is a regular expression matching a network service name
7978 or is a port number to connect to,
7979 VAL is a coding system, a cons of coding systems, or a function symbol.
7980 If VAL is a coding system, it is used for both decoding what received
7981 from the network stream and encoding what sent to the network stream.
7982 If VAL is a cons of coding systems, the car part is used for decoding,
7983 and the cdr part is used for encoding.
7984 If VAL is a function symbol, the function must return a coding system
7985 or a cons of coding systems which are used as above.
7986
7987 See also the function `find-operation-coding-system'.  */);
7988   Vnetwork_coding_system_alist = Qnil;
7989
7990   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7991                doc: /* Coding system to use with system messages.
7992 Also used for decoding keyboard input on X Window system.  */);
7993   Vlocale_coding_system = Qnil;
7994
7995   /* The eol mnemonics are reset in startup.el system-dependently.  */
7996   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7997                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7998   eol_mnemonic_unix = build_string (":");
7999
8000   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8001                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8002   eol_mnemonic_dos = build_string ("\\");
8003
8004   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8005                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8006   eol_mnemonic_mac = build_string ("/");
8007
8008   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8009                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8010   eol_mnemonic_undecided = build_string (":");
8011
8012   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8013                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8014   Venable_character_translation = Qt;
8015
8016   DEFVAR_LISP ("standard-translation-table-for-decode",
8017                &Vstandard_translation_table_for_decode,
8018                doc: /* Table for translating characters while decoding.  */);
8019   Vstandard_translation_table_for_decode = Qnil;
8020
8021   DEFVAR_LISP ("standard-translation-table-for-encode",
8022                &Vstandard_translation_table_for_encode,
8023                doc: /* Table for translating characters while encoding.  */);
8024   Vstandard_translation_table_for_encode = Qnil;
8025
8026   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8027                doc: /* Alist of charsets vs revision numbers.
8028 While encoding, if a charset (car part of an element) is found,
8029 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8030   Vcharset_revision_alist = Qnil;
8031
8032   DEFVAR_LISP ("default-process-coding-system",
8033                &Vdefault_process_coding_system,
8034                doc: /* Cons of coding systems used for process I/O by default.
8035 The car part is used for decoding a process output,
8036 the cdr part is used for encoding a text to be sent to a process.  */);
8037   Vdefault_process_coding_system = Qnil;
8038
8039   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8040                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8041 This is a vector of length 256.
8042 If Nth element is non-nil, the existence of code N in a file
8043 \(or output of subprocess) doesn't prevent it to be detected as
8044 a coding system of ISO 2022 variant which has a flag
8045 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8046 or reading output of a subprocess.
8047 Only 128th through 159th elements has a meaning.  */);
8048   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8049
8050   DEFVAR_LISP ("select-safe-coding-system-function",
8051                &Vselect_safe_coding_system_function,
8052                doc: /* Function to call to select safe coding system for encoding a text.
8053
8054 If set, this function is called to force a user to select a proper
8055 coding system which can encode the text in the case that a default
8056 coding system used in each operation can't encode the text.
8057
8058 The default value is `select-safe-coding-system' (which see).  */);
8059   Vselect_safe_coding_system_function = Qnil;
8060
8061   DEFVAR_BOOL ("coding-system-require-warning",
8062                &coding_system_require_warning,
8063                doc: /* Internal use only.
8064 If non-nil, on writing a file, `select-safe-coding-system-function' is
8065 called even if `coding-system-for-write' is non-nil.  The command
8066 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8067   coding_system_require_warning = 0;
8068
8069
8070   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8071                &inhibit_iso_escape_detection,
8072                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8073
8074 By default, on reading a file, Emacs tries to detect how the text is
8075 encoded.  This code detection is sensitive to escape sequences.  If
8076 the sequence is valid as ISO2022, the code is determined as one of
8077 the ISO2022 encodings, and the file is decoded by the corresponding
8078 coding system (e.g. `iso-2022-7bit').
8079
8080 However, there may be a case that you want to read escape sequences in
8081 a file as is.  In such a case, you can set this variable to non-nil.
8082 Then, as the code detection ignores any escape sequences, no file is
8083 detected as encoded in some ISO2022 encoding.  The result is that all
8084 escape sequences become visible in a buffer.
8085
8086 The default value is nil, and it is strongly recommended not to change
8087 it.  That is because many Emacs Lisp source files that contain
8088 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8089 in Emacs's distribution, and they won't be decoded correctly on
8090 reading if you suppress escape sequence detection.
8091
8092 The other way to read escape sequences in a file without decoding is
8093 to explicitly specify some coding system that doesn't use ISO2022's
8094 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8095   inhibit_iso_escape_detection = 0;
8096
8097   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8098                doc: /* Char table for translating self-inserting characters.
8099 This is applied to the result of input methods, not their input.  See also
8100 `keyboard-translate-table'.  */);
8101     Vtranslation_table_for_input = Qnil;
8102 }
8103
8104 char *
8105 emacs_strerror (error_number)
8106      int error_number;
8107 {
8108   char *str;
8109
8110   synchronize_system_messages_locale ();
8111   str = strerror (error_number);
8112
8113   if (! NILP (Vlocale_coding_system))
8114     {
8115       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8116                                                       Vlocale_coding_system,
8117                                                       0);
8118       str = (char *) SDATA (dec);
8119     }
8120
8121   return str;
8122 }
8123
8124 #endif /* emacs */
8125
8126 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8127    (do not change this comment) */