src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7
   8 This file is part of GNU Emacs.
   9
  10 GNU Emacs is free software; you can redistribute it and/or modify
  11 it under the terms of the GNU General Public License as published by
  12 the Free Software Foundation; either version 2, or (at your option)
  13 any later version.
  14
  15 GNU Emacs is distributed in the hope that it will be useful,
  16 but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 GNU General Public License for more details.
  19
  20 You should have received a copy of the GNU General Public License
  21 along with GNU Emacs; see the file COPYING.  If not, write to
  22 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  23 Boston, MA 02110-1301, USA.  */
  24
  25 /*** TABLE OF CONTENTS ***
  26
  27   0. General comments
  28   1. Preamble
  29   2. Emacs' internal format (emacs-mule) handlers
  30   3. ISO2022 handlers
  31   4. Shift-JIS and BIG5 handlers
  32   5. CCL handlers
  33   6. End-of-line handlers
  34   7. C library functions
  35   8. Emacs Lisp library functions
  36   9. Post-amble
  37
  38 */
  39
  40 /*** 0. General comments ***/
  41
  42
  43 /*** GENERAL NOTE on CODING SYSTEMS ***
  44
  45   A coding system is an encoding mechanism for one or more character
  46   sets.  Here's a list of coding systems which Emacs can handle.  When
  47   we say "decode", it means converting some other coding system to
  48   Emacs' internal format (emacs-mule), and when we say "encode",
  49   it means converting the coding system emacs-mule to some other
  50   coding system.
  51
  52   0. Emacs' internal format (emacs-mule)
  53
  54   Emacs itself holds a multi-lingual character in buffers and strings
  55   in a special format.  Details are described in section 2.
  56
  57   1. ISO2022
  58
  59   The most famous coding system for multiple character sets.  X's
  60   Compound Text, various EUCs (Extended Unix Code), and coding
  61   systems used in Internet communication such as ISO-2022-JP are
  62   all variants of ISO2022.  Details are described in section 3.
  63
  64   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  65
  66   A coding system to encode character sets: ASCII, JISX0201, and
  67   JISX0208.  Widely used for PC's in Japan.  Details are described in
  68   section 4.
  69
  70   3. BIG5
  71
  72   A coding system to encode the character sets ASCII and Big5.  Widely
  73   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  74   described in section 4.  In this file, when we write "BIG5"
  75   (all uppercase), we mean the coding system, and when we write
  76   "Big5" (capitalized), we mean the character set.
  77
  78   4. Raw text
  79
  80   A coding system for text containing random 8-bit code.  Emacs does
  81   no code conversion on such text except for end-of-line format.
  82
  83   5. Other
  84
  85   If a user wants to read/write text encoded in a coding system not
  86   listed above, he can supply a decoder and an encoder for it as CCL
  87   (Code Conversion Language) programs.  Emacs executes the CCL program
  88   while reading/writing.
  89
  90   Emacs represents a coding system by a Lisp symbol that has a property
  91   `coding-system'.  But, before actually using the coding system, the
  92   information about it is set in a structure of type `struct
  93   coding_system' for rapid processing.  See section 6 for more details.
  94
  95 */
  96
  97 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  98
  99   How end-of-line of text is encoded depends on the operating system.
 100   For instance, Unix's format is just one byte of `line-feed' code,
 101   whereas DOS's format is two-byte sequence of `carriage-return' and
 102   `line-feed' codes.  MacOS's format is usually one byte of
 103   `carriage-return'.
 104
 105   Since text character encoding and end-of-line encoding are
 106   independent, any coding system described above can have any
 107   end-of-line format.  So Emacs has information about end-of-line
 108   format in each coding-system.  See section 6 for more details.
 109
 110 */
 111
 112 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 113
 114   These functions check if a text between SRC and SRC_END is encoded
 115   in the coding system category XXX.  Each returns an integer value in
 116   which appropriate flag bits for the category XXX are set.  The flag
 117   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 118   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 119   of the range 0x80..0x9F are in multibyte form.  */
 120 #if 0
 121 int
 122 detect_coding_emacs_mule (src, src_end, multibytep)
 123      unsigned char *src, *src_end;
 124      int multibytep;
 125 {
 126   ...
 127 }
 128 #endif
 129
 130 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 131
 132   These functions decode SRC_BYTES length of unibyte text at SOURCE
 133   encoded in CODING to Emacs' internal format.  The resulting
 134   multibyte text goes to a place pointed to by DESTINATION, the length
 135   of which should not exceed DST_BYTES.
 136
 137   These functions set the information about original and decoded texts
 138   in the members `produced', `produced_char', `consumed', and
 139   `consumed_char' of the structure *CODING.  They also set the member
 140   `result' to one of CODING_FINISH_XXX indicating how the decoding
 141   finished.
 142
 143   DST_BYTES zero means that the source area and destination area are
 144   overlapped, which means that we can produce a decoded text until it
 145   reaches the head of the not-yet-decoded source text.
 146
 147   Below is a template for these functions.  */
 148 #if 0
 149 static void
 150 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 151      struct coding_system *coding;
 152      const unsigned char *source;
 153      unsigned char *destination;
 154      int src_bytes, dst_bytes;
 155 {
 156   ...
 157 }
 158 #endif
 159
 160 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 161
 162   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 163   internal multibyte format to CODING.  The resulting unibyte text
 164   goes to a place pointed to by DESTINATION, the length of which
 165   should not exceed DST_BYTES.
 166
 167   These functions set the information about original and encoded texts
 168   in the members `produced', `produced_char', `consumed', and
 169   `consumed_char' of the structure *CODING.  They also set the member
 170   `result' to one of CODING_FINISH_XXX indicating how the encoding
 171   finished.
 172
 173   DST_BYTES zero means that the source area and destination area are
 174   overlapped, which means that we can produce encoded text until it
 175   reaches the head of the not-yet-encoded source text.
 176
 177   Below is a template for these functions.  */
 178 #if 0
 179 static void
 180 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 181      struct coding_system *coding;
 182      unsigned char *source, *destination;
 183      int src_bytes, dst_bytes;
 184 {
 185   ...
 186 }
 187 #endif
 188
 189 /*** COMMONLY USED MACROS ***/
 190
 191 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 192    get one and two bytes from the source text respectively.
 193    If there are not enough bytes in the source, they jump to
 194    `label_end_of_loop'.  The caller should set variables `coding',
 195    `src' and `src_end' to appropriate pointer in advance.  These
 196    macros are called from decoding routines `decode_coding_XXX', thus
 197    it is assumed that the source text is unibyte.  */
 198
 199 #define ONE_MORE_BYTE(c1)                                       \
 200   do {                                                          \
 201     if (src >= src_end)     /* No byte left in the source.  */  \
 202       {                                                         \
 203         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 204         goto label_end_of_loop;                                 \
 205       }                                                         \
 206     c1 = *src++;            /* Fetch one byte and advance.  */  \
 207   } while (0)
 208
 209 #define TWO_MORE_BYTES(c1, c2)                                  \
 210   do {                                                          \
 211     if (src + 1 >= src_end) /* Fewer than two bytes left.  */   \
 212       {                     /* Neither byte is consumed.  */    \
 213         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 214         goto label_end_of_loop;                                 \
 215       }                                                         \
 216     c1 = *src++;                                                \
 217     c2 = *src++;                                                \
 218   } while (0)
 219
 220
 221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 222    form if MULTIBYTEP is nonzero.  Return with RET if SRC is not less
 223    than SRC_END, or if the second byte of an 8-bit form is missing.  */
 224 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
 225   do {                                                          \
 226     if (src >= src_end || (multibytep && src + 1 >= src_end     \
 227                            && *src == LEADING_CODE_8_BIT_CONTROL)) \
 228       {                                                         \
 229         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 230         return ret;                                             \
 231       }                                                         \
 232     c1 = *src++;                                                \
 233     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 234       c1 = *src++ - 0x20;                                       \
 235   } while (0)
 236
 237 /* Set C to the next character at the source text pointed by `src'.
 238    If there are not enough characters in the source, jump to
 239    `label_end_of_loop'.  The caller should set variables `coding'
 240    `src', `src_end', and `translation_table' to appropriate pointers
 241    in advance.  This macro is used in encoding routines
 242    `encode_coding_XXX', thus it assumes that the source text is in
 243    multibyte form except for 8-bit characters.  8-bit characters are
 244    in multibyte form if coding->src_multibyte is nonzero, else they
 245    are represented by a single byte.  */
 246
 247 #define ONE_MORE_CHAR(c)                                        \
 248   do {                                                          \
 249     int len = src_end - src;    /* Bytes left in the source.  */ \
 250     int bytes;                  /* Bytes taken by this char.  */ \
 251     if (len <= 0)                                               \
 252       {                                                         \
 253         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 254         goto label_end_of_loop;                                 \
 255       }                                                         \
 256     if (coding->src_multibyte                                   \
 257         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 258       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 259     else                                                        \
 260       c = *src, bytes = 1;      /* Taken as one raw byte.  */   \
 261     if (!NILP (translation_table))                              \
 262       c = translate_char (translation_table, c, -1, 0, 0);      \
 263     src += bytes;               /* Advance past the char.  */   \
 264   } while (0)
 265
 266
 267 /* Produce a multibyte form of character C to `dst'.  Jump to
 268    `label_end_of_loop' if there's not enough space at `dst'.
 269
 270    If we are now in the middle of a composition sequence, the decoded
 271    character may be ALTCHAR (for the current composition).  In that
 272    case, the character goes to coding->cmp_data->data instead of
 273    `dst'.
 274
 275    This macro is used in decoding routines.  */
 276
 277 #define EMIT_CHAR(c)                                                    \
 278   do {                                                                  \
 279     if (! COMPOSING_P (coding)                                          \
 280         || coding->composing == COMPOSITION_RELATIVE                    \
 281         || coding->composing == COMPOSITION_WITH_RULE)                  \
 282       {         /* In these cases C itself is stored at `dst'.  */      \
 283         int bytes = CHAR_BYTES (c);                                     \
 284         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 285           {     /* The limit is `src' when `dst_bytes' is zero.  */     \
 286             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 287             goto label_end_of_loop;                                     \
 288           }                                                             \
 289         dst += CHAR_STRING (c, dst);                                    \
 290         coding->produced_char++;                                        \
 291       }                                                                 \
 292                                                                         \
 293     if (COMPOSING_P (coding)                                            \
 294         && coding->composing != COMPOSITION_RELATIVE)                   \
 295       {         /* Record C as a composition component.  */             \
 296         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 297         coding->composition_rule_follows                                \
 298           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 299       }                                                                 \
 300   } while (0)
 301
 302
 303 #define EMIT_ONE_BYTE(c)                                        \
 304   do {                                                          \
 305     if (dst >= (dst_bytes ? dst_end : src))                     \
 306       {                     /* No room for one byte.  */        \
 307         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 308         goto label_end_of_loop;                                 \
 309       }                                                         \
 310     *dst++ = c;             /* Store C and advance `dst'.  */   \
 311   } while (0)
 312
 313 #define EMIT_TWO_BYTES(c1, c2)                                  \
 314   do {                                                          \
 315     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 316       {                     /* No room for two bytes.  */       \
 317         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 318         goto label_end_of_loop;                                 \
 319       }                                                         \
 320     *dst++ = c1, *dst++ = c2;   /* Store C1, then C2.  */       \
 321   } while (0)
 322
 323 #define EMIT_BYTES(from, to)                                    \
 324   do {                                                          \
 325     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 326       {                     /* No room for TO - FROM bytes.  */ \
 327         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 328         goto label_end_of_loop;                                 \
 329       }                                                         \
 330     while (from < to)       /* Copy; FROM itself is advanced.  */ \
 331       *dst++ = *from++;                                         \
 332   } while (0)
 333
 334 \f
 335 /*** 1. Preamble ***/
 336
 337 #ifdef emacs
 338 #include <config.h>
 339 #endif
 340
 341 #include <stdio.h>
 342
 343 #ifdef emacs
 344
 345 #include "lisp.h"
 346 #include "buffer.h"
 347 #include "charset.h"
 348 #include "composite.h"
 349 #include "ccl.h"
 350 #include "coding.h"
 351 #include "window.h"
 352 #include "intervals.h"
 353
 354 #else  /* not emacs */
 355
 356 #include "mulelib.h"
 357
 358 #endif /* not emacs */
 359
 360 Lisp_Object Qcoding_system, Qeol_type;
 361 Lisp_Object Qbuffer_file_coding_system;
 362 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 363 Lisp_Object Qno_conversion, Qundecided;
 364 Lisp_Object Qcoding_system_history;
 365 Lisp_Object Qsafe_chars;
 366 Lisp_Object Qvalid_codes;
 367 Lisp_Object Qascii_incompatible;
 368
 369 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 370 Lisp_Object Qcall_process, Qcall_process_region;
 371 Lisp_Object Qstart_process, Qopen_network_stream;
 372 Lisp_Object Qtarget_idx;
 373
 374 /* If a symbol has this property, evaluate the value to define the
 375    symbol as a coding system.  */
 376 Lisp_Object Qcoding_system_define_form;
 377
 378 Lisp_Object Vselect_safe_coding_system_function;
 379
 380 int coding_system_require_warning;
 381
 382 /* Mnemonic string for each format of end-of-line.  */
 383 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 384 /* Mnemonic string to indicate format of end-of-line is not yet
 385    decided.  */
 386 Lisp_Object eol_mnemonic_undecided;
 387
 388 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 389    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
 390    This has an effect only for external encoding (i.e. for output to
 391    file and process), not for in-buffer or Lisp string encoding.  */
 392 int system_eol_type;
 393
 394 #ifdef emacs
 395
 396 /* Information about which coding system is safe for which chars.
 397    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 398
 399    GENERIC-LIST is a list of generic coding systems which can encode
 400    any characters.
 401
 402    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 403    corresponding char table that contains safe chars.  */
 404 Lisp_Object Vcoding_system_safe_chars;
 405
 406 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 407
 408 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 409
 410 /* Coding system emacs-mule and raw-text are for converting only
 411    end-of-line format.  */
 412 Lisp_Object Qemacs_mule, Qraw_text;
 413
 414 Lisp_Object Qutf_8;
 415
 416 /* Coding-systems are handed between Emacs Lisp programs and C internal
 417    routines by the following three variables.  */
 418 /* Coding-system for reading files and receiving data from process.  */
 419 Lisp_Object Vcoding_system_for_read;
 420 /* Coding-system for writing files and sending data to process.  */
 421 Lisp_Object Vcoding_system_for_write;
 422 /* Coding-system actually used in the latest I/O.  */
 423 Lisp_Object Vlast_coding_system_used;
 424
 425 /* A vector of length 256 which contains information about special
 426    Latin codes (especially for dealing with Microsoft codes).  */
 427 Lisp_Object Vlatin_extra_code_table;
 428
 429 /* Flag to inhibit code conversion of end-of-line format.  */
 430 int inhibit_eol_conversion;
 431
 432 /* Flag to inhibit ISO2022 escape sequence detection.  */
 433 int inhibit_iso_escape_detection;
 434
 435 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 436 int inherit_process_coding_system;
 437
 438 /* Coding system to be used to encode text for terminal display.  */
 439 struct coding_system terminal_coding;
 440
 441 /* Coding system to be used to encode text for terminal display when
 442    terminal coding system is nil.  */
 443 struct coding_system safe_terminal_coding;
 444
 445 /* Coding system of what is sent from terminal keyboard.  */
 446 struct coding_system keyboard_coding;
 447
 448 /* Default coding system to be used to write a file.  */
 449 struct coding_system default_buffer_file_coding;
 450
 451 Lisp_Object Vfile_coding_system_alist;
 452 Lisp_Object Vprocess_coding_system_alist;
 453 Lisp_Object Vnetwork_coding_system_alist;
 454
 455 Lisp_Object Vlocale_coding_system;
 456
 457 #endif /* emacs */
 458
 459 Lisp_Object Qcoding_category, Qcoding_category_index;
 460
 461 /* List of symbols `coding-category-xxx' ordered by priority.  */
 462 Lisp_Object Vcoding_category_list;
 463
 464 /* Table of coding categories (Lisp symbols).  */
 465 Lisp_Object Vcoding_category_table;
 466
 467 /* Table of the symbol name for each coding category.  */
 468 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 469   "coding-category-emacs-mule",
 470   "coding-category-sjis",
 471   "coding-category-iso-7",
 472   "coding-category-iso-7-tight",
 473   "coding-category-iso-8-1",
 474   "coding-category-iso-8-2",
 475   "coding-category-iso-7-else",
 476   "coding-category-iso-8-else",
 477   "coding-category-ccl",
 478   "coding-category-big5",
 479   "coding-category-utf-8",
 480   "coding-category-utf-16-be",
 481   "coding-category-utf-16-le",
 482   "coding-category-raw-text",
 483   "coding-category-binary"
 484 };  /* NOTE(review): order presumably follows CODING_CATEGORY_IDX_XXX.  */
 485
 486 /* Table of pointers to coding systems corresponding to each coding
 487    category.  */
 488 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 489
 490 /* Table of coding category masks.  Nth element is a mask for a coding
 491    category of which priority is Nth.  */
 492 static
 493 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 494
 495 /* Flag to tell if we look up translation table on character code
 496    conversion.  */
 497 Lisp_Object Venable_character_translation;
 498 /* Standard translation table to look up on decoding (reading).  */
 499 Lisp_Object Vstandard_translation_table_for_decode;
 500 /* Standard translation table to look up on encoding (writing).  */
 501 Lisp_Object Vstandard_translation_table_for_encode;
 502
 503 Lisp_Object Qtranslation_table;
 504 Lisp_Object Qtranslation_table_id;
 505 Lisp_Object Qtranslation_table_for_decode;
 506 Lisp_Object Qtranslation_table_for_encode;
 507
 508 /* Alist of charsets vs revision number.  */
 509 Lisp_Object Vcharset_revision_alist;
 510
 511 /* Default coding systems used for process I/O.  */
 512 Lisp_Object Vdefault_process_coding_system;
 513
 514 /* Char table for translating Quail and self-inserting input.  */
 515 Lisp_Object Vtranslation_table_for_input;
 516
 517 /* Global flag to tell that we can't call post-read-conversion and
 518    pre-write-conversion functions.  Usually the value is zero, but it
 519    is set to 1 temporarily while such functions are running.  This is
 520    to avoid infinite recursion.  */
 521 static int inhibit_pre_post_conversion;
 522
 523 Lisp_Object Qchar_coding_system;
 524
 525 /* Return the `safe-chars' property of CODING_SYSTEM (a symbol), or Qt
 526    if that property is not a char table.  Validity is not checked.  */
 527
 528 Lisp_Object
 529 coding_safe_chars (coding_system)
 530      Lisp_Object coding_system;
 531 {
 532   Lisp_Object coding_spec, plist, safe_chars;
 533
 534   coding_spec = Fget (coding_system, Qcoding_system);
 535   plist = XVECTOR (coding_spec)->contents[3];
 536   safe_chars = Fplist_get (plist, Qsafe_chars);
 537   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 538 }
 539 /* Nonzero if SAFE_CHARS is Qt or maps character C to non-nil.  */
 540 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 541   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 542
 543 \f
 544 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 545
 546 /* Emacs' internal format for representation of multiple character
 547    sets is a kind of multi-byte encoding, i.e. characters are
 548    represented by variable-length sequences of one-byte codes.
 549
 550    ASCII characters and control characters (e.g. `tab', `newline') are
 551    represented by one-byte sequences which are their ASCII codes, in
 552    the range 0x00 through 0x7F.
 553
 554    8-bit characters of the range 0x80..0x9F are represented by
 555    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 556    code + 0x20).
 557
 558    8-bit characters of the range 0xA0..0xFF are represented by
 559    one-byte sequences which are their 8-bit code.
 560
 561    The other characters are represented by a sequence of `base
 562    leading-code', optional `extended leading-code', and one or two
 563    `position-code's.  The length of the sequence is determined by the
 564    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 565    whereas extended leading-code and position-code take the range 0xA0
 566    through 0xFF.  See `charset.h' for more details about leading-code
 567    and position-code.
 568
 569    --- CODE RANGE of Emacs' internal format ---
 570    character set        range
 571    -------------        -----
 572    ascii                0x00..0x7F
 573    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 574    eight-bit-graphic    0xA0..0xFF
 575    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 576    ---------------------------------------------
 577
 578    As this is the internal character representation, the format is
 579    usually not used externally (i.e. in a file or in a data sent to a
 580    process).  But, it is possible to have a text externally in this
 581    format (i.e. by encoding by the coding system `emacs-mule').
 582
 583    In that case, a sequence of one-byte codes has a slightly different
 584    form.
 585
 586    Firstly, all characters in eight-bit-control are represented by
 587    one-byte sequences which are their 8-bit code.
 588
 589    Next, character composition data are represented by the byte
 590    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 591    where,
 592         METHOD is 0xF0 plus one of composition method (enum
 593         composition_method),
 594
 595         BYTES is 0xA0 plus the byte length of these composition data,
 596
 597         CHARS is 0xA0 plus the number of characters composed by these
 598         data,
 599
 600         COMPONENTs are characters of multibyte form or composition
 601         rules encoded by two-byte of ASCII codes.
 602
 603    In addition, for backward compatibility, the following formats are
 604    also recognized as composition data on decoding.
 605
 606    0x80 MSEQ ...
 607    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 608
 609    Here,
 610         MSEQ is a multibyte form but in this special format:
 611           ASCII: 0xA0 ASCII_CODE+0x80,
 612           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 613         RULE is a one byte code of the range 0xA0..0xF0 that
 614         represents a composition rule.
 615   */
 616
 617 enum emacs_code_class_type emacs_code_class[256];
 618
 619 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 620    Check if the text from SRC up to SRC_END is in Emacs' internal
 621    format.  Return CODING_CATEGORY_MASK_EMACS_MULE if so, else 0.  */
 622
 623 static int
 624 detect_coding_emacs_mule (src, src_end, multibytep)
 625       unsigned char *src, *src_end;
 626       int multibytep;
 627 {
 628   unsigned char c;
 629   int composing = 0;    /* Nonzero after a 0x80 composition leader.  */
 630   /* Dummy; ONE_MORE_BYTE_CHECK_MULTIBYTE stores into coding->result.  */
 631   struct coding_system dummy_coding;
 632   struct coding_system *coding = &dummy_coding;
 633
 634   while (1)
 635     {  /* The macro below returns the mask once SRC reaches SRC_END.  */
 636       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
 637                                      CODING_CATEGORY_MASK_EMACS_MULE);
 638       if (composing)
 639         {
 640           if (c < 0xA0)
 641             composing = 0;      /* Such a byte ends the composition.  */
 642           else if (c == 0xA0)
 643             {                   /* Returns 0 if no byte follows 0xA0.  */
 644               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
 645               c &= 0x7F;        /* ASCII_CODE+0x80 -> ASCII_CODE.  */
 646             }
 647           else
 648             c -= 0x20;          /* LEADING_CODE+0x20 -> LEADING_CODE.  */
 649         }
 650
 651       if (c < 0x20)
 652         {
 653           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 654             return 0;           /* These control codes disqualify.  */
 655         }
 656       else if (c >= 0x80 && c < 0xA0)
 657         {
 658           if (c == 0x80)
 659             /* Old leading code for a composite character.  */
 660             composing = 1;
 661           else
 662             {
 663               unsigned char *src_base = src - 1;
 664               int bytes;
 665
 666               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 667                                                bytes))
 668                 return 0;       /* Sequence at SRC_BASE was rejected.  */
 669               src = src_base + bytes;   /* Skip BYTES bytes from SRC_BASE.  */
 670             }
 671         }
 672     }
 673 }
 674
 675
 676 /* Record the starting position START and METHOD of one composition in
 677    a new four-slot header; CODING_ADD_COMPOSITION_END fills slots 0, 2.  */
 678 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 679   do {                                                          \
 680     struct composition_data *cmp_data = coding->cmp_data;       \
 681     int *data = cmp_data->data + cmp_data->used;                \
 682     coding->cmp_data_start = cmp_data->used;                    \
 683     data[0] = -1;           /* Length is not yet known.  */     \
 684     data[1] = cmp_data->char_offset + start;                    \
 685     data[3] = (int) method;                                     \
 686     cmp_data->used += 4;    /* Reserve the header slots.  */    \
 687   } while (0)
 688
 689 /* Record the ending position END of the current composition, and the
 690    number of data slots it occupies, in the header of that composition.  */
 691 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 692   do {                                                          \
 693     struct composition_data *cmp_data = coding->cmp_data;       \
 694     int *data = cmp_data->data + coding->cmp_data_start;        \
 695     data[0] = cmp_data->used - coding->cmp_data_start;          \
 696     data[2] = cmp_data->char_offset + end;                      \
 697   } while (0)
 698
 699 /* Record one COMPONENT (alternate character or composition rule).
 700    Close the composition once it reaches COMPOSITION_DATA_MAX_BUNCH_LENGTH.  */
 701 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 702   do {                                                                  \
 703     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 704     if (coding->cmp_data->used - coding->cmp_data_start                 \
 705         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 706       {         /* End it here and leave the composing state.  */       \
 707         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 708         coding->composing = COMPOSITION_NO;                             \
 709       }                                                                 \
 710   } while (0)
 711
 712
 713 /* Get one byte from the data pointed to by SRC and increment SRC.  If
 714    SRC is not less than SRC_END, return -1 without incrementing SRC.  */
 715
 716 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 717
 718
 719 /* Decode a character represented as a component of composition
 720    sequence of Emacs 20 style at SRC.  Set C to that character, store
 721    its multibyte form sequence at P, and set P to the end of that
 722    sequence.  If no valid character is found, set C to -1.  C is also
 723    -1 when SRC has reached SRC_END.  Uses CODING of the using scope.  */
 724 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 725   do {                                                          \
 726     int bytes;                                                  \
 727                                                                 \
 728     c = SAFE_ONE_MORE_BYTE ();                                  \
 729     if (c < 0)                                                  \
 730       break;    /* Source exhausted; C stays -1.  */            \
 731     if (CHAR_HEAD_P (c))                                        \
 732       c = -1;   /* Not a component byte.  */                    \
 733     else if (c == 0xA0)                                         \
 734       { /* 0xA0 prefixes a byte that is stored with 0x80 added.  */ \
 735         c = SAFE_ONE_MORE_BYTE ();                              \
 736         if (c < 0xA0) /* Also true for -1 (end of source).  */  \
 737           c = -1;                                               \
 738         else                                                    \
 739           {                                                     \
 740             c -= 0x80;                                          \
 741             *p++ = c;                                           \
 742           }                                                     \
 743       }                                                         \
 744     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 745       { /* The leading code is stored with 0x20 added.  */      \
 746         unsigned char *p0 = p;                                  \
 747                                                                 \
 748         c -= 0x20;                                              \
 749         *p++ = c;                                               \
 750         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 751         while (--bytes) /* Copy the following bytes as is.  */  \
 752           {                                                     \
 753             c = SAFE_ONE_MORE_BYTE ();                          \
 754             if (c < 0)                                          \
 755               break;                                            \
 756             *p++ = c;                                           \
 757           }                                                     \
 758         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 759             || (coding->flags /* We are recovering a file.  */  \
 760                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 761                 && ! CHAR_HEAD_P (p0[1])))                      \
 762           c = STRING_CHAR (p0, bytes);                          \
 763         else                                                    \
 764           c = -1;                                               \
 765       }                                                         \
 766     else                                                        \
 767       c = -1;                                                   \
 768   } while (0)
 769
 770
 771 /* Decode a composition rule represented as a component of composition
 772    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 773    valid rule is found, set C to -1.  The rule byte is 0xA0 + GREF * 9
 774    + NREF; GREF and NREF of the using scope are overwritten.  */
 775 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 776   do {                                                  \
 777     c = SAFE_ONE_MORE_BYTE ();                          \
 778     c -= 0xA0;  /* -1 (end of source) stays negative.  */ \
 779     if (c < 0 || c >= 81) /* 81 == 9 * 9 rules.  */     \
 780       c = -1;                                           \
 781     else                                                \
 782       {                                                 \
 783         gref = c / 9, nref = c % 9;                     \
 784         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 785       }                                                 \
 786   } while (0)
 787
 788
 789 /* Decode composition sequence encoded by `emacs-mule' at the source
 790    pointed by SRC.  SRC_END is the end of source.  Store information
 791    of the composition in CODING->cmp_data.
 792
 793    For backward compatibility, decode also a composition sequence of
 794    Emacs 20 style.  In that case, the composition sequence contains
 795    characters that should be extracted into a buffer or string.  Store
 796    those characters at *DESTINATION in multibyte form.
 797
 798    If we encounter an invalid byte sequence, return 0.
 799    If we encounter an insufficient source or destination, or
 800    insufficient space in CODING->cmp_data, return -1.
 801    Otherwise, return consumed bytes in the source.
 802    When Emacs 20 style characters are emitted, *DESTINATION is advanced
 803    past them and CODING->produced_char grows by the character count.  */
 804 static INLINE int
 805 decode_composition_emacs_mule (coding, src, src_end,
 806                                destination, dst_end, dst_bytes)
 807      struct coding_system *coding;
 808      const unsigned char *src, *src_end;
 809      unsigned char **destination, *dst_end;
 810      int dst_bytes;
 811 {
 812   unsigned char *dst = *destination;
 813   int method, data_len, nchars;
 814   const unsigned char *src_base = src++;  /* Skip the first byte.  */
 815   /* Store components of composition.  */
 816   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 817   int ncomponent;
 818   /* Store multibyte form of characters to be composed.  This is for
 819      Emacs 20 style composition sequence.  */
 820   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 821   unsigned char *bufp = buf;
 822   int c, i, gref, nref;
 823
 824   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 825       >= COMPOSITION_DATA_SIZE)
 826     { /* No room for one more full bunch in CODING->cmp_data.  */
 827       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 828       return -1;
 829     }
 830
 831   ONE_MORE_BYTE (c);
 832   if (c - 0xF0 >= COMPOSITION_RELATIVE
 833            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 834     { /* Current format: the second byte is 0xF0 + METHOD.  */
 835       int with_rule;
 836
 837       method = c - 0xF0;
 838       with_rule = (method == COMPOSITION_WITH_RULE
 839                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 840       ONE_MORE_BYTE (c);
 841       data_len = c - 0xA0;  /* Byte length counted from SRC_BASE.  */
 842       if (data_len < 4
 843           || src_base + data_len > src_end)
 844         return 0;
 845       ONE_MORE_BYTE (c);
 846       nchars = c - 0xA0;    /* The count byte is 0xA0 + NCHARS.  */
 847       if (nchars < 1)       /* A composition covers at least one char.  */
 848         return 0;
 849       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 850         {
 851           /* If it is longer than this, it can't be valid.  */
 852           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 853             return 0;
 854
 855           if (ncomponent % 2 && with_rule)
 856             { /* Odd components are rules: two bytes, each offset by 32.  */
 857               ONE_MORE_BYTE (gref);
 858               gref -= 32;
 859               ONE_MORE_BYTE (nref);
 860               nref -= 32;
 861               c = COMPOSITION_ENCODE_RULE (gref, nref);
 862             }
 863           else
 864             {
 865               int bytes;
 866               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 867                   || (coding->flags /* We are recovering a file.  */
 868                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 869                       && ! CHAR_HEAD_P (src[1])))
 870                 c = STRING_CHAR (src, bytes);
 871               else
 872                 c = *src, bytes = 1;  /* Take the raw byte as is.  */
 873               src += bytes;
 874             }
 875           component[ncomponent] = c;
 876         }
 877     }
 878   else if (c >= 0x80)
 879     {
 880       /* This may be an old Emacs 20 style format.  See the comment at
 881          the section 2 of this file.  */
 882       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 883       if (src == src_end
 884           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 885         goto label_end_of_loop;  /* The sequence may continue later.  */
 886
 887       src_end = src;  /* Confine SAFE_ONE_MORE_BYTE to this sequence.  */
 888       src = src_base + 1;
 889       if (c < 0xC0)
 890         {
 891           method = COMPOSITION_RELATIVE;
 892           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 893             {
 894               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 895               if (c < 0)
 896                 break;
 897               component[ncomponent++] = c;
 898             }
 899           if (ncomponent < 2)
 900             return 0;
 901           nchars = ncomponent;
 902         }
 903       else if (c == 0xFF)
 904         {
 905           method = COMPOSITION_WITH_RULE;
 906           src++;  /* Skip the 0xFF byte.  */
 907           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 908           if (c < 0)
 909             return 0;
 910           component[0] = c;
 911           for (ncomponent = 1;
 912                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 913             { /* Each round adds one rule followed by one character.  */
 914               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 915               if (c < 0)
 916                 break;
 917               component[ncomponent++] = c;
 918               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 919               if (c < 0)
 920                 break;
 921               component[ncomponent++] = c;
 922             }
 923           if (ncomponent < 3)
 924             return 0;
 925           nchars = (ncomponent + 1) / 2;
 926         }
 927       else
 928         return 0;
 929     }
 930   else
 931     return 0;
 932
 933   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 934     { /* Nothing to emit, or the bytes kept in BUF fit below the limit.  */
 935       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 936       for (i = 0; i < ncomponent; i++)
 937         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 938       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 939       if (buf < bufp)
 940         {
 941           unsigned char *p = buf;
 942           EMIT_BYTES (p, bufp);
 943           *destination += bufp - buf;
 944           coding->produced_char += nchars;
 945         }
 946       return (src - src_base);
 947     }
 948  label_end_of_loop:
 949   return -1;
 950 }
 951
 952 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
 953    Also converts end-of-line and decodes composition data led by 0x80.  */
 954 static void
 955 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 956      struct coding_system *coding;
 957      const unsigned char *source;
 958      unsigned char *destination;
 959      int src_bytes, dst_bytes;
 960 {
 961   const unsigned char *src = source;
 962   const unsigned char *src_end = source + src_bytes;
 963   unsigned char *dst = destination;
 964   unsigned char *dst_end = destination + dst_bytes;
 965   /* SRC_BASE remembers the start position in source in each loop.
 966      The loop will be exited when there's not enough source code, or
 967      when there's not enough destination area to produce a
 968      character.  */
 969   const unsigned char *src_base;
 970
 971   coding->produced_char = 0;
 972   while ((src_base = src) < src_end)
 973     {
 974       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 975       const unsigned char *p;
 976       int bytes;
 977
 978       if (*src == '\r')
 979         {
 980           int c = *src++;
 981
 982           if (coding->eol_type == CODING_EOL_CR)
 983             c = '\n';
 984           else if (coding->eol_type == CODING_EOL_CRLF)
 985             {
 986               ONE_MORE_BYTE (c);
 987               if (c != '\n')
 988                 { /* A lone CR: push the byte back and keep the CR.  */
 989                   src--;
 990                   c = '\r';
 991                 }
 992             }
 993           *dst++ = c;
 994           coding->produced_char++;
 995           continue;
 996         }
 997       else if (*src == '\n')
 998         {
 999           if ((coding->eol_type == CODING_EOL_CR
1000                || coding->eol_type == CODING_EOL_CRLF)
1001               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1002             { /* A bare LF does not match the expected end-of-line.  */
1003               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1004               goto label_end_of_loop;
1005             }
1006           *dst++ = *src++;
1007           coding->produced_char++;
1008           continue;
1009         }
1010       else if (*src == 0x80 && coding->cmp_data)
1011         {
1012           /* Start of composition data.  */
1013           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1014                                                          &dst, dst_end,
1015                                                          dst_bytes);
1016           if (consumed < 0)  /* Not enough source, destination or cmp_data.  */
1017             goto label_end_of_loop;
1018           else if (consumed > 0)
1019             {
1020               src += consumed;
1021               continue;
1022             }
1023           bytes = CHAR_STRING (*src, tmp);  /* Invalid: emit 0x80 as a char.  */
1024           p = tmp;
1025           src++;
1026         }
1027       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1028                || (coding->flags /* We are recovering a file.  */
1029                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1030                    && ! CHAR_HEAD_P (src[1])))
1031         { /* Copy the BYTES bytes at SRC unchanged.  */
1032           p = src;
1033           src += bytes;
1034         }
1035       else
1036         {
1037           int i, c;
1038
1039           bytes = BYTES_BY_CHAR_HEAD (*src);
1040           src++;
1041           for (i = 1; i < bytes; i++)
1042             {
1043               ONE_MORE_BYTE (c);
1044               if (CHAR_HEAD_P (c))
1045                 break;
1046             }
1047           if (i < bytes)
1048             { /* Cut short by a character head: emit the first byte alone.  */
1049               bytes = CHAR_STRING (*src_base, tmp);
1050               p = tmp;
1051               src = src_base + 1;
1052             }
1053           else
1054             {
1055               p = src_base;
1056             }
1057         }
1058       if (dst + bytes >= (dst_bytes ? dst_end : src))  /* NOTE(review): with DST_BYTES zero the limit is SRC, presumably in-place decoding -- confirm.  */
1059         {
1060           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1061           break;
1062         }
1063       while (bytes--) *dst++ = *p++;
1064       coding->produced_char++;
1065     }
1066  label_end_of_loop:
1067   coding->consumed = coding->consumed_char = src_base - source;  /* Up to the last fully handled position.  */
1068   coding->produced = dst - destination;
1069 }
1070
1071
1072 /* Encode composition data stored at DATA into a special byte sequence
1073    starting by 0x80.  Update CODING->cmp_data_start and maybe
1074    CODING->cmp_data for the next call.  Uses DST, DST_END, DST_BYTES and
1075    SRC of the using scope; jumps to label_end_of_loop if DST is too small.  */
1076 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1077   do {                                                                  \
1078     unsigned char buf[1024], *p0 = buf, *p;                             \
1079     int len = data[0];          /* Number of ints in this entry.  */    \
1080     int i;                                                              \
1081                                                                         \
1082     buf[0] = 0x80;                                                      \
1083     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1084     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1085     p = buf + 4;                /* Components follow the header.  */    \
1086     if (data[3] == COMPOSITION_WITH_RULE                                \
1087         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1088       { /* DATA[4] is a char; then (rule, char) pairs follow.  */       \
1089         p += CHAR_STRING (data[4], p);                                  \
1090         for (i = 5; i < len; i += 2)                                    \
1091           {                                                             \
1092             int gref, nref;                                             \
1093              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1094             *p++ = 0x20 + gref;                                         \
1095             *p++ = 0x20 + nref;                                         \
1096             p += CHAR_STRING (data[i + 1], p);                          \
1097           }                                                             \
1098       }                                                                 \
1099     else                                                                \
1100       {                                                                 \
1101         for (i = 4; i < len; i++)                                       \
1102           p += CHAR_STRING (data[i], p);                                \
1103       }                                                                 \
1104     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES, header included */ \
1105                                                                         \
1106     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1107       {                                                                 \
1108         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1109         goto label_end_of_loop;                                         \
1110       }                                                                 \
1111     while (p0 < p)                                                      \
1112       *dst++ = *p0++;                                                   \
1113     coding->cmp_data_start += data[0];                                  \
1114     if (coding->cmp_data_start == coding->cmp_data->used                \
1115         && coding->cmp_data->next)                                      \
1116       { /* This chunk is used up: continue with the next one.  */       \
1117         coding->cmp_data = coding->cmp_data->next;                      \
1118         coding->cmp_data_start = 0;                                     \
1119       }                                                                 \
1120   } while (0)
1121
1122
1123 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1124                             unsigned char *, int, int));
1125 /* Encode SOURCE into DESTINATION as emacs-mule, adding composition data.  */
1126 static void
1127 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1128      struct coding_system *coding;
1129      const unsigned char *source;
1130      unsigned char *destination;
1131      int src_bytes, dst_bytes;
1132 {
1133   const unsigned char *src = source;
1134   const unsigned char *src_end = source + src_bytes;
1135   unsigned char *dst = destination;
1136   unsigned char *dst_end = destination + dst_bytes;
1137   const unsigned char *src_base;
1138   int c;
1139   int char_offset;
1140   int *data;
1141
1142   Lisp_Object translation_table;
1143
1144   translation_table = Qnil;  /* Presumably read by ONE_MORE_CHAR -- confirm.  */
1145
1146   /* Optimization for the case that there's no composition.  */
1147   if (!coding->cmp_data || coding->cmp_data->used == 0)
1148     {
1149       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1150       return;
1151     }
1152
1153   char_offset = coding->cmp_data->char_offset;
1154   data = coding->cmp_data->data + coding->cmp_data_start;
1155   while (1)  /* No break inside: left only by a goto to label_end_of_loop.  */
1156     {
1157       src_base = src;
1158
1159       /* If SRC starts a composition, encode the information about the
1160          composition in advance.  */
1161       if (coding->cmp_data_start < coding->cmp_data->used
1162           && char_offset + coding->consumed_char == data[1])
1163         {
1164           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1165           char_offset = coding->cmp_data->char_offset;  /* cmp_data may have advanced.  */
1166           data = coding->cmp_data->data + coding->cmp_data_start;
1167         }
1168
1169       ONE_MORE_CHAR (c);
1170       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1171                         || coding->eol_type == CODING_EOL_CR))
1172         {
1173           if (coding->eol_type == CODING_EOL_CRLF)
1174             EMIT_TWO_BYTES ('\r', c);
1175           else
1176             EMIT_ONE_BYTE ('\r');
1177         }
1178       else if (SINGLE_BYTE_CHAR_P (c))
1179         {
1180           if (coding->flags && ! ASCII_BYTE_P (c))
1181             {
1182               /* As we are auto saving, retain the multibyte form for
1183                  8-bit chars.  */
1184               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1185               int bytes = CHAR_STRING (c, buf);
1186
1187               if (bytes == 1)
1188                 EMIT_ONE_BYTE (buf[0]);
1189               else
1190                 EMIT_TWO_BYTES (buf[0], buf[1]);
1191             }
1192           else
1193             EMIT_ONE_BYTE (c);
1194         }
1195       else
1196         EMIT_BYTES (src_base, src);  /* Copy the multibyte form unchanged.  */
1197       coding->consumed_char++;
1198     }
1199  label_end_of_loop:
1200   coding->consumed = src_base - source;  /* Up to the last fully handled char.  */
1201   coding->produced = coding->produced_char = dst - destination;
1202   return;
1203 }
1204
1205 \f
1206 /*** 3. ISO2022 handlers ***/
1207
1208 /* The following note describes the coding system ISO2022 briefly.
1209    Since the intention of this note is to help understand the
1210    functions in this file, some parts are NOT ACCURATE or are OVERLY
1211    SIMPLIFIED.  For thorough understanding, please refer to the
1212    original document of ISO2022.  This is equivalent to the standard
1213    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1214
1215    ISO2022 provides many mechanisms to encode several character sets
1216    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1217    is encoded using bytes less than 128.  This may make the encoded
1218    text a little bit longer, but the text passes more easily through
1219    several types of gateway, some of which strip off the MSB (Most
1220    Significant Bit).
1221
1222    There are two kinds of character sets: control character sets and
1223    graphic character sets.  The former contain control characters such
1224    as `newline' and `escape' to provide control functions (control
1225    functions are also provided by escape sequences).  The latter
1226    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1227    two control character sets and many graphic character sets.
1228
1229    Graphic character sets are classified into one of the following
1230    four classes, according to the number of bytes (DIMENSION) and
1231    number of characters in one dimension (CHARS) of the set:
1232    - DIMENSION1_CHARS94
1233    - DIMENSION1_CHARS96
1234    - DIMENSION2_CHARS94
1235    - DIMENSION2_CHARS96
1236
1237    In addition, each character set is assigned an identification tag,
1238    unique for each set, called the "final character" (denoted as <F>
1239    hereafter).  The <F> of each character set is decided by ECMA(*)
1240    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1241    (0x30..0x3F are for private use only).
1242
1243    Note (*): ECMA = European Computer Manufacturers Association
1244
1245    Here are examples of graphic character sets [NAME(<F>)]:
1246         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1247         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1248         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1249         o DIMENSION2_CHARS96 -- none for the moment
1250
1251    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1252         C0 [0x00..0x1F] -- control character plane 0
1253         GL [0x20..0x7F] -- graphic character plane 0
1254         C1 [0x80..0x9F] -- control character plane 1
1255         GR [0xA0..0xFF] -- graphic character plane 1
1256
1257    A control character set is directly designated and invoked to C0 or
1258    C1 by an escape sequence.  The most common case is that:
1259    - ISO646's  control character set is designated/invoked to C0, and
1260    - ISO6429's control character set is designated/invoked to C1,
1261    and usually these designations/invocations are omitted in encoded
1262    text.  In a 7-bit environment, only C0 can be used, and a control
1263    character for C1 is encoded by an appropriate escape sequence to
1264    fit into the environment.  All control characters for C1 are
1265    defined to have corresponding escape sequences.
1266
1267    A graphic character set is at first designated to one of four
1268    graphic registers (G0 through G3), then these graphic registers are
1269    invoked to GL or GR.  These designations and invocations can be
1270    done independently.  The most common case is that G0 is invoked to
1271    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1272    these invocations and designations are omitted in encoded text.
1273    In a 7-bit environment, only GL can be used.
1274
1275    When a graphic character set of CHARS94 is invoked to GL, codes
1276    0x20 and 0x7F of the GL area work as control characters SPACE and
1277    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1278    be used.
1279
1280    There are two ways of invocation: locking-shift and single-shift.
1281    With locking-shift, the invocation lasts until the next different
1282    invocation, whereas with single-shift, the invocation affects the
1283    following character only and doesn't affect the locking-shift
1284    state.  Invocations are done by the following control characters or
1285    escape sequences:
1286
1287    ----------------------------------------------------------------------
1288    abbrev  function                  cntrl escape seq   description
1289    ----------------------------------------------------------------------
1290    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1291    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1292    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1293    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1294    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1295    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1296    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1297    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1298    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1299    ----------------------------------------------------------------------
1300    (*) These are not used by any known coding system.
1301
1302    Control characters for these functions are defined by macros
1303    ISO_CODE_XXX in `coding.h'.
1304
1305    Designations are done by the following escape sequences:
1306    ----------------------------------------------------------------------
1307    escape sequence      description
1308    ----------------------------------------------------------------------
1309    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1310    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1311    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1312    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1313    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1314    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1315    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1316    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1317    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1318    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1319    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1320    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1321    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1322    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1323    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1324    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1325    ----------------------------------------------------------------------
1326
1327    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1328    of dimension 1, chars 94, and final character <F>, etc...
1329
1330    Note (*): Although these designations are not allowed in ISO2022,
1331    Emacs accepts them on decoding, and produces them on encoding
1332    CHARS96 character sets in a coding system which is characterized as
1333    7-bit environment, non-locking-shift, and non-single-shift.
1334
1335    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1336    '(' can be omitted.  We refer to this as "short-form" hereafter.
1337
1338    Now you may notice that there are a lot of ways of encoding the
1339    same multilingual text in ISO2022.  Actually, there exist many
1340    coding systems such as Compound Text (used in X11's inter client
1341    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1342    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1343    localized platforms), and all of these are variants of ISO2022.
1344
1345    In addition to the above, Emacs handles two more kinds of escape
1346    sequences: ISO6429's direction specification and Emacs' private
1347    sequence for specifying character composition.
1348
1349    ISO6429's direction specification takes the following form:
1350         o CSI ']'      -- end of the current direction
1351         o CSI '0' ']'  -- end of the current direction
1352         o CSI '1' ']'  -- start of left-to-right text
1353         o CSI '2' ']'  -- start of right-to-left text
1354    The control character CSI (0x9B: control sequence introducer) is
1355    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1356
1357    Character composition specification takes the following form:
1358         o ESC '0' -- start relative composition
1359         o ESC '1' -- end composition
1360         o ESC '2' -- start rule-base composition (*)
1361         o ESC '3' -- start relative composition with alternate chars  (**)
1362         o ESC '4' -- start rule-base composition with alternate chars  (**)
1363   Since these are not standard escape sequences of any ISO standard,
1364   the use of them with these meanings is restricted to Emacs only.
1365
1366   (*) This form is used only in Emacs 20.5 and older versions,
1367   but the newer versions can safely decode it.
1368   (**) This form is used only in Emacs 21.1 and newer versions,
1369   and the older versions can't decode it.
1370
1371   Here's a list of example usages of these composition escape
1372   sequences (categorized by `enum composition_method').
1373
1374   COMPOSITION_RELATIVE:
1375         ESC 0 CHAR [ CHAR ] ESC 1
1376   COMPOSITION_WITH_RULE:
1377         ESC 2 CHAR [ RULE CHAR ] ESC 1
1378   COMPOSITION_WITH_ALTCHARS:
1379         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1380   COMPOSITION_WITH_RULE_ALTCHARS:
1381         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1382
1383 enum iso_code_class_type iso_code_class[256];
1384 /* Nonzero if coding_system_table[IDX] is set, CHARSET is ASCII or C is one of its safe chars, and CHARSET has a requested designation.  May assign SAFE_CHARS of the using scope.  */
1385 #define CHARSET_OK(idx, charset, c)                                     \
1386   (coding_system_table[idx]                                             \
1387    && (charset == CHARSET_ASCII                                         \
1388        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1389            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1390    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1391                                               charset)                  \
1392        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1393 /* Nonzero if the initial designation numbered 1 (presumably G1 -- confirm) of coding system IDX is non-negative.  */
1394 #define SHIFT_OUT_OK(idx) \
1395   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1396 /* Nonzero unless composition is disabled for coding system IDX.  The table entry is dereferenced without a null check.  */
1397 #define COMPOSITION_OK(idx)     \
1398   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1399
1400 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1401    Check whether the bytes from SRC up to SRC_END may be text encoded
1402    in ISO2022.  If so, return an integer in which any of the flag bits:
1403         CODING_CATEGORY_MASK_ISO_7
1404         CODING_CATEGORY_MASK_ISO_7_TIGHT
1405         CODING_CATEGORY_MASK_ISO_8_1
1406         CODING_CATEGORY_MASK_ISO_8_2
1407         CODING_CATEGORY_MASK_ISO_7_ELSE
1408         CODING_CATEGORY_MASK_ISO_8_ELSE
1409    are set.  If a code which should never appear in ISO2022 is found,
1410    return 0.

        Two masks are maintained while scanning: MASK holds the categories
        not yet ruled out, MASK_FOUND those for which some evidence was
        seen; the result is their intersection.  MULTIBYTEP is handed to
        ONE_MORE_BYTE_CHECK_MULTIBYTE for every byte fetched.  */
1411
1412 static int
1413 detect_coding_iso2022 (src, src_end, multibytep)
1414      unsigned char *src, *src_end;
1415      int multibytep;
1416 {
1417   int mask = CODING_CATEGORY_MASK_ISO;   /* categories not yet ruled out */
1418   int mask_found = 0;                    /* categories with evidence seen */
       /* reg[N] is the charset most recently designated to register N by
          an escape sequence seen so far, or -1 if none.  */
1419   int reg[4], shift_out = 0, single_shifting = 0;
1420   int c, c1, charset;
1421   /* Dummy for ONE_MORE_BYTE.  */
1422   struct coding_system dummy_coding;
1423   struct coding_system *coding = &dummy_coding;
       /* Assigned inside the expansion of CHARSET_OK.  */
1424   Lisp_Object safe_chars;
1425
1426   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1427   while (mask)
1428     {
           /* NOTE(review): nothing written out in this loop body leaves
              the loop when the input runs out, so
              ONE_MORE_BYTE_CHECK_MULTIBYTE presumably returns its third
              argument from this function at end of input -- confirm
              against the macro's definition.  */
1429       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1430     retry:
1431       switch (c)
1432         {
1433         case ISO_CODE_ESC:
1434           if (inhibit_iso_escape_detection)
1435             break;
1436           single_shifting = 0;
1437           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1438           if (c >= '(' && c <= '/')
1439             {
1440               /* Designation sequence for a charset of dimension 1.  */
1441               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1442               if (c1 < ' ' || c1 >= 0x80
1443                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1444                 /* Invalid designation sequence.  Just ignore.  */
1445                 break;
1446               reg[(c - '(') % 4] = charset;
1447             }
1448           else if (c == '$')
1449             {
1450               /* Designation sequence for a charset of dimension 2.  */
1451               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1452               if (c >= '@' && c <= 'B')
1453                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1454                 reg[0] = charset = iso_charset_table[1][0][c];
1455               else if (c >= '(' && c <= '/')
1456                 {
1457                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1458                                                  mask & mask_found);
1459                   if (c1 < ' ' || c1 >= 0x80
1460                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1461                     /* Invalid designation sequence.  Just ignore.  */
1462                     break;
1463                   reg[(c - '(') % 4] = charset;
1464                 }
1465               else
1466                 /* Invalid designation sequence.  Just ignore.  */
1467                 break;
1468             }
1469           else if (c == 'N' || c == 'O')
1470             {
1471               /* ESC <Fe> for SS2 or SS3.  */
1472               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1473               break;
1474             }
1475           else if (c >= '0' && c <= '4')
1476             {
1477               /* ESC <Fp> for start/end composition.  Each category is
                      kept as a candidate only if its coding system does
                      not have composition disabled.  */
1478               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1479                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1480               else
1481                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1482               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1483                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1484               else
1485                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1486               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1487                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1488               else
1489                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1490               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1491                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1492               else
1493                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1494               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1495                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1496               else
1497                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1498               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1499                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1500               else
1501                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1502               break;
1503             }
1504           else
1505             /* Invalid escape sequence.  Just ignore.  */
1506             break;
1507
1508           /* We found a valid designation sequence for CHARSET.  */
1509           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
               /* C is reused to hold the character that CHARSET_OK checks
                  against each coding system's safe characters.  */
1510           c = MAKE_CHAR (charset, 0, 0);
1511           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1512             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1513           else
1514             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1515           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1516             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1517           else
1518             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1519           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1520             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1521           else
1522             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1523           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1524             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1525           else
1526             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1527           break;
1528
1529         case ISO_CODE_SO:
1530           if (inhibit_iso_escape_detection)
1531             break;
1532           single_shifting = 0;
1533           if (shift_out == 0
1534               && (reg[1] >= 0
1535                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1536                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1537             {
1538               /* Locking shift out.  */
1539               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1540               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1541             }
1542           break;
1543
1544         case ISO_CODE_SI:
1545           if (inhibit_iso_escape_detection)
1546             break;
1547           single_shifting = 0;
               /* NOTE(review): shift_out is initialized to 0 and never
                  assigned again in this function, so this condition is
                  never true as the code stands.  */
1548           if (shift_out == 1)
1549             {
1550               /* Locking shift in.  */
1551               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1552               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1553             }
1554           break;
1555
1556         case ISO_CODE_CSI:
1557           single_shifting = 0;
               /* Fall through.  */
1558         case ISO_CODE_SS2:
1559         case ISO_CODE_SS3:
1560           {
1561             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1562
1563             if (inhibit_iso_escape_detection)
1564               break;
                 /* NOTE(review): the ISO_8_1 and ISO_8_2 entries of
                    coding_system_table are dereferenced below without the
                    null test that CHARSET_OK applies -- confirm they
                    cannot be unset at this point.  */
1565             if (c != ISO_CODE_CSI)
1566               {
1567                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1568                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1569                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1570                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1571                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1572                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1573                 single_shifting = 1;
1574               }
1575             if (VECTORP (Vlatin_extra_code_table)
1576                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1577               {
1578                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1579                     & CODING_FLAG_ISO_LATIN_EXTRA)
1580                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1581                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1582                     & CODING_FLAG_ISO_LATIN_EXTRA)
1583                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1584               }
1585             mask &= newmask;
1586             mask_found |= newmask;
1587           }
1588           break;
1589
1590         default:
1591           if (c < 0x80)
1592             {
1593               single_shifting = 0;
1594               break;
1595             }
1596           else if (c < 0xA0)
1597             {
1598               single_shifting = 0;
                   /* A code in 0x80..0x9F is acceptable only if it is
                      listed in Vlatin_extra_code_table, and then only for
                      the 8-bit categories whose coding system has
                      CODING_FLAG_ISO_LATIN_EXTRA set.  */
1599               if (VECTORP (Vlatin_extra_code_table)
1600                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1601                 {
1602                   int newmask = 0;
1603
1604                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1605                       & CODING_FLAG_ISO_LATIN_EXTRA)
1606                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1607                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1608                       & CODING_FLAG_ISO_LATIN_EXTRA)
1609                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1610                   mask &= newmask;
1611                   mask_found |= newmask;
1612                 }
1613               else
1614                 return 0;
1615             }
1616           else
1617             {
1618               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1619                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1620               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1621               /* Check the length of succeeding codes of the range
1622                  0xA0..0xFF.  If the byte length is odd, we exclude
1623                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1624                  when we are not single shifting.  */
1625               if (!single_shifting
1626                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1627                 {
                       /* I counts the bytes of the run, starting with the
                          one already in hand.  C is set to -1 so that we
                          can tell below whether the loop read anything.  */
1628                   int i = 1;
1629
1630                   c = -1;
1631                   while (src < src_end)
1632                     {
1633                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1634                                                      mask & mask_found);
1635                       if (c < 0xA0)
1636                         break;
1637                       i++;
1638                     }
1639
1640                   if (i & 1 && src < src_end)
1641                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1642                   else
1643                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1644                   if (c >= 0)
1645                     /* This means that we have read one extra byte.  */
1646                     goto retry;
1647                 }
1648             }
1649           break;
1650         }
1651     }
1652   return (mask & mask_found);
1653 }
1654
1655 /* Yield the character code for the position codes C1 and C2 in
1656    CHARSET.  When the variable `translation_table' (taken from the
1657    scope where the macro is used) is non-nil, the code comes from
1658    translate_char with that table; otherwise from MAKE_CHAR.  */
1659
1660 #define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
1661   (!NILP (translation_table)                                    \
1662    ? translate_char (translation_table, -1, charset, c1, c2)    \
1663    : MAKE_CHAR (charset, c1, c2))
1664
1665 /* Set designation state into CODING: designate to register REG the
        charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS and
        FINAL_CHAR.

        The designation is accepted only if that charset is non-negative
        and either CODING requests it for REG or the character
        MAKE_CHAR (charset, 0, 0) is safe according to `safe_chars'.  On
        acceptance the charset (or its reverse charset, when
        CODING_MODE_DIRECTION is set in coding->mode and a reverse charset
        exists) is stored with CODING_SPEC_ISO_DESIGNATION.  Otherwise REG
        is recorded in last_invalid_designation_register and control jumps
        to label_invalid_code.  Control also jumps there when FINAL_CHAR is
        outside '0'..127, and for the special ASCII case explained in the
        body.

        Must be expanded where `coding', `safe_chars' and the label
        `label_invalid_code' are in scope.  The body declares its own
        variables named `charset' and `c', so the arguments must not
        mention variables of those names.  REG and FINAL_CHAR are
        substituted unparenthesized and more than once.  */
1666 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1667   do {                                                                     \
1668     int charset, c;                                                        \
1669                                                                            \
1670     if (final_char < '0' || final_char >= 128)                             \
1671       goto label_invalid_code;                                             \
1672     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1673                                  make_number (chars),                      \
1674                                  make_number (final_char));                \
1675     c = MAKE_CHAR (charset, 0, 0);                                         \
1676     if (charset >= 0                                                       \
1677         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1678             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1679       {                                                                    \
1680         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1681             && reg == 0                                                    \
1682             && charset == CHARSET_ASCII)                                   \
1683           {                                                                \
1684             /* We should insert this designation sequence as is so         \
1685                that it is surely written back to a file.  */               \
1686             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1687             goto label_invalid_code;                                       \
1688           }                                                                \
1689         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1690         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1691             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1692           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1693         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1694       }                                                                    \
1695     else                                                                   \
1696       {                                                                    \
1697         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1698         goto label_invalid_code;                                           \
1699       }                                                                    \
1700   } while (0)
1701
1702 /* Add a fresh composition_data block (starting at CHAR_OFFSET) to the
1703    end of CODING's chain, make it current, and reset composing state.  */
1704
1705 void
1706 coding_allocate_composition_data (coding, char_offset)
1707      struct coding_system *coding;
1708      int char_offset;
1709 {
1710   struct composition_data *block;
1711
1712   block = (struct composition_data *) xmalloc (sizeof *block);
1713   block->used = 0;
1714   block->char_offset = char_offset;
1715   block->next = NULL;
1716   block->prev = coding->cmp_data;
1717   if (block->prev != NULL)
1718     block->prev->next = block;
1719   coding->composing = COMPOSITION_NO;
1720   coding->cmp_data_start = 0;
1721   coding->cmp_data = block;
1722 }
1723
1724 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1725    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1726    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1727    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1728    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

        C1 is the byte that followed ESC.  Three cases are handled:
        - Composition is disabled for CODING: ESC and C1 (masked to 7
          bits) are written to DST and produced_char grows by 2.
        - No composition is in progress: one is started with the method
          selected by C1, unless coding->cmp_data lacks room, in which
          case coding->result is set to CODING_FINISH_INSUFFICIENT_CMP
          and control jumps to label_end_of_loop.
        - A composition is already in progress: if its method is one of
          the two altchar methods, it is switched to COMPOSITION_RELATIVE.

        Must be expanded where `coding', `dst' and the label
        `label_end_of_loop' are in scope.  C1 is substituted
        unparenthesized and more than once.
1729   */
1730
1731 #define DECODE_COMPOSITION_START(c1)                                       \
1732   do {                                                                     \
1733     if (coding->composing == COMPOSITION_DISABLED)                         \
1734       {                                                                    \
1735         *dst++ = ISO_CODE_ESC;                                             \
1736         *dst++ = c1 & 0x7f;                                                \
1737         coding->produced_char += 2;                                        \
1738       }                                                                    \
1739     else if (!COMPOSING_P (coding))                                        \
1740       {                                                                    \
1741         /* This is surely the start of a composition.  We must be sure     \
1742            that coding->cmp_data has enough space to store the             \
1743            information about the composition.  If not, terminate the       \
1744            current decoding loop, allocate one more memory block for       \
1745            coding->cmp_data in the caller, then start the decoding         \
1746            loop again.  We can't allocate memory here directly because     \
1747            it may cause buffer/string relocation.  */                      \
1748         if (!coding->cmp_data                                              \
1749             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1750                 >= COMPOSITION_DATA_SIZE))                                 \
1751           {                                                                \
1752             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1753             goto label_end_of_loop;                                        \
1754           }                                                                \
1755         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1756                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1757                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1758                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1759         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1760                                       coding->composing);                  \
1761         coding->composition_rule_follows = 0;                              \
1762       }                                                                    \
1763     else                                                                   \
1764       {                                                                    \
1765         /* We are already handling a composition.  If the method is        \
1766            the following two, the codes following the current escape       \
1767            sequence are actual characters stored in a buffer.  */          \
1768         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1769             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1770           {                                                                \
1771             coding->composing = COMPOSITION_RELATIVE;                      \
1772             coding->composition_rule_follows = 0;                          \
1773           }                                                                \
1774       }                                                                    \
1775   } while (0)
1776
1777 /* Handle composition end sequence ESC 1, where C1 is the byte that
        followed ESC.  Uses `coding' and `dst' from the scope where the
        macro is expanded.  */
1778
1779 #define DECODE_COMPOSITION_END(c1)                                      \
1780   do {                                                                  \
1781     if (COMPOSING_P (coding))                                           \
1782       {                                                                 \
             /* Close the composition at the current character count.  */  \
1783         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1784         coding->composing = COMPOSITION_NO;                             \
1785       }                                                                 \
1786     else                                                                \
1787       {                                                                 \
             /* No composition is open: emit ESC and C1 as they are.  */   \
1788         *dst++ = ISO_CODE_ESC;                                          \
1789         *dst++ = c1;                                                    \
1790         coding->produced_char += 2;                                     \
1791       }                                                                 \
1792   } while (0)
1793
1794 /* Decode a composition rule from the byte C1 (and maybe one more byte
1795    from SRC) and store one encoded composition rule in
1796    coding->cmp_data.

        C1 must be a modifiable variable: 32 is subtracted from it in
        place.  The extra byte of the new format is read with
        ONE_MORE_BYTE into the variable `c2' of the scope where the macro
        is expanded.  If the adjusted C1 is 93 or more, neither format
        applies and a rule value of 0 is stored.  In every case
        coding->composition_rule_follows is cleared afterwards.  */
1797
1798 #define DECODE_COMPOSITION_RULE(c1)                                     \
1799   do {                                                                  \
1800     int rule = 0;                                                       \
1801     (c1) -= 32;                                                         \
1802     if (c1 < 81)                /* old format (before ver.21) */        \
1803       {                                                                 \
             /* One byte encodes both reference points as gref * 9 + nref; \
                the value 4 of either is mapped to 10.  */                 \
1804         int gref = (c1) / 9;                                            \
1805         int nref = (c1) % 9;                                            \
1806         if (gref == 4) gref = 10;                                       \
1807         if (nref == 4) nref = 10;                                       \
1808         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1809       }                                                                 \
1810     else if (c1 < 93)           /* new format (after ver.21) */         \
1811       {                                                                 \
             /* Two bytes: the first gives C1 - 81, the second C2 - 32.  */ \
1812         ONE_MORE_BYTE (c2);                                             \
1813         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1814       }                                                                 \
1815     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1816     coding->composition_rule_follows = 0;                               \
1817   } while (0)
1818
1819
1818
1819
1820 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1821
1822 static void
1823 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1824      struct coding_system *coding;
1825      const unsigned char *source;
1826      unsigned char *destination;
1827      int src_bytes, dst_bytes;
1828 {
1829   const unsigned char *src = source;
1830   const unsigned char *src_end = source + src_bytes;
1831   unsigned char *dst = destination;
1832   unsigned char *dst_end = destination + dst_bytes;
1833   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1834   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1835   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1836   /* SRC_BASE remembers the start position in source in each loop.
1837      The loop will be exited when there's not enough source code
1838      (within macro ONE_MORE_BYTE), or when there's not enough
1839      destination area to produce a character (within macro
1840      EMIT_CHAR).  */
1841   const unsigned char *src_base;
1842   int c, charset;
1843   Lisp_Object translation_table;
1844   Lisp_Object safe_chars;
1845
1846   safe_chars = coding_safe_chars (coding->symbol);
1847
1848   if (NILP (Venable_character_translation))
1849     translation_table = Qnil;
1850   else
1851     {
1852       translation_table = coding->translation_table_for_decode;
1853       if (NILP (translation_table))
1854         translation_table = Vstandard_translation_table_for_decode;
1855     }
1856
1857   coding->result = CODING_FINISH_NORMAL;
1858
1859   while (1)
1860     {
1861       int c1, c2 = 0;
1862
1863       src_base = src;
1864       ONE_MORE_BYTE (c1);
1865
1866       /* We produce no character or one character.  */
1867       switch (iso_code_class [c1])
1868         {
1869         case ISO_0x20_or_0x7F:
1870           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1871             {
1872               DECODE_COMPOSITION_RULE (c1);
1873               continue;
1874             }
1875           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1876             {
1877               /* This is SPACE or DEL.  */
1878               charset = CHARSET_ASCII;
1879               break;
1880             }
1881           /* This is a graphic character, we fall down ...  */
1882
1883         case ISO_graphic_plane_0:
1884           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1885             {
1886               DECODE_COMPOSITION_RULE (c1);
1887               continue;
1888             }
1889           charset = charset0;
1890           break;
1891
1892         case ISO_0xA0_or_0xFF:
1893           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1894               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1895             goto label_invalid_code;
1896           /* This is a graphic character, we fall down ... */
1897
1898         case ISO_graphic_plane_1:
1899           if (charset1 < 0)
1900             goto label_invalid_code;
1901           charset = charset1;
1902           break;
1903
1904         case ISO_control_0:
1905           if (COMPOSING_P (coding))
1906             DECODE_COMPOSITION_END ('1');
1907
1908           /* All ISO2022 control characters in this class have the
1909              same representation in Emacs internal format.  */
1910           if (c1 == '\n'
1911               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1912               && (coding->eol_type == CODING_EOL_CR
1913                   || coding->eol_type == CODING_EOL_CRLF))
1914             {
1915               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1916               goto label_end_of_loop;
1917             }
1918           charset = CHARSET_ASCII;
1919           break;
1920
1921         case ISO_control_1:
1922           if (COMPOSING_P (coding))
1923             DECODE_COMPOSITION_END ('1');
1924           goto label_invalid_code;
1925
1926         case ISO_carriage_return:
1927           if (COMPOSING_P (coding))
1928             DECODE_COMPOSITION_END ('1');
1929
1930           if (coding->eol_type == CODING_EOL_CR)
1931             c1 = '\n';
1932           else if (coding->eol_type == CODING_EOL_CRLF)
1933             {
1934               ONE_MORE_BYTE (c1);
1935               if (c1 != ISO_CODE_LF)
1936                 {
1937                   src--;
1938                   c1 = '\r';
1939                 }
1940             }
1941           charset = CHARSET_ASCII;
1942           break;
1943
1944         case ISO_shift_out:
1945           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1946               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1947             goto label_invalid_code;
1948           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1949           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1950           continue;
1951
1952         case ISO_shift_in:
1953           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1954             goto label_invalid_code;
1955           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1956           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1957           continue;
1958
1959         case ISO_single_shift_2_7:
1960         case ISO_single_shift_2:
1961           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1962             goto label_invalid_code;
1963           /* SS2 is handled as an escape sequence of ESC 'N' */
1964           c1 = 'N';
1965           goto label_escape_sequence;
1966
1967         case ISO_single_shift_3:
1968           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1969             goto label_invalid_code;
1970           /* SS2 is handled as an escape sequence of ESC 'O' */
1971           c1 = 'O';
1972           goto label_escape_sequence;
1973
1974         case ISO_control_sequence_introducer:
1975           /* CSI is handled as an escape sequence of ESC '[' ...  */
1976           c1 = '[';
1977           goto label_escape_sequence;
1978
1979         case ISO_escape:
1980           ONE_MORE_BYTE (c1);
1981         label_escape_sequence:
1982           /* Escape sequences handled by Emacs are invocation,
1983              designation, direction specification, and character
1984              composition specification.  */
1985           switch (c1)
1986             {
1987             case '&':           /* revision of following character set */
1988               ONE_MORE_BYTE (c1);
1989               if (!(c1 >= '@' && c1 <= '~'))
1990                 goto label_invalid_code;
1991               ONE_MORE_BYTE (c1);
1992               if (c1 != ISO_CODE_ESC)
1993                 goto label_invalid_code;
1994               ONE_MORE_BYTE (c1);
1995               goto label_escape_sequence;
1996
1997             case '$':           /* designation of 2-byte character set */
1998               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1999                 goto label_invalid_code;
2000               ONE_MORE_BYTE (c1);
2001               if (c1 >= '@' && c1 <= 'B')
2002                 {       /* designation of JISX0208.1978, GB2312.1980,
2003                            or JISX0208.1980 */
2004                   DECODE_DESIGNATION (0, 2, 94, c1);
2005                 }
2006               else if (c1 >= 0x28 && c1 <= 0x2B)
2007                 {       /* designation of DIMENSION2_CHARS94 character set */
2008                   ONE_MORE_BYTE (c2);
2009                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2010                 }
2011               else if (c1 >= 0x2C && c1 <= 0x2F)
2012                 {       /* designation of DIMENSION2_CHARS96 character set */
2013                   ONE_MORE_BYTE (c2);
2014                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2015                 }
2016               else
2017                 goto label_invalid_code;
2018               /* We must update these variables now.  */
2019               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2020               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2021               continue;
2022
2023             case 'n':           /* invocation of locking-shift-2 */
2024               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2025                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2026                 goto label_invalid_code;
2027               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2028               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2029               continue;
2030
2031             case 'o':           /* invocation of locking-shift-3 */
2032               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2033                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2034                 goto label_invalid_code;
2035               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2036               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2037               continue;
2038
2039             case 'N':           /* invocation of single-shift-2 */
2040               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2041                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2042                 goto label_invalid_code;
2043               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2044               ONE_MORE_BYTE (c1);
2045               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2046                 goto label_invalid_code;
2047               break;
2048
2049             case 'O':           /* invocation of single-shift-3 */
2050               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2051                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2052                 goto label_invalid_code;
2053               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2054               ONE_MORE_BYTE (c1);
2055               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2056                 goto label_invalid_code;
2057               break;
2058
2059             case '0': case '2': case '3': case '4': /* start composition */
2060               DECODE_COMPOSITION_START (c1);
2061               continue;
2062
2063             case '1':           /* end composition */
2064               DECODE_COMPOSITION_END (c1);
2065               continue;
2066
2067             case '[':           /* specification of direction */
2068               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2069                 goto label_invalid_code;
2070               /* For the moment, nested direction is not supported.
2071                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2072                  left-to-right, and nonzero means right-to-left.  */
2073               ONE_MORE_BYTE (c1);
2074               switch (c1)
2075                 {
2076                 case ']':       /* end of the current direction */
2077                   coding->mode &= ~CODING_MODE_DIRECTION;
2078
2079                 case '0':       /* end of the current direction */
2080                 case '1':       /* start of left-to-right direction */
2081                   ONE_MORE_BYTE (c1);
2082                   if (c1 == ']')
2083                     coding->mode &= ~CODING_MODE_DIRECTION;
2084                   else
2085                     goto label_invalid_code;
2086                   break;
2087
2088                 case '2':       /* start of right-to-left direction */
2089                   ONE_MORE_BYTE (c1);
2090                   if (c1 == ']')
2091                     coding->mode |= CODING_MODE_DIRECTION;
2092                   else
2093                     goto label_invalid_code;
2094                   break;
2095
2096                 default:
2097                   goto label_invalid_code;
2098                 }
2099               continue;
2100
2101             case '%':
2102               if (COMPOSING_P (coding))
2103                 DECODE_COMPOSITION_END ('1');
2104               ONE_MORE_BYTE (c1);
2105               if (c1 == '/')
2106                 {
2107                   /* CTEXT extended segment:
2108                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2109                      We keep these bytes as is for the moment.
2110                      They may be decoded by post-read-conversion.  */
2111                   int dim, M, L;
2112                   int size, required;
2113                   int produced_chars;
2114
2115                   ONE_MORE_BYTE (dim);
2116                   ONE_MORE_BYTE (M);
2117                   ONE_MORE_BYTE (L);
2118                   size = ((M - 128) * 128) + (L - 128);
2119                   required = 8 + size * 2;
2120                   if (dst + required > (dst_bytes ? dst_end : src))
2121                     goto label_end_of_loop;
2122                   *dst++ = ISO_CODE_ESC;
2123                   *dst++ = '%';
2124                   *dst++ = '/';
2125                   *dst++ = dim;
2126                   produced_chars = 4;
2127                   dst += CHAR_STRING (M, dst), produced_chars++;
2128                   dst += CHAR_STRING (L, dst), produced_chars++;
2129                   while (size-- > 0)
2130                     {
2131                       ONE_MORE_BYTE (c1);
2132                       dst += CHAR_STRING (c1, dst), produced_chars++;
2133                     }
2134                   coding->produced_char += produced_chars;
2135                 }
2136               else if (c1 == 'G')
2137                 {
2138                   unsigned char *d = dst;
2139                   int produced_chars;
2140
2141                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2142                      ESC % G --UTF-8-BYTES-- ESC % @
2143                      We keep these bytes as is for the moment.
2144                      They may be decoded by post-read-conversion.  */
2145                   if (d + 6 > (dst_bytes ? dst_end : src))
2146                     goto label_end_of_loop;
2147                   *d++ = ISO_CODE_ESC;
2148                   *d++ = '%';
2149                   *d++ = 'G';
2150                   produced_chars = 3;
2151                   while (d + 1 < (dst_bytes ? dst_end : src))
2152                     {
2153                       ONE_MORE_BYTE (c1);
2154                       if (c1 == ISO_CODE_ESC
2155                           && src + 1 < src_end
2156                           && src[0] == '%'
2157                           && src[1] == '@')
2158                         {
2159                           src += 2;
2160                           break;
2161                         }
2162                       d += CHAR_STRING (c1, d), produced_chars++;
2163                     }
2164                   if (d + 3 > (dst_bytes ? dst_end : src))
2165                     goto label_end_of_loop;
2166                   *d++ = ISO_CODE_ESC;
2167                   *d++ = '%';
2168                   *d++ = '@';
2169                   dst = d;
2170                   coding->produced_char += produced_chars + 3;
2171                 }
2172               else
2173                 goto label_invalid_code;
2174               continue;
2175
2176             default:
2177               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2178                 goto label_invalid_code;
2179               if (c1 >= 0x28 && c1 <= 0x2B)
2180                 {       /* designation of DIMENSION1_CHARS94 character set */
2181                   ONE_MORE_BYTE (c2);
2182                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2183                 }
2184               else if (c1 >= 0x2C && c1 <= 0x2F)
2185                 {       /* designation of DIMENSION1_CHARS96 character set */
2186                   ONE_MORE_BYTE (c2);
2187                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2188                 }
2189               else
2190                 goto label_invalid_code;
2191               /* We must update these variables now.  */
2192               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2193               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2194               continue;
2195             }
2196         }
2197
2198       /* Now we know CHARSET and 1st position code C1 of a character.
2199          Produce a multibyte sequence for that character while getting
2200          2nd position code C2 if necessary.  */
2201       if (CHARSET_DIMENSION (charset) == 2)
2202         {
2203           ONE_MORE_BYTE (c2);
2204           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2205             /* C2 is not in a valid range.  */
2206             goto label_invalid_code;
2207         }
2208       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2209       EMIT_CHAR (c);
2210       continue;
2211
2212     label_invalid_code:
2213       coding->errors++;
2214       if (COMPOSING_P (coding))
2215         DECODE_COMPOSITION_END ('1');
2216       src = src_base;
2217       c = *src++;
2218       if (! NILP (translation_table))
2219         c = translate_char (translation_table, c, 0, 0, 0);
2220       EMIT_CHAR (c);
2221     }
2222
2223  label_end_of_loop:
2224   coding->consumed = coding->consumed_char = src_base - source;
2225   coding->produced = dst - destination;
2226   return;
2227 }
2228
2229
2230 /* ISO2022 encoding stuff.  */
2231
2232 /*
2233    Saying just "ISO2022" is not enough when encoding; we have to
2234    specify more details.  In Emacs, each ISO2022 coding system
2235    variant has the following specifications:
2236         1. Initial designation to G0 through G3.
2237         2. Allows short-form designation?
2238         3. ASCII should be designated to G0 before control characters?
2239         4. ASCII should be designated to G0 at end of line?
2240         5. 7-bit environment or 8-bit environment?
2241         6. Use locking-shift?
2242         7. Use single-shift?
2243    And the following two are only for Japanese:
2244         8. Use ASCII in place of JISX0201-1976-Roman?
2245         9. Use JISX0208-1983 in place of JISX0208-1978?
2246    These specifications are encoded in `coding->flags' as flag bits
2247    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2248    details.
2249 */
2250
2251 /* Produce at DST (the variable `dst' where the macro is expanded) the
2252    escape sequence designating CHARSET to graphic register REG, advance
2253    DST, and record the designation in CODING.  For a 94-char set that is
2254    not 1-dimensional the intermediate char may be omitted (short form).  */
2255
2256 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2257   do {                                                                  \
2258     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2259     char *intermediate_char_94 = "()*+";                                \
2260     char *intermediate_char_96 = ",-./";                                \
2261     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2262                                                                         \
2263     if (revision < 255) /* Prefix ESC & <'@' + revision>.  */           \
2264       {                                                                 \
2265         *dst++ = ISO_CODE_ESC;                                          \
2266         *dst++ = '&';                                                   \
2267         *dst++ = '@' + revision;                                        \
2268       }                                                                 \
2269     *dst++ = ISO_CODE_ESC;                                              \
2270     if (CHARSET_DIMENSION (charset) == 1)                               \
2271       {                                                                 \
2272         if (CHARSET_CHARS (charset) == 94)                              \
2273           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2274         else                                                            \
2275           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2276       }                                                                 \
2277     else                                                                \
2278       {                                                                 \
2279         *dst++ = '$';                                                   \
2280         if (CHARSET_CHARS (charset) == 94)                              \
2281           {                                                             \
2282             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
2283                 || reg != 0 /* Short form: reg 0, '@'..'B' only.  */    \
2284                 || final_char < '@' || final_char > 'B')                \
2285               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2286           }                                                             \
2287         else                                                            \
2288           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2289       }                                                                 \
2290     *dst++ = final_char;                                                \
2291     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
2292   } while (0)
2293
2294 /* The following two macros emit the ISO2022 single-shift-2/-3 function
2295    at `dst' (ESC N / ESC O if CODING_FLAG_ISO_SEVEN_BITS is set, else
2296    ISO_CODE_SS2 / ISO_CODE_SS3) and set the single-shifting flag.  */
2297
2298 #define ENCODE_SINGLE_SHIFT_2                           \
2299   do {                                                  \
2300     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2301       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
2302     else                                                \
2303       *dst++ = ISO_CODE_SS2;                            \
2304     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2305   } while (0)
2306
2307 #define ENCODE_SINGLE_SHIFT_3                           \
2308   do {                                                  \
2309     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2310       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
2311     else                                                \
2312       *dst++ = ISO_CODE_SS3;                            \
2313     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2314   } while (0)
2315
2316 /* The following four macros emit at `dst' an ISO2022 locking shift
2317    (shift-in, shift-out, locking-shift-2 = ESC n, locking-shift-3 =
2318    ESC o) and record register 0..3 as invoked to graphic plane 0.  */
2319
2320 #define ENCODE_SHIFT_IN                         \
2321   do {                                          \
2322     *dst++ = ISO_CODE_SI;                       \
2323     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2324   } while (0)
2325
2326 #define ENCODE_SHIFT_OUT                        \
2327   do {                                          \
2328     *dst++ = ISO_CODE_SO;                       \
2329     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2330   } while (0)
2331
2332 #define ENCODE_LOCKING_SHIFT_2                  \
2333   do {                                          \
2334     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
2335     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
2336   } while (0)
2337
2338 #define ENCODE_LOCKING_SHIFT_3                  \
2339   do {                                          \
2340     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
2341     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
2342   } while (0)
2343
2344 /* Produce codes for a DIMENSION1 character whose character set is
2345    CHARSET and whose position-code is C1.  The loop is left by `break'
2346    once C1 is emitted; otherwise CHARSET is designated/invoked first.  */
2347
2348 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2349   do {                                                                  \
2350     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) /* After SS2/SS3.  */ \
2351       {                                                                 \
2352         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2353           *dst++ = c1 & 0x7F;                                           \
2354         else                                                            \
2355           *dst++ = c1 | 0x80;                                           \
2356         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2357         break;                                                          \
2358       }                                                                 \
2359     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2360       {                                                                 \
2361         *dst++ = c1 & 0x7F; /* Plane 0: MSB off.  */                    \
2362         break;                                                          \
2363       }                                                                 \
2364     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2365       {                                                                 \
2366         *dst++ = c1 | 0x80; /* Plane 1: MSB on.  */                     \
2367         break;                                                          \
2368       }                                                                 \
2369     else                                                                \
2370       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2371          must invoke it, or, at first, designate it to some graphic     \
2372          register.  Then repeat the loop to actually produce the        \
2373          character.  */                                                 \
2374       dst = encode_invocation_designation (charset, coding, dst);       \
2375   } while (1)
2376
2377 /* Produce codes for a DIMENSION2 character whose character set is
2378    CHARSET and whose position-codes are C1 and C2.  The loop is left by
2379    `break' once they are emitted; else CHARSET is designated/invoked.  */
2380
2381 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2382   do {                                                                  \
2383     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding)) /* After SS2/SS3.  */ \
2384       {                                                                 \
2385         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2386           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
2387         else                                                            \
2388           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
2389         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2390         break;                                                          \
2391       }                                                                 \
2392     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2393       {                                                                 \
2394         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F; /* Plane 0: MSB off.  */ \
2395         break;                                                          \
2396       }                                                                 \
2397     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2398       {                                                                 \
2399         *dst++ = c1 | 0x80, *dst++= c2 | 0x80; /* Plane 1: MSB on.  */  \
2400         break;                                                          \
2401       }                                                                 \
2402     else                                                                \
2403       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2404          must invoke it, or, at first, designate it to some graphic     \
2405          register.  Then repeat the loop to actually produce the        \
2406          character.  */                                                 \
2407       dst = encode_invocation_designation (charset, coding, dst);       \
2408   } while (1)
2409
2410 #define ENCODE_ISO_CHARACTER(c) /* Encode char C at `dst'.  */  \
2411   do {                                                          \
2412     int charset, c1, c2;                                        \
2413                                                                 \
2414     SPLIT_CHAR (c, charset, c1, c2);                            \
2415     if (CHARSET_DEFINED_P (charset))                            \
2416       {                                                         \
2417         if (CHARSET_DIMENSION (charset) == 1)                   \
2418           {                                                     \
2419             if (charset == CHARSET_ASCII                        \
2420                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
2421               charset = charset_latin_jisx0201;                 \
2422             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
2423           }                                                     \
2424         else                                                    \
2425           {                                                     \
2426             if (charset == charset_jisx0208                     \
2427                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
2428               charset = charset_jisx0208_1978;                  \
2429             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
2430           }                                                     \
2431       }                                                         \
2432     else /* Undefined charset: raw C1, and C2 if >= 0.  */      \
2433       {                                                         \
2434         *dst++ = c1;                                            \
2435         if (c2 >= 0)                                            \
2436           *dst++ = c2;                                          \
2437       }                                                         \
2438   } while (0)
2439
2440
2441 /* Encode CODING_REPLACEMENT_CHARACTER in place of C (twice if C is wide).  */
2442
2443 #define ENCODE_UNSAFE_CHARACTER(c)                              \
2444   do {                                                          \
2445     ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
2446     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                   \
2447       ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);      \
2448   } while (0)
2449
2450
2451 /* Produce at DST whatever designation and invocation codes are needed
2452    to use CHARSET; none if it is already designated and invoked.  The
2453    element `spec.iso2022' of *CODING is updated.  Return new DST.  */
2454
2455 unsigned char *
2456 encode_invocation_designation (charset, coding, dst)
2457      int charset;
2458      struct coding_system *coding;
2459      unsigned char *dst;
2460 {
2461   int reg;                      /* graphic register number */
2462
2463   /* Look for a graphic register CHARSET is already designated to.  */
2464   for (reg = 0; reg < 4; reg++)
2465     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2466       break;
2467
2468   if (reg >= 4)
2469     {
2470       /* CHARSET is not yet designated to any graphic registers.  */
2471       /* Designate it to the register it requests, if any.  */
2472       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2473       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2474         /* Since CHARSET requests no special designation, designate it
2475            to graphic register 0.  */
2476         reg = 0;
2477
2478       ENCODE_DESIGNATION (charset, reg, coding);
2479     }
2480
2481   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2482       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2483     {
2484       /* REG is invoked to neither graphic plane: lock-shift it to plane
2485          0, or single-shift it (registers 2 and 3, if CODING allows).  */
2486       switch (reg)
2487         {
2488         case 0:                 /* graphic register 0 */
2489           ENCODE_SHIFT_IN;
2490           break;
2491
2492         case 1:                 /* graphic register 1 */
2493           ENCODE_SHIFT_OUT;
2494           break;
2495
2496         case 2:                 /* graphic register 2 */
2497           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2498             ENCODE_SINGLE_SHIFT_2;
2499           else
2500             ENCODE_LOCKING_SHIFT_2;
2501           break;
2502
2503         case 3:                 /* graphic register 3 */
2504           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2505             ENCODE_SINGLE_SHIFT_3;
2506           else
2507             ENCODE_LOCKING_SHIFT_3;
2508           break;
2509         }
2510     }
2511
2512   return dst;
2513 }
2514
2515 /* Produce 2 bytes for composition rule RULE: 32+81+gref, then 32+nref.  */
2516
2517 #define ENCODE_COMPOSITION_RULE(rule)           \
2518   do {                                          \
2519     int gref, nref;                             \
2520     COMPOSITION_DECODE_RULE (rule, gref, nref); \
2521     *dst++ = 32 + 81 + gref;                    \
2522     *dst++ = 32 + nref;                         \
2523   } while (0)
2524
2525 /* Produce codes for indicating the start of a composition sequence:
2526    ESC 0 if DATA[3] is COMPOSITION_RELATIVE, ESC 3 if it is
2527    COMPOSITION_WITH_ALTCHARS, else ESC 4.  DATA[3] is stored in
2528    CODING->composing.  See coding.h for the format of DATA.  */
2529
2530 #define ENCODE_COMPOSITION_START(coding, data)                          \
2531   do {                                                                  \
2532     coding->composing = data[3];                                        \
2533     *dst++ = ISO_CODE_ESC;                                              \
2534     if (coding->composing == COMPOSITION_RELATIVE)                      \
2535       *dst++ = '0';                                                     \
2536     else                                                                \
2537       {                                                                 \
2538         *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
2539                   ? '3' : '4');                                         \
2540         coding->cmp_data_index = coding->cmp_data_start + 4;            \
2541         coding->composition_rule_follows = 0;                           \
2542       }                                                                 \
2543   } while (0)
2544
2545 /* Produce ESC 1, which indicates the end of the current composition.  */
2546
2547 #define ENCODE_COMPOSITION_END(coding, data)                    \
2548   do {                                                          \
2549     *dst++ = ISO_CODE_ESC;                                      \
2550     *dst++ = '1';                                               \
2551     coding->cmp_data_start += data[0];                          \
2552     coding->composing = COMPOSITION_NO;                         \
2553     if (coding->cmp_data_start == coding->cmp_data->used        \
2554         && coding->cmp_data->next)                              \
2555       { /* This cmp_data is used up; go to the next.  */        \
2556         coding->cmp_data = coding->cmp_data->next;              \
2557         coding->cmp_data_start = 0;                             \
2558       }                                                         \
2559   } while (0)
2560
2561 /* Produce ESC 0 and set CODING->composing to COMPOSITION_RELATIVE.
2562    Here ESC 0 does not start a new composition; it means that we have
2563    just produced the components (alternate chars and composition rules)
2564    of the composition and that the actual text follows in SRC.  */
2565
2566 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2567   do {                                          \
2568     *dst++ = ISO_CODE_ESC;                      \
2569     *dst++ = '0';                               \
2570     coding->composing = COMPOSITION_RELATIVE;   \
2571   } while (0)
2572
2573 /* The following three macros produce codes for indicating direction
2574    of text: CSI (ESC [ in 7-bit mode) followed by `2 ]' or `0 ]'.  */
2575 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2576   do {                                                  \
2577     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2578       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
2579     else                                                \
2580       *dst++ = ISO_CODE_CSI;                            \
2581   } while (0)
2582
2583 #define ENCODE_DIRECTION_R2L    \
2584   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2'; *dst++ = ']'; } while (0)
2585
2586 #define ENCODE_DIRECTION_L2R    \
2587   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0'; *dst++ = ']'; } while (0)
2588
2589 /* Produce codes for resetting to initial state: shift-in unless plane 0
2590    invokes register 0, then redo each changed initial designation.  */
2591 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2592   do {                                                                      \
2593     int reg;                                                                \
2594     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2595       ENCODE_SHIFT_IN;                                                      \
2596     for (reg = 0; reg < 4; reg++)                                           \
2597       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
2598           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
2599               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
2600         ENCODE_DESIGNATION                                                  \
2601           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2602   } while (0)
2603
2604 /* Scan the line starting at SRC (up to '\n') for charsets that
2605    request a graphic register, produce at DST the designations not
2606    yet in effect (first charset found per register), and return the
2607    updated DST.  If the current block ends before any end-of-line, we
2608    may fail to find all the necessary designations.  */
2609
2610 static unsigned char *
2611 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2612      struct coding_system *coding;
2613      Lisp_Object translation_table; /* Not referenced directly below.  */
2614      const unsigned char *src, *src_end;
2615      unsigned char *dst;
2616 {
2617   int charset, c, found = 0, reg;
2618   /* Charset to designate to each graphic register; -1 means none.  */
2619   int r[4];
2620
2621   for (reg = 0; reg < 4; reg++)
2622     r[reg] = -1;
2623
2624   while (found < 4)             /* Stop once all 4 registers are claimed.  */
2625     {
2626       ONE_MORE_CHAR (c);        /* Presumably exits via label_end_of_loop.  */
2627       if (c == '\n')
2628         break;
2629
2630       charset = CHAR_CHARSET (c);
2631       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2632       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2633         {                       /* First charset asking for REG wins.  */
2634           found++;
2635           r[reg] = charset;
2636         }
2637     }
2638
2639  label_end_of_loop:
2640   if (found)
2641     {
2642       for (reg = 0; reg < 4; reg++)
2643         if (r[reg] >= 0         /* Skip designations already in effect.  */
2644             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2645           ENCODE_DESIGNATION (r[reg], reg, coding);
2646     }
2647
2648   return dst;
2649 }
2650
2651 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2652
2653 static void
2654 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2655      struct coding_system *coding;
2656      const unsigned char *source;
2657      unsigned char *destination;
2658      int src_bytes, dst_bytes;
2659 {
2660   const unsigned char *src = source;
2661   const unsigned char *src_end = source + src_bytes;
2662   unsigned char *dst = destination;
2663   unsigned char *dst_end = destination + dst_bytes;
2664   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2665      from DST_END to assure overflow checking is necessary only at the
2666      head of loop.  */
2667   unsigned char *adjusted_dst_end = dst_end - 19;
2668   /* SRC_BASE remembers the start position in source in each loop.
2669      The loop will be exited when there's not enough source text to
2670      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2671      there's not enough destination area to produce encoded codes
2672      (within macro EMIT_BYTES).  */
2673   const unsigned char *src_base;
2674   int c;
2675   Lisp_Object translation_table;
2676   Lisp_Object safe_chars;
2677
2678   if (coding->flags & CODING_FLAG_ISO_SAFE)
2679     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2680
2681   safe_chars = coding_safe_chars (coding->symbol);
2682
2683   if (NILP (Venable_character_translation))
2684     translation_table = Qnil;
2685   else
2686     {
2687       translation_table = coding->translation_table_for_encode;
2688       if (NILP (translation_table))
2689         translation_table = Vstandard_translation_table_for_encode;
2690     }
2691
2692   coding->consumed_char = 0;
2693   coding->errors = 0;
2694   while (1)
2695     {
2696       src_base = src;
2697
2698       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2699         {
2700           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2701           break;
2702         }
2703
2704       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2705           && CODING_SPEC_ISO_BOL (coding))
2706         {
2707           /* We have to produce designation sequences if any now.  */
2708           dst = encode_designation_at_bol (coding, translation_table,
2709                                            src, src_end, dst);
2710           CODING_SPEC_ISO_BOL (coding) = 0;
2711         }
2712
2713       /* Check composition start and end.  */
2714       if (coding->composing != COMPOSITION_DISABLED
2715           && coding->cmp_data_start < coding->cmp_data->used)
2716         {
2717           struct composition_data *cmp_data = coding->cmp_data;
2718           int *data = cmp_data->data + coding->cmp_data_start;
2719           int this_pos = cmp_data->char_offset + coding->consumed_char;
2720
2721           if (coding->composing == COMPOSITION_RELATIVE)
2722             {
2723               if (this_pos == data[2])
2724                 {
2725                   ENCODE_COMPOSITION_END (coding, data);
2726                   cmp_data = coding->cmp_data;
2727                   data = cmp_data->data + coding->cmp_data_start;
2728                 }
2729             }
2730           else if (COMPOSING_P (coding))
2731             {
2732               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2733               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2734                 /* We have consumed components of the composition.
2735                    What follows in SRC is the composition's base
2736                    text.  */
2737                 ENCODE_COMPOSITION_FAKE_START (coding);
2738               else
2739                 {
2740                   int c = cmp_data->data[coding->cmp_data_index++];
2741                   if (coding->composition_rule_follows)
2742                     {
2743                       ENCODE_COMPOSITION_RULE (c);
2744                       coding->composition_rule_follows = 0;
2745                     }
2746                   else
2747                     {
2748                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2749                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2750                         ENCODE_UNSAFE_CHARACTER (c);
2751                       else
2752                         ENCODE_ISO_CHARACTER (c);
2753                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2754                         coding->composition_rule_follows = 1;
2755                     }
2756                   continue;
2757                 }
2758             }
2759           if (!COMPOSING_P (coding))
2760             {
2761               if (this_pos == data[1])
2762                 {
2763                   ENCODE_COMPOSITION_START (coding, data);
2764                   continue;
2765                 }
2766             }
2767         }
2768
2769       ONE_MORE_CHAR (c);
2770
2771       /* Now encode the character C.  */
2772       if (c < 0x20 || c == 0x7F)
2773         {
2774           if (c == '\r')
2775             {
2776               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2777                 {
2778                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2779                     ENCODE_RESET_PLANE_AND_REGISTER;
2780                   *dst++ = c;
2781                   continue;
2782                 }
2783               /* fall down to treat '\r' as '\n' ...  */
2784               c = '\n';
2785             }
2786           if (c == '\n')
2787             {
2788               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2789                 ENCODE_RESET_PLANE_AND_REGISTER;
2790               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2791                 bcopy (coding->spec.iso2022.initial_designation,
2792                        coding->spec.iso2022.current_designation,
2793                        sizeof coding->spec.iso2022.initial_designation);
2794               if (coding->eol_type == CODING_EOL_LF
2795                   || coding->eol_type == CODING_EOL_UNDECIDED)
2796                 *dst++ = ISO_CODE_LF;
2797               else if (coding->eol_type == CODING_EOL_CRLF)
2798                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2799               else
2800                 *dst++ = ISO_CODE_CR;
2801               CODING_SPEC_ISO_BOL (coding) = 1;
2802             }
2803           else
2804             {
2805               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2806                 ENCODE_RESET_PLANE_AND_REGISTER;
2807               *dst++ = c;
2808             }
2809         }
2810       else if (ASCII_BYTE_P (c))
2811         ENCODE_ISO_CHARACTER (c);
2812       else if (SINGLE_BYTE_CHAR_P (c))
2813         {
2814           *dst++ = c;
2815           coding->errors++;
2816         }
2817       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2818                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2819         ENCODE_UNSAFE_CHARACTER (c);
2820       else
2821         ENCODE_ISO_CHARACTER (c);
2822
2823       coding->consumed_char++;
2824     }
2825
2826  label_end_of_loop:
2827   coding->consumed = src_base - source;
2828   coding->produced = coding->produced_char = dst - destination;
2829 }
2830
2831 \f
2832 /*** 4. SJIS and BIG5 handlers ***/
2833
2834 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2835    quite widely.  So, for the moment, Emacs supports them in the bare
2836    C code.  But, in the future, they may be supported only by CCL.  */
2837
2838 /* SJIS is a coding system encoding three character sets: ASCII, right
2839    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2840    as is.  A character of charset katakana-jisx0201 is encoded by
2841    "position-code + 0x80".  A character of charset japanese-jisx0208
2842    is encoded in 2-byte but two position-codes are divided and shifted
2843    so that it fits in the range below.
2844
2845    --- CODE RANGE of SJIS ---
2846    (character set)      (range)
2847    ASCII                0x00 .. 0x7F
2848    KATAKANA-JISX0201    0xA1 .. 0xDF
2849    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2850             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2851    -------------------------------
2852
2853 */
2854
2855 /* BIG5 is a coding system encoding two character sets: ASCII and
2856    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2857    character set and is encoded in two bytes.
2858
2859    --- CODE RANGE of BIG5 ---
2860    (character set)      (range)
2861    ASCII                0x00 .. 0x7F
2862    Big5 (1st byte)      0xA1 .. 0xFE
2863         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2864    --------------------------
2865
2866    Since the number of characters in Big5 is larger than maximum
2867    characters in Emacs' charset (96x96), it can't be handled as one
2868    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2869    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2870    contains frequently used characters and the latter contains less
2871    frequently used characters.  */
2872
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansions, so callers may
   pass arbitrary expressions as inputs.  Arguments are still
   evaluated more than once and therefore must not have side effects.
   The output arguments may name the same variables as the input
   arguments: each input is read before the output that could alias
   it is assigned.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 byte pair B1, B2.  Rows whose
   1st byte is below 0xC9 belong to `charset_big5_1', the remaining
   ones to `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Set the BIG5 byte pair B1, B2 from CHARSET and the position-codes
   C1, C2.  This is the inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2905
2906 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2907    Check if a text is encoded in SJIS.  If it is, return
2908    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2909
2910 static int
2911 detect_coding_sjis (src, src_end, multibytep)
2912      unsigned char *src, *src_end;
2913      int multibytep;
2914 {
2915   int c;
2916   /* Dummy for ONE_MORE_BYTE.  */
2917   struct coding_system dummy_coding;
2918   struct coding_system *coding = &dummy_coding;
2919
2920   while (1)
2921     {
2922       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2923       if (c < 0x80)
2924         continue;
2925       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2926         return 0;
2927       if (c <= 0x9F || c >= 0xE0)
2928         {
2929           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2930           if (c < 0x40 || c == 0x7F || c > 0xFC)
2931             return 0;
2932         }
2933     }
2934 }
2935
2936 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2937    Check if a text is encoded in BIG5.  If it is, return
2938    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2939
2940 static int
2941 detect_coding_big5 (src, src_end, multibytep)
2942      unsigned char *src, *src_end;
2943      int multibytep;
2944 {
2945   int c;
2946   /* Dummy for ONE_MORE_BYTE.  */
2947   struct coding_system dummy_coding;
2948   struct coding_system *coding = &dummy_coding;
2949
2950   while (1)
2951     {
2952       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2953       if (c < 0x80)
2954         continue;
2955       if (c < 0xA1 || c > 0xFE)
2956         return 0;
2957       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2958       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2959         return 0;
2960     }
2961 }
2962
2963 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2964    Check if a text is encoded in UTF-8.  If it is, return
2965    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2966
2967 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2968 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2969 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2970 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2971 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2972 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2973 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2974
2975 static int
2976 detect_coding_utf_8 (src, src_end, multibytep)
2977      unsigned char *src, *src_end;
2978      int multibytep;
2979 {
2980   unsigned char c;
2981   int seq_maybe_bytes;
2982   /* Dummy for ONE_MORE_BYTE.  */
2983   struct coding_system dummy_coding;
2984   struct coding_system *coding = &dummy_coding;
2985
2986   while (1)
2987     {
2988       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2989       if (UTF_8_1_OCTET_P (c))
2990         continue;
2991       else if (UTF_8_2_OCTET_LEADING_P (c))
2992         seq_maybe_bytes = 1;
2993       else if (UTF_8_3_OCTET_LEADING_P (c))
2994         seq_maybe_bytes = 2;
2995       else if (UTF_8_4_OCTET_LEADING_P (c))
2996         seq_maybe_bytes = 3;
2997       else if (UTF_8_5_OCTET_LEADING_P (c))
2998         seq_maybe_bytes = 4;
2999       else if (UTF_8_6_OCTET_LEADING_P (c))
3000         seq_maybe_bytes = 5;
3001       else
3002         return 0;
3003
3004       do
3005         {
3006           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3007           if (!UTF_8_EXTRA_OCTET_P (c))
3008             return 0;
3009           seq_maybe_bytes--;
3010         }
3011       while (seq_maybe_bytes > 0);
3012     }
3013 }
3014
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking for a byte order
   mark in its first two bytes.  If the mark is found, return
   CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE
   accordingly, else return 0.  */

/* Nonzero if the 16-bit value VAL is never a valid UTF-16 code unit.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit value VAL is a high (leading) surrogate,
   i.e. lies in 0xD800..0xDBFF.  The top six bits identify the range,
   hence the 0xFC00 mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit value VAL is a low (trailing) surrogate,
   i.e. lies in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3030
3031 static int
3032 detect_coding_utf_16 (src, src_end, multibytep)
3033      unsigned char *src, *src_end;
3034      int multibytep;
3035 {
3036   unsigned char c1, c2;
3037   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3038   struct coding_system dummy_coding;
3039   struct coding_system *coding = &dummy_coding;
3040
3041   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3042   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3043
3044   if ((c1 == 0xFF) && (c2 == 0xFE))
3045     return CODING_CATEGORY_MASK_UTF_16_LE;
3046   else if ((c1 == 0xFE) && (c2 == 0xFF))
3047     return CODING_CATEGORY_MASK_UTF_16_BE;
3048   return 0;
3049 }
3050
3051 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3052    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3053
3054 static void
3055 decode_coding_sjis_big5 (coding, source, destination,
3056                          src_bytes, dst_bytes, sjis_p)
3057      struct coding_system *coding;
3058      const unsigned char *source;
3059      unsigned char  *destination;
3060      int src_bytes, dst_bytes;
3061      int sjis_p;
3062 {
3063   const unsigned char *src = source;
3064   const unsigned char *src_end = source + src_bytes;
3065   unsigned char *dst = destination;
3066   unsigned char *dst_end = destination + dst_bytes;
3067   /* SRC_BASE remembers the start position in source in each loop.
3068      The loop will be exited when there's not enough source code
3069      (within macro ONE_MORE_BYTE), or when there's not enough
3070      destination area to produce a character (within macro
3071      EMIT_CHAR).  */
3072   const unsigned char *src_base;
3073   Lisp_Object translation_table;
3074
3075   if (NILP (Venable_character_translation))
3076     translation_table = Qnil;
3077   else
3078     {
3079       translation_table = coding->translation_table_for_decode;
3080       if (NILP (translation_table))
3081         translation_table = Vstandard_translation_table_for_decode;
3082     }
3083
3084   coding->produced_char = 0;
3085   while (1)
3086     {
3087       int c, charset, c1, c2 = 0;
3088
3089       src_base = src;
3090       ONE_MORE_BYTE (c1);
3091
3092       if (c1 < 0x80)
3093         {
3094           charset = CHARSET_ASCII;
3095           if (c1 < 0x20)
3096             {
3097               if (c1 == '\r')
3098                 {
3099                   if (coding->eol_type == CODING_EOL_CRLF)
3100                     {
3101                       ONE_MORE_BYTE (c2);
3102                       if (c2 == '\n')
3103                         c1 = c2;
3104                       else
3105                         /* To process C2 again, SRC is subtracted by 1.  */
3106                         src--;
3107                     }
3108                   else if (coding->eol_type == CODING_EOL_CR)
3109                     c1 = '\n';
3110                 }
3111               else if (c1 == '\n'
3112                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3113                        && (coding->eol_type == CODING_EOL_CR
3114                            || coding->eol_type == CODING_EOL_CRLF))
3115                 {
3116                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3117                   goto label_end_of_loop;
3118                 }
3119             }
3120         }
3121       else
3122         {
3123           if (sjis_p)
3124             {
3125               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3126                 goto label_invalid_code;
3127               if (c1 <= 0x9F || c1 >= 0xE0)
3128                 {
3129                   /* SJIS -> JISX0208 */
3130                   ONE_MORE_BYTE (c2);
3131                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3132                     goto label_invalid_code;
3133                   DECODE_SJIS (c1, c2, c1, c2);
3134                   charset = charset_jisx0208;
3135                 }
3136               else
3137                 /* SJIS -> JISX0201-Kana */
3138                 charset = charset_katakana_jisx0201;
3139             }
3140           else
3141             {
3142               /* BIG5 -> Big5 */
3143               if (c1 < 0xA0 || c1 > 0xFE)
3144                 goto label_invalid_code;
3145               ONE_MORE_BYTE (c2);
3146               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3147                 goto label_invalid_code;
3148               DECODE_BIG5 (c1, c2, charset, c1, c2);
3149             }
3150         }
3151
3152       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3153       EMIT_CHAR (c);
3154       continue;
3155
3156     label_invalid_code:
3157       coding->errors++;
3158       src = src_base;
3159       c = *src++;
3160       EMIT_CHAR (c);
3161     }
3162
3163  label_end_of_loop:
3164   coding->consumed = coding->consumed_char = src_base - source;
3165   coding->produced = dst - destination;
3166   return;
3167 }
3168
3169 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3170    This function can encode charsets `ascii', `katakana-jisx0201',
3171    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3172    are sure that all these charsets are registered as official charset
3173    (i.e. do not have extended leading-codes).  Characters of other
3174    charsets are produced without any encoding.  If SJIS_P is 1, encode
3175    SJIS text, else encode BIG5 text.  */
3176
3177 static void
3178 encode_coding_sjis_big5 (coding, source, destination,
3179                          src_bytes, dst_bytes, sjis_p)
3180      struct coding_system *coding;
3181      unsigned char *source, *destination;
3182      int src_bytes, dst_bytes;
3183      int sjis_p;
3184 {
3185   unsigned char *src = source;
3186   unsigned char *src_end = source + src_bytes;
3187   unsigned char *dst = destination;
3188   unsigned char *dst_end = destination + dst_bytes;
3189   /* SRC_BASE remembers the start position in source in each loop.
3190      The loop will be exited when there's not enough source text to
3191      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3192      there's not enough destination area to produce encoded codes
3193      (within macro EMIT_BYTES).  */
3194   unsigned char *src_base;
3195   Lisp_Object translation_table;
3196
3197   if (NILP (Venable_character_translation))
3198     translation_table = Qnil;
3199   else
3200     {
3201       translation_table = coding->translation_table_for_encode;
3202       if (NILP (translation_table))
3203         translation_table = Vstandard_translation_table_for_encode;
3204     }
3205
3206   while (1)
3207     {
3208       int c, charset, c1, c2;
3209
3210       src_base = src;
3211       ONE_MORE_CHAR (c);
3212
3213       /* Now encode the character C.  */
3214       if (SINGLE_BYTE_CHAR_P (c))
3215         {
3216           switch (c)
3217             {
3218             case '\r':
3219               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3220                 {
3221                   EMIT_ONE_BYTE (c);
3222                   break;
3223                 }
3224               c = '\n';
3225             case '\n':
3226               if (coding->eol_type == CODING_EOL_CRLF)
3227                 {
3228                   EMIT_TWO_BYTES ('\r', c);
3229                   break;
3230                 }
3231               else if (coding->eol_type == CODING_EOL_CR)
3232                 c = '\r';
3233             default:
3234               EMIT_ONE_BYTE (c);
3235             }
3236         }
3237       else
3238         {
3239           SPLIT_CHAR (c, charset, c1, c2);
3240           if (sjis_p)
3241             {
3242               if (charset == charset_jisx0208
3243                   || charset == charset_jisx0208_1978)
3244                 {
3245                   ENCODE_SJIS (c1, c2, c1, c2);
3246                   EMIT_TWO_BYTES (c1, c2);
3247                 }
3248               else if (charset == charset_katakana_jisx0201)
3249                 EMIT_ONE_BYTE (c1 | 0x80);
3250               else if (charset == charset_latin_jisx0201)
3251                 EMIT_ONE_BYTE (c1);
3252               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3253                 {
3254                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3255                   if (CHARSET_WIDTH (charset) > 1)
3256                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3257                 }
3258               else
3259                 /* There's no way other than producing the internal
3260                    codes as is.  */
3261                 EMIT_BYTES (src_base, src);
3262             }
3263           else
3264             {
3265               if (charset == charset_big5_1 || charset == charset_big5_2)
3266                 {
3267                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3268                   EMIT_TWO_BYTES (c1, c2);
3269                 }
3270               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3271                 {
3272                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3273                   if (CHARSET_WIDTH (charset) > 1)
3274                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3275                 }
3276               else
3277                 /* There's no way other than producing the internal
3278                    codes as is.  */
3279                 EMIT_BYTES (src_base, src);
3280             }
3281         }
3282       coding->consumed_char++;
3283     }
3284
3285  label_end_of_loop:
3286   coding->consumed = src_base - source;
3287   coding->produced = coding->produced_char = dst - destination;
3288 }
3289
3290 \f
3291 /*** 5. CCL handlers ***/
3292
3293 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3294    Check if a text is encoded in a coding system of which
3295    encoder/decoder are written in CCL program.  If it is, return
3296    CODING_CATEGORY_MASK_CCL, else return 0.  */
3297
3298 static int
3299 detect_coding_ccl (src, src_end, multibytep)
3300      unsigned char *src, *src_end;
3301      int multibytep;
3302 {
3303   unsigned char *valid;
3304   int c;
3305   /* Dummy for ONE_MORE_BYTE.  */
3306   struct coding_system dummy_coding;
3307   struct coding_system *coding = &dummy_coding;
3308
3309   /* No coding system is assigned to coding-category-ccl.  */
3310   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3311     return 0;
3312
3313   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3314   while (1)
3315     {
3316       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3317       if (! valid[c])
3318         return 0;
3319     }
3320 }
3321
3322 \f
3323 /*** 6. End-of-line handlers ***/
3324
3325 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3326
3327 static void
3328 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3329      struct coding_system *coding;
3330      const unsigned char *source;
3331      unsigned char *destination;
3332      int src_bytes, dst_bytes;
3333 {
3334   const unsigned char *src = source;
3335   unsigned char *dst = destination;
3336   const unsigned char *src_end = src + src_bytes;
3337   unsigned char *dst_end = dst + dst_bytes;
3338   Lisp_Object translation_table;
3339   /* SRC_BASE remembers the start position in source in each loop.
3340      The loop will be exited when there's not enough source code
3341      (within macro ONE_MORE_BYTE), or when there's not enough
3342      destination area to produce a character (within macro
3343      EMIT_CHAR).  */
3344   const unsigned char *src_base;
3345   int c;
3346
3347   translation_table = Qnil;
3348   switch (coding->eol_type)
3349     {
3350     case CODING_EOL_CRLF:
3351       while (1)
3352         {
3353           src_base = src;
3354           ONE_MORE_BYTE (c);
3355           if (c == '\r')
3356             {
3357               ONE_MORE_BYTE (c);
3358               if (c != '\n')
3359                 {
3360                   src--;
3361                   c = '\r';
3362                 }
3363             }
3364           else if (c == '\n'
3365                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3366             {
3367               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3368               goto label_end_of_loop;
3369             }
3370           EMIT_CHAR (c);
3371         }
3372       break;
3373
3374     case CODING_EOL_CR:
3375       while (1)
3376         {
3377           src_base = src;
3378           ONE_MORE_BYTE (c);
3379           if (c == '\n')
3380             {
3381               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3382                 {
3383                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3384                   goto label_end_of_loop;
3385                 }
3386             }
3387           else if (c == '\r')
3388             c = '\n';
3389           EMIT_CHAR (c);
3390         }
3391       break;
3392
3393     default:                    /* no need for EOL handling */
3394       while (1)
3395         {
3396           src_base = src;
3397           ONE_MORE_BYTE (c);
3398           EMIT_CHAR (c);
3399         }
3400     }
3401
3402  label_end_of_loop:
3403   coding->consumed = coding->consumed_char = src_base - source;
3404   coding->produced = dst - destination;
3405   return;
3406 }
3407
3408 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3409    format of end-of-line according to `coding->eol_type'.  It also
3410    convert multibyte form 8-bit characters to unibyte if
3411    CODING->src_multibyte is nonzero.  If `coding->mode &
3412    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3413    also means end-of-line.  */
3414
3415 static void
3416 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3417      struct coding_system *coding;
3418      const unsigned char *source;
3419      unsigned char *destination;
3420      int src_bytes, dst_bytes;
3421 {
3422   const unsigned char *src = source;
3423   unsigned char *dst = destination;
3424   const unsigned char *src_end = src + src_bytes;
3425   unsigned char *dst_end = dst + dst_bytes;
3426   Lisp_Object translation_table;
3427   /* SRC_BASE remembers the start position in source in each loop.
3428      The loop will be exited when there's not enough source text to
3429      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3430      there's not enough destination area to produce encoded codes
3431      (within macro EMIT_BYTES).  */
3432   const unsigned char *src_base;
3433   unsigned char *tmp;
3434   int c;
3435   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3436
3437   translation_table = Qnil;
3438   if (coding->src_multibyte
3439       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3440     {
3441       src_end--;
3442       src_bytes--;
3443       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3444     }
3445
3446   if (coding->eol_type == CODING_EOL_CRLF)
3447     {
3448       while (src < src_end)
3449         {
3450           src_base = src;
3451           c = *src++;
3452           if (c >= 0x20)
3453             EMIT_ONE_BYTE (c);
3454           else if (c == '\n' || (c == '\r' && selective_display))
3455             EMIT_TWO_BYTES ('\r', '\n');
3456           else
3457             EMIT_ONE_BYTE (c);
3458         }
3459       src_base = src;
3460     label_end_of_loop:
3461       ;
3462     }
3463   else
3464     {
3465       if (!dst_bytes || src_bytes <= dst_bytes)
3466         {
3467           safe_bcopy (src, dst, src_bytes);
3468           src_base = src_end;
3469           dst += src_bytes;
3470         }
3471       else
3472         {
3473           if (coding->src_multibyte
3474               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3475             dst_bytes--;
3476           safe_bcopy (src, dst, dst_bytes);
3477           src_base = src + dst_bytes;
3478           dst = destination + dst_bytes;
3479           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3480         }
3481       if (coding->eol_type == CODING_EOL_CR)
3482         {
3483           for (tmp = destination; tmp < dst; tmp++)
3484             if (*tmp == '\n') *tmp = '\r';
3485         }
3486       else if (selective_display)
3487         {
3488           for (tmp = destination; tmp < dst; tmp++)
3489             if (*tmp == '\r') *tmp = '\n';
3490         }
3491     }
3492   if (coding->src_multibyte)
3493     dst = destination + str_as_unibyte (destination, dst - destination);
3494
3495   coding->consumed = src_base - source;
3496   coding->produced = dst - destination;
3497   coding->produced_char = coding->produced;
3498 }
3499
3500 \f
3501 /*** 7. C library functions ***/
3502
3503 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3504    has a property `coding-system'.  The value of this property is a
3505    vector of length 5 (called the coding-vector).  Among elements of
3506    this vector, the first (element[0]) and the fifth (element[4])
3507    carry important information for decoding/encoding.  Before
3508    decoding/encoding, this information should be set in fields of a
3509    structure of type `coding_system'.
3510
3511    The value of the property `coding-system' can be a symbol of another
3512    subsidiary coding-system.  In that case, Emacs gets coding-vector
3513    from that symbol.
3514
3515    `element[0]' contains information to be set in `coding->type'.  The
3516    possible values and their meanings are as follows:
3517
3518    0 -- coding_type_emacs_mule
3519    1 -- coding_type_sjis
3520    2 -- coding_type_iso2022
3521    3 -- coding_type_big5
3522    4 -- coding_type_ccl encoder/decoder written in CCL
3523    nil -- coding_type_no_conversion
3524    t -- coding_type_undecided (automatic conversion on decoding,
3525                                no-conversion on encoding)
3526
3527    `element[4]' contains information to be set in `coding->flags' and
3528    `coding->spec'.  The meaning varies by `coding->type'.
3529
3530    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3531    of length 32 (of which the first 13 sub-elements are used now).
3532    Meanings of these sub-elements are:
3533
3534    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3535         If the value is an integer of valid charset, the charset is
3536         assumed to be designated to graphic register N initially.
3537
3538         If the value is negative, its absolute value is a charset which
3539         reserves graphic register N; that is, the charset is
3540         not designated initially but should be designated to graphic
3541         register N just before encoding a character in that charset.
3542
3543         If the value is nil, graphic register N is never used on
3544         encoding.
3545
3546    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
3547         Each value takes t or nil.  See the section ISO2022 of
3548         `coding.h' for more information.
3549
3550    If `coding->type' is `coding_type_big5', element[4] is t to denote
3551    BIG5-ETen or nil to denote BIG5-HKU.
3552
3553    If `coding->type' takes the other value, element[4] is ignored.
3554
3555    Emacs Lisp's coding systems also carry information about format of
3556    end-of-line in a value of property `eol-type'.  If the value is
3557    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3558    means CODING_EOL_CR.  If it is not integer, it should be a vector
3559    of subsidiary coding systems of which property `eol-type' has one
3560    of the above values.
3561
3562 */
3563
3564 /* Set up CODING for decoding/encoding by CODING_SYSTEM, using its
3565    `coding-system' and `eol-type' properties.  If CODING_SYSTEM is nil
3566    or invalid, CODING is set up so that no conversion is done; return
3567    -1 for a non-nil invalid CODING_SYSTEM, else return 0.  */
3568
3569 int
3570 setup_coding_system (coding_system, coding)
3571      Lisp_Object coding_system;
3572      struct coding_system *coding;
3573 {
3574   Lisp_Object coding_spec, coding_type, eol_type, plist;
3575   Lisp_Object val;
3576
3577   /* At first, zero clear all members.  */
3578   bzero (coding, sizeof (struct coding_system));
3579
3580   /* Initialize some fields required for all kinds of coding systems.  */
3581   coding->symbol = coding_system;
3582   coding->heading_ascii = -1;
3583   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3584   coding->composing = COMPOSITION_DISABLED;
3585   coding->cmp_data = NULL;
3586
3587   if (NILP (coding_system))
3588     goto label_invalid_coding_system;
3589
3590   coding_spec = Fget (coding_system, Qcoding_system);
3591
3592   if (!VECTORP (coding_spec)
3593       || XVECTOR (coding_spec)->size != 5
3594       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3595     goto label_invalid_coding_system;
3596   /* A vector `eol-type' means the end-of-line format is undecided.  */
3597   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3598   if (VECTORP (eol_type))
3599     {
3600       coding->eol_type = CODING_EOL_UNDECIDED;
3601       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3602       if (system_eol_type != CODING_EOL_LF)
3603         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3604     }
3605   else if (XFASTINT (eol_type) == 1)
3606     {
3607       coding->eol_type = CODING_EOL_CRLF;
3608       coding->common_flags
3609         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3610     }
3611   else if (XFASTINT (eol_type) == 2)
3612     {
3613       coding->eol_type = CODING_EOL_CR;
3614       coding->common_flags
3615         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3616     }
3617   else
3618     {
3619       coding->common_flags = 0;
3620       coding->eol_type = CODING_EOL_LF;
3621     }
3622
3623   coding_type = XVECTOR (coding_spec)->contents[0];
3624   /* Short cut: type t means undecided, any other symbol no-conversion.  */
3625   if (SYMBOLP (coding_type))
3626     {
3627       if (EQ (coding_type, Qt))
3628         {
3629           coding->type = coding_type_undecided;
3630           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3631         }
3632       else
3633         coding->type = coding_type_no_conversion;
3634       /* Initialize this member.  Any thing other than
3635          CODING_CATEGORY_IDX_UTF_16_BE and
3636          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3637          special treatment in detect_eol.  */
3638       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3639
3640       return 0;
3641     }
3642
3643   /* Get values of coding system properties:
3644      `post-read-conversion', `pre-write-conversion',
3645      `translation-table-for-decode', `translation-table-for-encode'.  */
3646   plist = XVECTOR (coding_spec)->contents[3];
3647   /* Pre & post conversion functions should be disabled if
3648      inhibit_pre_post_conversion is nonzero.  This is the case that a
3649      code conversion function is called while those functions are running.  */
3650   if (! inhibit_pre_post_conversion)
3651     {
3652       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3653       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3654     }
3655   val = Fplist_get (plist, Qtranslation_table_for_decode);
3656   if (SYMBOLP (val))
3657     val = Fget (val, Qtranslation_table_for_decode);
3658   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3659   val = Fplist_get (plist, Qtranslation_table_for_encode);
3660   if (SYMBOLP (val))
3661     val = Fget (val, Qtranslation_table_for_encode);
3662   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3663   val = Fplist_get (plist, Qcoding_category);
3664   if (!NILP (val))
3665     {
3666       val = Fget (val, Qcoding_category_index);
3667       if (INTEGERP (val))
3668         coding->category_idx = XINT (val);
3669       else
3670         goto label_invalid_coding_system;
3671     }
3672   else
3673     goto label_invalid_coding_system;
3674
3675   /* If the coding system has non-nil `composition' property, enable
3676      composition handling.  */
3677   val = Fplist_get (plist, Qcomposition);
3678   if (!NILP (val))
3679     coding->composing = COMPOSITION_NO;
3680
3681   /* If the coding system is ascii-incompatible, record it in
3682      common_flags.   */
3683   val = Fplist_get (plist, Qascii_incompatible);
3684   if (! NILP (val))
3685     coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3686   /* Dispatch on the numeric coding type (element[0] of the spec).  */
3687   switch (XFASTINT (coding_type))
3688     {
3689     case 0:
3690       coding->type = coding_type_emacs_mule;
3691       coding->common_flags
3692         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3693       if (!NILP (coding->post_read_conversion))
3694         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3695       if (!NILP (coding->pre_write_conversion))
3696         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3697       break;
3698
3699     case 1:
3700       coding->type = coding_type_sjis;
3701       coding->common_flags
3702         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3703       break;
3704
3705     case 2:
3706       coding->type = coding_type_iso2022;
3707       coding->common_flags
3708         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3709       {
3710         Lisp_Object val, temp;
3711         Lisp_Object *flags;
3712         int i, charset, reg_bits = 0;
3713
3714         val = XVECTOR (coding_spec)->contents[4];
3715         /* element[4] must be the 32-element vector of ISO2022 flags.  */
3716         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3717           goto label_invalid_coding_system;
3718
3719         flags = XVECTOR (val)->contents;
3720         coding->flags
3721           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3722              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3723              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3724              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3725              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3726              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3727              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3728              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3729              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3730              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3731              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3732              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3733              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3734              );
3735
3736         /* Invoke graphic register 0 to plane 0.  */
3737         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3738         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3739         CODING_SPEC_ISO_INVOCATION (coding, 1)
3740           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3741         /* Not single shifting at first.  */
3742         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3743         /* Beginning of buffer should also be regarded as bol. */
3744         CODING_SPEC_ISO_BOL (coding) = 1;
3745         /* Default every revision number to 255, then apply the alist.  */
3746         for (charset = 0; charset <= MAX_CHARSET; charset++)
3747           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3748         val = Vcharset_revision_alist;
3749         while (CONSP (val))
3750           {
3751             charset = get_charset_id (Fcar_safe (XCAR (val)));
3752             if (charset >= 0
3753                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3754                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3755               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3756             val = XCDR (val);
3757           }
3758
3759         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3760            FLAGS[REG] can be one of below:
3761                 integer CHARSET: CHARSET occupies register REG,
3762                 t: designate nothing to REG initially, but can be used
3763                   by any charsets,
3764                 list of integer, nil, or t: designate the first
3765                   element (if integer) to REG initially, the remaining
3766                   elements (if integer) is designated to REG on request,
3767                   if an element is t, REG can be used by any charsets,
3768                 nil: REG is never used.  */
3769         for (charset = 0; charset <= MAX_CHARSET; charset++)
3770           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3771             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3772         for (i = 0; i < 4; i++)
3773           {
3774             if ((INTEGERP (flags[i])
3775                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3776                 || (charset = get_charset_id (flags[i])) >= 0)
3777               {
3778                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3779                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3780               }
3781             else if (EQ (flags[i], Qt))
3782               {
3783                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3784                 reg_bits |= 1 << i;
3785                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3786               }
3787             else if (CONSP (flags[i]))
3788               {
3789                 Lisp_Object tail;
3790                 tail = flags[i];
3791
3792                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3793                 if ((INTEGERP (XCAR (tail))
3794                      && (charset = XINT (XCAR (tail)),
3795                          CHARSET_VALID_P (charset)))
3796                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3797                   {
3798                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3799                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3800                   }
3801                 else
3802                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3803                 tail = XCDR (tail);
3804                 while (CONSP (tail))
3805                   {
3806                     if ((INTEGERP (XCAR (tail))
3807                          && (charset = XINT (XCAR (tail)),
3808                              CHARSET_VALID_P (charset)))
3809                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3810                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3811                         = i;
3812                     else if (EQ (XCAR (tail), Qt))
3813                       reg_bits |= 1 << i;
3814                     tail = XCDR (tail);
3815                   }
3816               }
3817             else
3818               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3819
3820             CODING_SPEC_ISO_DESIGNATION (coding, i)
3821               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3822           }
3823         /* REG_BITS has bit N set if register N may take any charset.  */
3824         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3825           {
3826             /* REG 1 can be used only by locking shift in 7-bit env.  */
3827             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3828               reg_bits &= ~2;
3829             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3830               /* Without any shifting, only REG 0 and 1 can be used.  */
3831               reg_bits &= 3;
3832           }
3833
3834         if (reg_bits)
3835           for (charset = 0; charset <= MAX_CHARSET; charset++)
3836             {
3837               if (CHARSET_DEFINED_P (charset)
3838                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3839                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3840                 {
3841                   /* There exist some default graphic registers to be
3842                      used by CHARSET.  */
3843
3844                   /* We had better avoid designating a charset of
3845                      CHARS96 to REG 0 as far as possible.  */
3846                   if (CHARSET_CHARS (charset) == 96)
3847                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3848                       = (reg_bits & 2
3849                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3850                   else
3851                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3852                       = (reg_bits & 1
3853                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3854                 }
3855             }
3856       }
3857       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3858       coding->spec.iso2022.last_invalid_designation_register = -1;
3859       break;
3860
3861     case 3:
3862       coding->type = coding_type_big5;
3863       coding->common_flags
3864         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3865       coding->flags
3866         = (NILP (XVECTOR (coding_spec)->contents[4])
3867            ? CODING_FLAG_BIG5_HKU
3868            : CODING_FLAG_BIG5_ETEN);
3869       break;
3870
3871     case 4:
3872       coding->type = coding_type_ccl;
3873       coding->common_flags
3874         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3875       {
3876         val = XVECTOR (coding_spec)->contents[4];
3877         if (! CONSP (val)
3878             || setup_ccl_program (&(coding->spec.ccl.decoder),
3879                                   XCAR (val)) < 0
3880             || setup_ccl_program (&(coding->spec.ccl.encoder),
3881                                   XCDR (val)) < 0)
3882           goto label_invalid_coding_system;
3883         /* Mark codes listed under Qvalid_codes: integers or (START . END).  */
3884         bzero (coding->spec.ccl.valid_codes, 256);
3885         val = Fplist_get (plist, Qvalid_codes);
3886         if (CONSP (val))
3887           {
3888             Lisp_Object this;
3889
3890             for (; CONSP (val); val = XCDR (val))
3891               {
3892                 this = XCAR (val);
3893                 if (INTEGERP (this)
3894                     && XINT (this) >= 0 && XINT (this) < 256)
3895                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3896                 else if (CONSP (this)
3897                          && INTEGERP (XCAR (this))
3898                          && INTEGERP (XCDR (this)))
3899                   {
3900                     int start = XINT (XCAR (this));
3901                     int end = XINT (XCDR (this));
3902
3903                     if (start >= 0 && start <= end && end < 256)
3904                       while (start <= end)
3905                         coding->spec.ccl.valid_codes[start++] = 1;
3906                   }
3907               }
3908           }
3909       }
3910       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3911       coding->spec.ccl.cr_carryover = 0;
3912       coding->spec.ccl.eight_bit_carryover[0] = 0;
3913       break;
3914
3915     case 5:
3916       coding->type = coding_type_raw_text;
3917       break;
3918
3919     default:
3920       goto label_invalid_coding_system;
3921     }
3922   return 0;
3923   /* Fallback: make CODING a no-conversion coding system.  */
3924  label_invalid_coding_system:
3925   coding->type = coding_type_no_conversion;
3926   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3927   coding->common_flags = 0;
3928   coding->eol_type = CODING_EOL_UNDECIDED;
3929   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3930   return NILP (coding_system) ? 0 : -1;  /* nil is not an error.  */
3931 }
3932
3933 /* Release the whole chain of composition data blocks attached to
3934    CODING and reset CODING->cmp_data to NULL.  */
3935 void
3936 coding_free_composition_data (coding)
3937      struct coding_system *coding;
3938 {
3939   struct composition_data *block, *following;
3940
3941   if (coding->cmp_data == NULL)
3942     return;
3943
3944   /* The blocks are linked both ways: find the head of the chain first.  */
3945   for (block = coding->cmp_data; block->prev != NULL; block = block->prev)
3946     ;
3947   /* Then release them front to back, saving each link before xfree.  */
3948   for (; block != NULL; block = following)
3949     {
3950       following = block->next;
3951       xfree (block);
3952     }
3953   coding->cmp_data = NULL;
3954 }
3955
3956 /* Set `char_offset' of each block chained from CODING->cmp_data to POS.  */
3957 void
3958 coding_adjust_composition_offset (coding, pos)
3959      struct coding_system *coding;
3960      int pos;
3961 {
3962   struct composition_data *block = coding->cmp_data;
3963   while (block != NULL)
3964     {
3965       block->char_offset = pos;
3966       block = block->next;
3967     }
3968 }
3969
3970 /* Switch CODING over to `raw-text', or to the subsidiary of
3971    `raw-text' that matches the end-of-line type already recorded in
3972    CODING.  CODING must have been set up for some coding system
3973    beforehand; nothing happens if it is already of raw-text type.  */
3974
3975 void
3976 setup_raw_text_coding_system (coding)
3977      struct coding_system *coding;
3978 {
3979   Lisp_Object eol_variants;
3980
3981   if (coding->type == coding_type_raw_text)
3982     return;
3983
3984   coding->symbol = Qraw_text;
3985   coding->type = coding_type_raw_text;
3986   if (coding->eol_type != CODING_EOL_UNDECIDED)
3987     {
3988       /* The `eol-type' property of raw-text holds its subsidiaries,
3989          indexed by end-of-line type.  */
3990       eol_variants = Fget (Qraw_text, Qeol_type);
3991       if (VECTORP (eol_variants) && XVECTOR (eol_variants)->size == 3)
3992         coding->symbol = XVECTOR (eol_variants)->contents[coding->eol_type];
3993     }
3994   /* This sets up every member of CODING anew from the chosen symbol.  */
3995   setup_coding_system (coding->symbol, coding);
3996 }
3997
3998 /* Emacs has a mechanism to automatically detect a coding system if it
3999    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4000    it's impossible to distinguish some coding systems accurately
4001    because they use the same range of codes.  So, at first, coding
4002    systems are categorized into the 14 categories listed below:
4003
4004    o coding-category-emacs-mule
4005
4006         The category for a coding system which has the same code range
4007         as Emacs' internal format.  Assigned the coding-system (Lisp
4008         symbol) `emacs-mule' by default.
4009
4010    o coding-category-sjis
4011
4012         The category for a coding system which has the same code range
4013         as SJIS.  Assigned the coding-system (Lisp
4014         symbol) `japanese-shift-jis' by default.
4015
4016    o coding-category-iso-7
4017
4018         The category for a coding system which has the same code range
4019         as ISO2022 of 7-bit environment.  This doesn't use any locking
4020         shift and single shift functions.  This can encode/decode all
4021         charsets.  Assigned the coding-system (Lisp symbol)
4022         `iso-2022-7bit' by default.
4023
4024    o coding-category-iso-7-tight
4025
4026         Same as coding-category-iso-7 except that this can
4027         encode/decode only the specified charsets.
4028
4029    o coding-category-iso-8-1
4030
4031         The category for a coding system which has the same code range
4032         as ISO2022 of 8-bit environment and graphic plane 1 used only
4033         for DIMENSION1 charset.  This doesn't use any locking shift
4034         and single shift functions.  Assigned the coding-system (Lisp
4035         symbol) `iso-latin-1' by default.
4036
4037    o coding-category-iso-8-2
4038
4039         The category for a coding system which has the same code range
4040         as ISO2022 of 8-bit environment and graphic plane 1 used only
4041         for DIMENSION2 charset.  This doesn't use any locking shift
4042         and single shift functions.  Assigned the coding-system (Lisp
4043         symbol) `japanese-iso-8bit' by default.
4044
4045    o coding-category-iso-7-else
4046
4047         The category for a coding system which has the same code range
4048         as ISO2022 of 7-bit environment but uses locking shift or
4049         single shift functions.  Assigned the coding-system (Lisp
4050         symbol) `iso-2022-7bit-lock' by default.
4051
4052    o coding-category-iso-8-else
4053
4054         The category for a coding system which has the same code range
4055         as ISO2022 of 8-bit environment but uses locking shift or
4056         single shift functions.  Assigned the coding-system (Lisp
4057         symbol) `iso-2022-8bit-ss2' by default.
4058
4059    o coding-category-big5
4060
4061         The category for a coding system which has the same code range
4062         as BIG5.  Assigned the coding-system (Lisp symbol)
4063         `cn-big5' by default.
4064
4065    o coding-category-utf-8
4066
4067         The category for a coding system which has the same code range
4068         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4069         symbol) `utf-8' by default.
4070
4071    o coding-category-utf-16-be
4072
4073         The category for a coding system in which a text has an
4074         Unicode signature (cf. Unicode Standard) in the order of BIG
4075         endian at the head.  Assigned the coding-system (Lisp symbol)
4076         `utf-16-be' by default.
4077
4078    o coding-category-utf-16-le
4079
4080         The category for a coding system in which a text has an
4081         Unicode signature (cf. Unicode Standard) in the order of
4082         LITTLE endian at the head.  Assigned the coding-system (Lisp
4083         symbol) `utf-16-le' by default.
4084
4085    o coding-category-ccl
4086
4087         The category for a coding system of which encoder/decoder is
4088         written in CCL programs.  The default value is nil, i.e., no
4089         coding system is assigned.
4090
4091    o coding-category-binary
4092
4093         The category for a coding system not categorized in any of the
4094         above.  Assigned the coding-system (Lisp symbol)
4095         `no-conversion' by default.
4096
4097    Each of them is a Lisp symbol and the value is an actual
4098    `coding-system' (this is also a Lisp symbol) assigned by a user.
4099    What Emacs does actually is to detect a category of coding system.
4100    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4101    decide a single possible category, it selects a category of the
4102    highest priority.  Priorities of categories are also specified by a
4103    user in a Lisp variable `coding-category-list'.
4104
4105 */
4106
4107 static
4108 int ascii_skip_code[256];  /* Nonzero: detect_coding_mask skips the byte.  */
4109
4110 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4111    Return an integer in which flag bits of the possible coding
4112    categories are set (macros CODING_CATEGORY_MASK_XXX in `coding.h'),
4113    or 0 if the text has nothing but bytes skipped as ASCII.  If
4114    PRIORITIES is non-NULL, it should point the table
4115    `coding_priorities'; in that case only the flag bit of the first
4116    matching entry is returned (CODING_CATEGORY_MASK_RAW_TEXT if none
4117    matches).  If MULTIBYTEP is nonzero, 8-bit codes of the range
4118    0x80..0x9F are in multibyte form.
4119    How many ASCII characters are at the head is returned as *SKIP.  */
4120
4121 static int
4122 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4123      unsigned char *source;
4124      int src_bytes, *priorities, *skip;
4125      int multibytep;
4126 {
4127   register unsigned char c;
4128   unsigned char *src = source, *src_end = source + src_bytes;
4129   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4130   int i;
4131
4132   /* At first, skip all ASCII characters and control characters except
4133      for three ISO2022 specific control characters.  */
4134   ascii_skip_code[ISO_CODE_SO] = 0;
4135   ascii_skip_code[ISO_CODE_SI] = 0;
4136   ascii_skip_code[ISO_CODE_ESC] = 0;
4137
4138  label_loop_detect_coding:
4139   while (src < src_end && ascii_skip_code[*src]) src++;
4140   *skip = src - source;
4141
4142   if (src >= src_end)
4143     /* We found nothing other than ASCII.  There's nothing to do.  */
4144     return 0;
4145
4146   c = *src;
4147   /* The text seems to be encoded in some multilingual coding system.
4148      Now, try to find in which coding system the text is encoded.  */
4149   if (c < 0x80)
4150     {
4151       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4152       /* C is an ISO2022 specific control code of C0.  */
4153       mask = detect_coding_iso2022 (src, src_end, multibytep);
4154       if (mask == 0)
4155         {
4156           /* No valid ISO2022 code follows C.  Skip such codes from now
4157              on and try again.  */
4158           if (c == ISO_CODE_ESC)
4159             ascii_skip_code[ISO_CODE_ESC] = 1;
4160           else
4161             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4162           goto label_loop_detect_coding;
4163         }
4164       if (priorities)
4165         {
4166           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4167             {
4168               if (mask & priorities[i])
4169                 return priorities[i];
4170             }
4171           return CODING_CATEGORY_MASK_RAW_TEXT;
4172         }
4173     }
4174   else
4175     {
4176       int try;
4177       /* NOTE(review): src[1] is read without checking src + 1 < src_end.  */
4178       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4179         c = src[1] - 0x20;
4180
4181       if (c < 0xA0)
4182         {
4183           /* C is the first byte of SJIS character code,
4184              or a leading-code of Emacs' internal format (emacs-mule),
4185              or the first byte of UTF-16.  */
4186           try = (CODING_CATEGORY_MASK_SJIS
4187                   | CODING_CATEGORY_MASK_EMACS_MULE
4188                   | CODING_CATEGORY_MASK_UTF_16_BE
4189                   | CODING_CATEGORY_MASK_UTF_16_LE);
4190
4191           /* Or, if C is a special latin extra code, or an ISO2022 C1
4192              control code (SS2 or SS3), or a CSI, also consider ISO2022
4193              codings.  NOTE(review): *SRC in the CSI test is the byte C
4194              came from, not the one following it -- confirm intent.  */
4195           if ((VECTORP (Vlatin_extra_code_table)
4196                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4197               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4198               || (c == ISO_CODE_CSI
4199                   && (src < src_end
4200                       && (*src == ']'
4201                           || ((*src == '0' || *src == '1' || *src == '2')
4202                               && src + 1 < src_end
4203                               && src[1] == ']')))))
4204             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4205                      | CODING_CATEGORY_MASK_ISO_8BIT);
4206         }
4207       else
4208         /* C is a character of ISO2022 in graphic plane right,
4209            or a SJIS's 1-byte character code (i.e. JISX0201),
4210            or the first byte of BIG5's 2-byte code,
4211            or the first byte of UTF-8/16.  */
4212         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4213                 | CODING_CATEGORY_MASK_ISO_8BIT
4214                 | CODING_CATEGORY_MASK_SJIS
4215                 | CODING_CATEGORY_MASK_BIG5
4216                 | CODING_CATEGORY_MASK_UTF_8
4217                 | CODING_CATEGORY_MASK_UTF_16_BE
4218                 | CODING_CATEGORY_MASK_UTF_16_LE);
4219
4220       /* Or, we may have to consider the possibility of CCL.  */
4221       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4222           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4223               ->spec.ccl.valid_codes)[c])
4224         try |= CODING_CATEGORY_MASK_CCL;
4225
4226       mask = 0;
4227       utf16_examined_p = iso2022_examined_p = 0;
4228       if (priorities)
4229         {
4230           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4231             {
4232               if (!iso2022_examined_p
4233                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4234                 {
4235                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4236                   iso2022_examined_p = 1;
4237                 }
4238               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4239                 mask |= detect_coding_sjis (src, src_end, multibytep);
4240               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4241                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4242               else if (!utf16_examined_p
4243                        && (priorities[i] & try &
4244                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4245                 {
4246                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4247                   utf16_examined_p = 1;
4248                 }
4249               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4250                 mask |= detect_coding_big5 (src, src_end, multibytep);
4251               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4252                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4253               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4254                 mask |= detect_coding_ccl (src, src_end, multibytep);
4255               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4256                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4257               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4258                 mask |= CODING_CATEGORY_MASK_BINARY;
4259               if (mask & priorities[i])
4260                 return priorities[i];
4261             }
4262           return CODING_CATEGORY_MASK_RAW_TEXT;
4263         }
4264       if (try & CODING_CATEGORY_MASK_ISO)
4265         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4266       if (try & CODING_CATEGORY_MASK_SJIS)
4267         mask |= detect_coding_sjis (src, src_end, multibytep);
4268       if (try & CODING_CATEGORY_MASK_BIG5)
4269         mask |= detect_coding_big5 (src, src_end, multibytep);
4270       if (try & CODING_CATEGORY_MASK_UTF_8)
4271         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4272       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4273         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4274       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4275         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4276       if (try & CODING_CATEGORY_MASK_CCL)
4277         mask |= detect_coding_ccl (src, src_end, multibytep);
4278     }
4279   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4280 }
4281
4282 /* Guess how the SRC_BYTES bytes of text at SRC are encoded and set
4283    up CODING for the coding system that was found.  */
4284
4285 void
4286 detect_coding (coding, src, src_bytes)
4287      struct coding_system *coding;
4288      const unsigned char *src;
4289      int src_bytes;
4290 {
4291   unsigned int category = 0;
4292   int heading_ascii, detected;
4293   int saved_src_multibyte, saved_dst_multibyte;
4294   Lisp_Object found, subsidiaries;
4295
4296   detected = detect_coding_mask (src, src_bytes, coding_priorities,
4297                                  &heading_ascii, coding->src_multibyte);
4298   coding->heading_ascii = heading_ascii;
4299   if (detected == 0)
4300     return;
4301
4302   /* DETECTED names the category of the highest priority.  Turn the
4303      position of its lowest bit set into a category index.  */
4304   for (; detected != 0 && (detected & 1) == 0; detected >>= 1)
4305     category++;
4306   if (detected == 0)
4307     category = CODING_CATEGORY_IDX_RAW_TEXT;
4308
4309   /* The category symbol's value is the coding system assigned to it.  */
4310   found = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[category]);
4311
4312   /* If the end-of-line type of CODING is already decided, use the
4313      corresponding subsidiary of the coding system found.  */
4314   if (coding->eol_type != CODING_EOL_UNDECIDED)
4315     {
4316       subsidiaries = Fget (found, Qeol_type);
4317       if (VECTORP (subsidiaries))
4318         found = XVECTOR (subsidiaries)->contents[coding->eol_type];
4319     }
4320
4321   /* Setting up the new coding system clears CODING, so carry the
4322      multibyte flags and the count of heading ASCII across the call.  */
4323   saved_src_multibyte = coding->src_multibyte;
4324   saved_dst_multibyte = coding->dst_multibyte;
4325   setup_coding_system (found, coding);
4326   coding->src_multibyte = saved_src_multibyte;
4327   coding->dst_multibyte = saved_dst_multibyte;
4328   coding->heading_ascii = heading_ascii;
4329 }
4330
4331 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4332    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4333    CODING_EOL_CR, CODING_EOL_INCONSISTENT (two different formats were
        seen), and CODING_EOL_UNDECIDED (no end-of-line was seen).  At
        most MAX_EOL_CHECK_COUNT end-of-lines are examined.
4334
4335    How many non-eol characters are at the head is returned as *SKIP,
        i.e. the byte offset of the first end-of-line, or SRC_BYTES if
        there is no end-of-line at all.  */
4336
4337 #define MAX_EOL_CHECK_COUNT 3
4338
4339 static int
4340 detect_eol_type (source, src_bytes, skip)
4341      unsigned char *source;
4342      int src_bytes, *skip;
4343 {
4344   unsigned char *src = source, *src_end = src + src_bytes;
4345   unsigned char c;
4346   int total = 0;                /* How many end-of-lines are found so far.  */
4347   int eol_type = CODING_EOL_UNDECIDED;
4348   int this_eol_type;
4349
4350   *skip = 0;
4351
4352   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4353     {
4354       c = *src++;
4355       if (c == '\n' || c == '\r')
4356         {
               /* Record the offset of the first end-of-line only.  TOTAL,
                  not *SKIP, tells whether this is the first one: an
                  end-of-line at offset 0 legitimately yields *SKIP == 0
                  and must not be overwritten by a later one.  */
4357           if (total == 0)
4358             *skip = src - 1 - source;
4359           total++;
               /* A CR counts as CRLF only when an LF follows it inside
                  the text; a CR that is the last byte counts as CR.  */
4360           if (c == '\n')
4361             this_eol_type = CODING_EOL_LF;
4362           else if (src >= src_end || *src != '\n')
4363             this_eol_type = CODING_EOL_CR;
4364           else
4365             this_eol_type = CODING_EOL_CRLF, src++;
4366
4367           if (eol_type == CODING_EOL_UNDECIDED)
4368             /* This is the first end-of-line.  */
4369             eol_type = this_eol_type;
4370           else if (eol_type != this_eol_type)
4371             {
4372               /* The found type is different from what found before.  */
4373               eol_type = CODING_EOL_INCONSISTENT;
4374               break;
4375             }
4376         }
4377     }
4378
       /* No end-of-line was found: the whole text is non-eol.  */
4379   if (total == 0)
4380     *skip = src_end - source;
4381   return eol_type;
4382 }
4383
4384 /* Like detect_eol_type, but detect EOL type in 2-octet
4385    big-endian/little-endian format for coding systems utf-16-be and
4386    utf-16-le.  BIG_ENDIAN_P nonzero means the first octet of each
        pair is the most significant one.  *SKIP is a byte offset, as in
        detect_eol_type.  A trailing odd octet is never examined.  */
4387
4388 static int
4389 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4390      unsigned char *source;
4391      int src_bytes, *skip, big_endian_p;
4392 {
4393   unsigned char *src = source, *src_end = src + src_bytes;
4394   unsigned int c1, c2;
4395   int total = 0;                /* How many end-of-lines are found so far.  */
4396   int eol_type = CODING_EOL_UNDECIDED;
4397   int this_eol_type;
4398   int msb, lsb;
4399
       /* Indices, within a pair, of the high and low octet.  */
4400   if (big_endian_p)
4401     msb = 0, lsb = 1;
4402   else
4403     msb = 1, lsb = 0;
4404
4405   *skip = 0;
4406
4407   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4408     {
4409       c1 = (src[msb] << 8) | (src[lsb]);
4410       src += 2;
4411
4412       if (c1 == '\n' || c1 == '\r')
4413         {
               /* Record the offset of the first end-of-line only.  TOTAL,
                  not *SKIP, tells whether this is the first one: an
                  end-of-line at offset 0 legitimately yields *SKIP == 0
                  and must not be overwritten by a later one.  */
4414           if (total == 0)
4415             *skip = src - 2 - source;
4416           total++;
4417           if (c1 == '\n')
4418             {
4419               this_eol_type = CODING_EOL_LF;
4420             }
4421           else
4422             {
                   /* A CR is CRLF only if a complete LF pair follows.  */
4423               if ((src + 1) >= src_end)
4424                 {
4425                   this_eol_type = CODING_EOL_CR;
4426                 }
4427               else
4428                 {
4429                   c2 = (src[msb] << 8) | (src[lsb]);
4430                   if (c2 == '\n')
4431                     this_eol_type = CODING_EOL_CRLF, src += 2;
4432                   else
4433                     this_eol_type = CODING_EOL_CR;
4434                 }
4435             }
4436
4437           if (eol_type == CODING_EOL_UNDECIDED)
4438             /* This is the first end-of-line.  */
4439             eol_type = this_eol_type;
4440           else if (eol_type != this_eol_type)
4441             {
4442               /* The found type is different from what found before.  */
4443               eol_type = CODING_EOL_INCONSISTENT;
4444               break;
4445             }
4446         }
4447     }
4448
       /* No end-of-line was found: the whole text is non-eol.  */
4449   if (total == 0)
4450     *skip = src_end - source;
4451   return eol_type;
4452 }
4453
4454 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4455    is encoded.  If it detects an appropriate format of end-of-line, it
4456    sets the information in *CODING.
        CODING->heading_ascii is lowered to the skip count of the EOL
        scan when that count is smaller.  If no end-of-line is found,
        nothing else is changed.  An inconsistent format is handled as
        CODING_EOL_LF.  */
4457
4458 void
4459 detect_eol (coding, src, src_bytes)
4460      struct coding_system *coding;
4461      const unsigned char *src;
4462      int src_bytes;
4463 {
4464   Lisp_Object val;
4465   int skip;
4466   int eol_type;
4467
       /* The two UTF-16 categories are scanned in 2-octet units, with the
          last argument selecting big-endian (1) or little-endian (0);
          everything else is scanned byte by byte.  */
4468   switch (coding->category_idx)
4469     {
4470     case CODING_CATEGORY_IDX_UTF_16_BE:
4471       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4472       break;
4473     case CODING_CATEGORY_IDX_UTF_16_LE:
4474       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4475       break;
4476     default:
4477       eol_type = detect_eol_type (src, src_bytes, &skip);
4478       break;
4479     }
4480
       /* Make SKIP and CODING->heading_ascii both the smaller of the two.  */
4481   if (coding->heading_ascii > skip)
4482     coding->heading_ascii = skip;
4483   else
4484     skip = coding->heading_ascii;
4485
4486   if (eol_type == CODING_EOL_UNDECIDED)
4487     return;
4488   if (eol_type == CODING_EOL_INCONSISTENT)
4489     {
4490 #if 0
4491       /* This code is suppressed until we find a better way to
4492          distinguish raw text file and binary file.  */
4493
4494       /* If we have already detected that the coding is raw-text, the
4495          coding should actually be no-conversion.  */
4496       if (coding->type == coding_type_raw_text)
4497         {
4498           setup_coding_system (Qno_conversion, coding);
4499           return;
4500         }
4501       /* Else, let's decode only text code anyway.  */
4502 #endif /* 0 */
4503       eol_type = CODING_EOL_LF;
4504     }
4505
       /* Switch to the element for EOL_TYPE in the Qeol_type property of
          the current coding system symbol, provided that property is a
          3-element vector.  The slots saved here are put back after
          setup_coding_system.  */
4506   val = Fget (coding->symbol, Qeol_type);
4507   if (VECTORP (val) && XVECTOR (val)->size == 3)
4508     {
4509       int src_multibyte = coding->src_multibyte;
4510       int dst_multibyte = coding->dst_multibyte;
4511       struct composition_data *cmp_data = coding->cmp_data;
4512
4513       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4514       coding->src_multibyte = src_multibyte;
4515       coding->dst_multibyte = dst_multibyte;
4516       coding->heading_ascii = skip;
4517       coding->cmp_data = cmp_data;
4518     }
4519 }
4520
     /* Slack (bytes) added by decoding_buffer_size and
        encoding_buffer_size to their estimates.  */
4521 #define CONVERSION_BUFFER_EXTRA_ROOM 256
4522
     /* Factor applied to the source size when estimating the size of
        decoded text for CODING (a struct coding_system pointer): 3 for
        ISO 2022, the CCL decoder's buf_magnification for CCL, and 2
        otherwise.  CODING is evaluated more than once, so it must not
        have side effects.  */
4523 #define DECODING_BUFFER_MAG(coding)                     \
4524   ((coding)->type == coding_type_iso2022                \
4525    ? 3                                                  \
4526    : ((coding)->type == coding_type_ccl                 \
4527       ? (coding)->spec.ccl.decoder.buf_magnification    \
4528       : 2))
4529
4530 /* Return maximum size (bytes) of a buffer enough for decoding
4531    SRC_BYTES of text encoded in CODING.  The value is SRC_BYTES times
        DECODING_BUFFER_MAG (CODING) plus CONVERSION_BUFFER_EXTRA_ROOM.
        NOTE(review): the product is computed in `int' with no overflow
        check; presumably callers keep SRC_BYTES small enough -- confirm.  */
4532
4533 int
4534 decoding_buffer_size (coding, src_bytes)
4535      struct coding_system *coding;
4536      int src_bytes;
4537 {
4538   return (src_bytes * DECODING_BUFFER_MAG (coding)
4539           + CONVERSION_BUFFER_EXTRA_ROOM);
4540 }
4541
4542 /* Return maximum size (bytes) of a buffer enough for encoding
4543    SRC_BYTES of text to CODING.  The value is SRC_BYTES times a
        magnification factor plus CONVERSION_BUFFER_EXTRA_ROOM.  */
4544
4545 int
4546 encoding_buffer_size (coding, src_bytes)
4547      struct coding_system *coding;
4548      int src_bytes;
4549 {
4550   int magnification;
4551
4552   if (coding->type == coding_type_ccl)
4553     {
           /* Use the factor recorded for the CCL encoder, doubled when
              the EOL format is CRLF.  */
4554       magnification = coding->spec.ccl.encoder.buf_magnification;
4555       if (coding->eol_type == CODING_EOL_CRLF)
4556         magnification *= 2;
4557     }
4558   else if (CODING_REQUIRE_ENCODING (coding))
4559     magnification = 3;
4560   else
         /* No encoding is required: the estimate is the source size
            plus the extra room.  */
4561     magnification = 1;
4562
4563   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4564 }
4565
4566 /* Working buffer for code conversion.  Set up by
        allocate_conversion_buffer, grown by extend_conversion_buffer and
        released by free_conversion_buffer.  */
4567 struct conversion_buffer
4568 {
4569   int size;                     /* Number of bytes allocated at DATA.  */
4570   int on_stack;                 /* 1 if allocated by alloca, 0 if on the heap.  */
4571   unsigned char *data;          /* The buffer itself.  */
4572 };
4573
4574 /* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
        A request below MAX_ALLOCA uses alloca, so the memory belongs to
        the stack frame of the function that expands this macro; a larger
        request uses xmalloc.  BUF must be an lvalue of type struct
        conversion_buffer (not a pointer).  LEN is evaluated more than
        once, so it must not have side effects.  */
4575 #define allocate_conversion_buffer(buf, len)            \
4576   do {                                                  \
4577     if ((len) < MAX_ALLOCA)                             \
4578       {                                                 \
4579         (buf).data = (unsigned char *) alloca (len);    \
4580         (buf).on_stack = 1;                             \
4581       }                                                 \
4582     else                                                \
4583       {                                                 \
4584         (buf).data = (unsigned char *) xmalloc (len);   \
4585         (buf).on_stack = 0;                             \
4586       }                                                 \
4587     (buf).size = (len);                                 \
4588   } while (0)
4589
4590 /* Double the allocated memory for *BUF.  The first BUF->size bytes
        of the data are kept.  A buffer that was on the stack is moved to
        the heap and is marked as no longer on the stack.
        NOTE(review): BUF->size * 2 is computed in `int' with no overflow
        check -- confirm callers bound the size.  */
4591 static void
4592 extend_conversion_buffer (buf)
4593      struct conversion_buffer *buf;
4594 {
4595   if (buf->on_stack)
4596     {
           /* alloca'ed memory cannot be resized: get a heap block and
              copy the old contents into it.  The old block is simply
              left behind.  */
4597       unsigned char *save = buf->data;
4598       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4599       bcopy (save, buf->data, buf->size);
4600       buf->on_stack = 0;
4601     }
4602   else
4603     {
4604       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4605     }
4606   buf->size *= 2;
4607 }
4608
4609 /* Free the allocated memory for BUF if it is not on stack.  BUF->data
        is not cleared, so BUF must not be used again without a new
        allocation.  */
4610 static void
4611 free_conversion_buffer (buf)
4612      struct conversion_buffer *buf;
4613 {
4614   if (!buf->on_stack)
4615     xfree (buf->data);
4616 }
4617
     /* Run the CCL encoder (ENCODEP nonzero) or decoder (ENCODEP zero) of
        CODING on SRC_BYTES bytes at SOURCE, writing to DESTINATION.
        Bytes left in CODING->spec.ccl.eight_bit_carryover by a previous
        call are put at the head of DESTINATION first.  Sets
        CODING->produced, CODING->produced_char and CODING->result, passes
        CODING->consumed to ccl_driver to be filled in, and returns
        CODING->result.  */
4618 int
4619 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4620      struct coding_system *coding;
4621      unsigned char *source, *destination;
4622      int src_bytes, dst_bytes, encodep;
4623 {
4624   struct ccl_program *ccl
4625     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4626   unsigned char *dst = destination;
4627
       /* Copy the per-call settings from CODING into the CCL program.  */
4628   ccl->suppress_error = coding->suppress_error;
4629   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4630   if (encodep)
4631     {
4632       /* On encoding, EOL format is converted within ccl_driver.  For
4633          that, setup proper information in the structure CCL.  */
4634       ccl->eol_type = coding->eol_type;
4635       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4636         ccl->eol_type = CODING_EOL_LF;
4637       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4638       ccl->eight_bit_control = coding->dst_multibyte;
4639     }
4640   else
4641     ccl->eight_bit_control = 1;
4642   ccl->multibyte = coding->src_multibyte;
4643   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4644     {
4645       /* Move carryover bytes to DESTINATION.  The carryover array is
              zero-terminated; it is emptied once copied.  */
4646       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4647       while (*p)
4648         *dst++ = *p++;
4649       coding->spec.ccl.eight_bit_carryover[0] = 0;
           /* A DST_BYTES of zero is left as it is; otherwise the room
              taken by the carryover bytes is subtracted.  */
4650       if (dst_bytes)
4651         dst_bytes -= dst - destination;
4652     }
4653
       /* The byte count returned by ccl_driver does not include the
          carryover bytes copied above, hence the DST - DESTINATION
          term.  */
4654   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4655                                   &(coding->consumed))
4656                       + dst - destination);
4657
4658   if (encodep)
4659     {
4660       coding->produced_char = coding->produced;
4661       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4662     }
       /* eight_bit_control was set to 1 before the decoder ran, so it is
          zero here only if it was cleared in the meantime (presumably
          by ccl_driver -- confirm).  */
4663   else if (!ccl->eight_bit_control)
4664     {
4665       /* The produced bytes forms a valid multibyte sequence. */
4666       coding->produced_char
4667         = multibyte_chars_in_text (destination, coding->produced);
4668       coding->spec.ccl.eight_bit_carryover[0] = 0;
4669     }
4670   else
4671     {
4672       /* On decoding, the destination should always multibyte.  But,
4673          CCL program might have been generated an invalid multibyte
4674          sequence.  Here we make such a sequence valid as
4675          multibyte.  */
           /* BYTES is the room passed to str_as_multibyte below.
              NOTE(review): when DST_BYTES is zero it is derived from
              the distance between DESTINATION and the consumed end of
              SOURCE, which presumably assumes both point into the same
              buffer -- confirm.  */
4676       int bytes
4677         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4678
4679       if ((coding->consumed < src_bytes
4680            || !ccl->last_block)
4681           && coding->produced >= 1
4682           && destination[coding->produced - 1] >= 0x80)
4683         {
4684           /* We should not convert the tailing 8-bit codes to
4685              multibyte form even if they doesn't form a valid
4686              multibyte sequence.  They may form a valid sequence in
4687              the next call.  */
4688           int carryover = 0;
4689
               /* Look back at most three bytes for a byte in the range
                  0x80..0x9F; if one is found with only bytes >= 0x80
                  after it, carry over everything from that byte on.  */
4690           if (destination[coding->produced - 1] < 0xA0)
4691             carryover = 1;
4692           else if (coding->produced >= 2)
4693             {
4694               if (destination[coding->produced - 2] >= 0x80)
4695                 {
4696                   if (destination[coding->produced - 2] < 0xA0)
4697                     carryover = 2;
4698                   else if (coding->produced >= 3
4699                            && destination[coding->produced - 3] >= 0x80
4700                            && destination[coding->produced - 3] < 0xA0)
4701                     carryover = 3;
4702                 }
4703             }
4704           if (carryover > 0)
4705             {
                   /* Save the tail, zero-terminated, for the next call
                      and drop it from the produced count.  */
4706               BCOPY_SHORT (destination + coding->produced - carryover,
4707                            coding->spec.ccl.eight_bit_carryover,
4708                            carryover);
4709               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4710               coding->produced -= carryover;
4711             }
4712         }
4713       coding->produced = str_as_multibyte (destination, bytes,
4714                                            coding->produced,
4715                                            &(coding->produced_char));
4716     }
4717
       /* Translate the final CCL status into a CODING_FINISH_* result.  */
4718   switch (ccl->status)
4719     {
4720     case CCL_STAT_SUSPEND_BY_SRC:
4721       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4722       break;
4723     case CCL_STAT_SUSPEND_BY_DST:
4724       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4725       break;
4726     case CCL_STAT_QUIT:
4727     case CCL_STAT_INVALID_CMD:
4728       coding->result = CODING_FINISH_INTERRUPT;
4729       break;
4730     default:
4731       coding->result = CODING_FINISH_NORMAL;
4732       break;
4733     }
4734   return coding->result;
4735 }
4736
4737 /* Decode EOL format of the text at PTR of BYTES length destructively
4738    according to CODING->eol_type.  This is called after the CCL
4739    program produced a decoded text at PTR.  If we do CRLF->LF
4740    conversion, update CODING->produced and CODING->produced_char.
        BYTES may be zero, in which case no byte is examined.

        If CODING->eol_type is undecided it is detected from the text
        first.  With CRLF conversion, a CR at the very end of a block
        that is not the last one is held back: it is dropped from the
        produced counts and CODING->spec.ccl.cr_carryover is set.  If
        CODING_MODE_INHIBIT_INCONSISTENT_EOL is set and an end-of-line of
        another format shows up, the conversion done so far is undone
        and CODING->eol_type becomes CODING_EOL_LF.  */
4741
4742 static void
4743 decode_eol_post_ccl (coding, ptr, bytes)
4744      struct coding_system *coding;
4745      unsigned char *ptr;
4746      int bytes;
4747 {
4748   Lisp_Object val, saved_coding_symbol;
4749   unsigned char *pend = ptr + bytes;
4750   int dummy;
4751
4752   /* Remember the current coding system symbol.  We set it back when
4753      an inconsistent EOL is found so that `last-coding-system-used' is
4754      set to the coding system that doesn't specify EOL conversion.  */
4755   saved_coding_symbol = coding->symbol;
4756
4757   coding->spec.ccl.cr_carryover = 0;
4758   if (coding->eol_type == CODING_EOL_UNDECIDED)
4759     {
4760       /* Here, to avoid the call of setup_coding_system, we directly
4761          call detect_eol_type.  */
4762       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4763       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4764         coding->eol_type = CODING_EOL_LF;
4765       if (coding->eol_type != CODING_EOL_UNDECIDED)
4766         {
4767           val = Fget (coding->symbol, Qeol_type);
4768           if (VECTORP (val) && XVECTOR (val)->size == 3)
4769             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4770         }
4771       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4772     }
4773
4774   if (coding->eol_type == CODING_EOL_LF
4775       || coding->eol_type == CODING_EOL_UNDECIDED)
4776     {
4777       /* We have nothing to do.  */
4778       ptr = pend;
4779     }
4780   else if (coding->eol_type == CODING_EOL_CRLF)
4781     {
           /* PTR reads the original text and P writes the converted
              text, both within the same memory; P never passes PTR.  */
4782       unsigned char *pstart = ptr, *p = ptr;
4783
           /* The PTR < PEND test keeps us from reading the byte before
              the text (and from making the produced counts negative)
              when BYTES is zero.  */
4784       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4785           && ptr < pend && *(pend - 1) == '\r')
4786         {
4787           /* If the last character is CR, we can't handle it here
4788              because LF will be in the not-yet-decoded source text.
4789              Record that the CR is not yet processed.  */
4790           coding->spec.ccl.cr_carryover = 1;
4791           coding->produced--;
4792           coding->produced_char--;
4793           pend--;
4794         }
4795       while (ptr < pend)
4796         {
4797           if (*ptr == '\r')
4798             {
4799               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4800                 {
4801                   *p++ = '\n';
4802                   ptr += 2;
4803                 }
4804               else
4805                 {
                       /* A CR that is not followed by LF.  */
4806                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4807                     goto undo_eol_conversion;
4808                   *p++ = *ptr++;
4809                 }
4810             }
               /* An LF that is not preceded by CR.  */
4811           else if (*ptr == '\n'
4812                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4813             goto undo_eol_conversion;
4814           else
4815             *p++ = *ptr++;
4816           continue;
4817
4818         undo_eol_conversion:
4819           /* We have faced with inconsistent EOL format at PTR.
4820              Convert all LFs before PTR back to CRLFs.  Every LF
                  written so far replaced a two-byte CRLF, so expanding
                  backwards from PTR restores the original bytes.  */
4821           for (p--, ptr--; p >= pstart; p--)
4822             {
4823               if (*p == '\n')
4824                 *ptr-- = '\n', *ptr-- = '\r';
4825               else
4826                 *ptr-- = *p;
4827             }
4828           /*  If carryover is recorded, cancel it because we don't
4829               convert CRLF anymore.  */
4830           if (coding->spec.ccl.cr_carryover)
4831             {
4832               coding->spec.ccl.cr_carryover = 0;
4833               coding->produced++;
4834               coding->produced_char++;
4835               pend++;
4836             }
               /* Setting both pointers to PEND ends the loop and makes
                  the count adjustment below a no-op.  */
4837           p = ptr = pend;
4838           coding->eol_type = CODING_EOL_LF;
4839           coding->symbol = saved_coding_symbol;
4840         }
4841       if (p < pend)
4842         {
4843           /* As each two-byte sequence CRLF was converted to LF, (PEND
4844              - P) is the number of deleted characters.  */
4845           coding->produced -= pend - p;
4846           coding->produced_char -= pend - p;
4847         }
4848     }
4849   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4850     {
4851       unsigned char *p = ptr;
4852
4853       for (; ptr < pend; ptr++)
4854         {
4855           if (*ptr == '\r')
4856             *ptr = '\n';
4857           else if (*ptr == '\n'
4858                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4859             {
                   /* Inconsistent EOL: turn the LFs before PTR back
                      into CRs and stop converting.  */
4860               for (; p < ptr; p++)
4861                 {
4862                   if (*p == '\n')
4863                     *p = '\r';
4864                 }
4865               ptr = pend;
4866               coding->eol_type = CODING_EOL_LF;
4867               coding->symbol = saved_coding_symbol;
4868             }
4869         }
4870     }
4871 }
4872
4873 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4874    decoding, it may detect coding system and format of end-of-line if
4875    those are not yet decided.  The source should be unibyte, the
4876    result is multibyte if CODING->dst_multibyte is nonzero, else
4877    unibyte.  The return value is CODING->result.  */
4878
4879 int
4880 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4881      struct coding_system *coding;
4882      const unsigned char *source;
4883      unsigned char *destination;
4884      int src_bytes, dst_bytes;
4885 {
       /* Number of bytes put at the head of DESTINATION for a CR carried
          over from the previous CCL call.  */
4886   int extra = 0;
4887
4888   if (coding->type == coding_type_undecided)
4889     detect_coding (coding, source, src_bytes);
4890
       /* For CCL the EOL format is left to decode_eol_post_ccl.  */
4891   if (coding->eol_type == CODING_EOL_UNDECIDED
4892       && coding->type != coding_type_ccl)
4893     {
4894       detect_eol (coding, source, src_bytes);
4895       /* We had better recover the original eol format if we
4896          encounter an inconsistent eol format while decoding.  */
4897       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4898     }
4899
       /* Reset the per-call counters and the result.  */
4900   coding->produced = coding->produced_char = 0;
4901   coding->consumed = coding->consumed_char = 0;
4902   coding->errors = 0;
4903   coding->result = CODING_FINISH_NORMAL;
4904
       /* The last argument of decode_coding_sjis_big5 is 1 for SJIS and
          0 for BIG5.  */
4905   switch (coding->type)
4906     {
4907     case coding_type_sjis:
4908       decode_coding_sjis_big5 (coding, source, destination,
4909                                src_bytes, dst_bytes, 1);
4910       break;
4911
4912     case coding_type_iso2022:
4913       decode_coding_iso2022 (coding, source, destination,
4914                              src_bytes, dst_bytes);
4915       break;
4916
4917     case coding_type_big5:
4918       decode_coding_sjis_big5 (coding, source, destination,
4919                                src_bytes, dst_bytes, 0);
4920       break;
4921
4922     case coding_type_emacs_mule:
4923       decode_coding_emacs_mule (coding, source, destination,
4924                                 src_bytes, dst_bytes);
4925       break;
4926
4927     case coding_type_ccl:
4928       if (coding->spec.ccl.cr_carryover)
4929         {
4930           /* Put the CR which was not processed by the previous call
4931              of decode_eol_post_ccl in DESTINATION.  It will be
4932              decoded together with the following LF by the call to
4933              decode_eol_post_ccl below.  */
4934           *destination = '\r';
4935           coding->produced++;
4936           coding->produced_char++;
4937           dst_bytes--;
4938           extra = coding->spec.ccl.cr_carryover;
4939         }
           /* ccl_coding_driver assigns CODING->produced itself, so the
              carried-over CR is counted by adding EXTRA afterwards.  */
4940       ccl_coding_driver (coding, source, destination + extra,
4941                          src_bytes, dst_bytes, 0);
4942       if (coding->eol_type != CODING_EOL_LF)
4943         {
4944           coding->produced += extra;
4945           coding->produced_char += extra;
4946           decode_eol_post_ccl (coding, destination, coding->produced);
4947         }
4948       break;
4949
4950     default:
4951       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4952     }
4953
       /* Running out of source in the last block is normal when all the
          source has been consumed.  */
4954   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4955       && coding->mode & CODING_MODE_LAST_BLOCK
4956       && coding->consumed == src_bytes)
4957     coding->result = CODING_FINISH_NORMAL;
4958
       /* Otherwise the last block ends with bytes that could not be
          decoded.  Count one error and pass each remaining byte through
          CHAR_STRING.
          NOTE(review): this loop does not consult DST_BYTES; presumably
          callers leave enough slack in DESTINATION -- confirm.  */
4959   if (coding->mode & CODING_MODE_LAST_BLOCK
4960       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4961     {
4962       const unsigned char *src = source + coding->consumed;
4963       unsigned char *dst = destination + coding->produced;
4964
4965       src_bytes -= coding->consumed;
4966       coding->errors++;
4967       if (COMPOSING_P (coding))
4968         DECODE_COMPOSITION_END ('1');
4969       while (src_bytes--)
4970         {
4971           int c = *src++;
4972           dst += CHAR_STRING (c, dst);
4973           coding->produced_char++;
4974         }
4975       coding->consumed = coding->consumed_char = src - source;
4976       coding->produced = dst - destination;
4977       coding->result = CODING_FINISH_NORMAL;
4978     }
4979
       /* A unibyte destination was requested: convert the result in
          place; after that one byte counts as one character.  */
4980   if (!coding->dst_multibyte)
4981     {
4982       coding->produced = str_as_unibyte (destination, coding->produced);
4983       coding->produced_char = coding->produced;
4984     }
4985
4986   return coding->result;
4987 }
4988
4989 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4990    multibyteness of the source is CODING->src_multibyte, the
4991    multibyteness of the result is always unibyte.  An undecided EOL
        format is fixed to CODING_EOL_LF in CODING.  The return value is
        CODING->result.  */
4992
4993 int
4994 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4995      struct coding_system *coding;
4996      const unsigned char *source;
4997      unsigned char *destination;
4998      int src_bytes, dst_bytes;
4999 {
       /* Reset the per-call counters and the result.  */
5000   coding->produced = coding->produced_char = 0;
5001   coding->consumed = coding->consumed_char = 0;
5002   coding->errors = 0;
5003   coding->result = CODING_FINISH_NORMAL;
5004   if (coding->eol_type == CODING_EOL_UNDECIDED)
5005     coding->eol_type = CODING_EOL_LF;
5006
       /* The last argument of encode_coding_sjis_big5 is 1 for SJIS and
          0 for BIG5; the last argument of ccl_coding_driver selects the
          encoder.  */
5007   switch (coding->type)
5008     {
5009     case coding_type_sjis:
5010       encode_coding_sjis_big5 (coding, source, destination,
5011                                src_bytes, dst_bytes, 1);
5012       break;
5013
5014     case coding_type_iso2022:
5015       encode_coding_iso2022 (coding, source, destination,
5016                              src_bytes, dst_bytes);
5017       break;
5018
5019     case coding_type_big5:
5020       encode_coding_sjis_big5 (coding, source, destination,
5021                                src_bytes, dst_bytes, 0);
5022       break;
5023
5024     case coding_type_emacs_mule:
5025       encode_coding_emacs_mule (coding, source, destination,
5026                                 src_bytes, dst_bytes);
5027       break;
5028
5029     case coding_type_ccl:
5030       ccl_coding_driver (coding, source, destination,
5031                          src_bytes, dst_bytes, 1);
5032       break;
5033
5034     default:
5035       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5036     }
5037
       /* The last block ran out of source.  Finish the output: reset the
          ISO 2022 state if applicable, close a pending composition, and
          copy whatever source is left without encoding it.
          NOTE(review): nothing here consults DST_BYTES; presumably
          callers leave enough slack in DESTINATION -- confirm.  */
5038   if (coding->mode & CODING_MODE_LAST_BLOCK
5039       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5040     {
5041       const unsigned char *src = source + coding->consumed;
5042       unsigned char *dst = destination + coding->produced;
5043
5044       if (coding->type == coding_type_iso2022)
5045         ENCODE_RESET_PLANE_AND_REGISTER;
5046       if (COMPOSING_P (coding))
5047         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5048       if (coding->consumed < src_bytes)
5049         {
5050           int len = src_bytes - coding->consumed;
5051
5052           BCOPY_SHORT (src, dst, len);
               /* For a multibyte source the copied bytes are converted
                  to unibyte in place, and LEN becomes the value
                  returned by str_as_unibyte.  */
5053           if (coding->src_multibyte)
5054             len = str_as_unibyte (dst, len);
5055           dst += len;
5056           coding->consumed = src_bytes;
5057         }
5058       coding->produced = coding->produced_char = dst - destination;
5059       coding->result = CODING_FINISH_NORMAL;
5060     }
5061
       /* Running out of source is normal when all of it was consumed.  */
5062   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5063       && coding->consumed == src_bytes)
5064     coding->result = CODING_FINISH_NORMAL;
5065
5066   return coding->result;
5067 }
5068
5069 /* Scan text in the region between *BEG and *END (byte positions),
5070    skip characters which we don't have to decode by coding system
5071    CODING at the head and tail, then set *BEG and *END to the region
5072    of the text we actually have to convert.  The caller should move
5073    the gap out of the region in advance if the region is from a
5074    buffer.
5075
5076    If STR is not NULL, *BEG and *END are indices into STR.  */
5077
5078 static void
5079 shrink_decoding_region (beg, end, coding, str)
5080      int *beg, *end;
5081      struct coding_system *coding;
5082      unsigned char *str;
5083 {
5084   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5085   int eol_conversion;
5086   Lisp_Object translation_table;
5087
5088   if (coding->type == coding_type_ccl
5089       || coding->type == coding_type_undecided
5090       || coding->eol_type != CODING_EOL_LF
5091       || !NILP (coding->post_read_conversion)
5092       || coding->composing != COMPOSITION_DISABLED)
5093     {
5094       /* We can't skip any data.  */
5095       return;
5096     }
5097   if (coding->type == coding_type_no_conversion
5098       || coding->type == coding_type_raw_text
5099       || coding->type == coding_type_emacs_mule)
5100     {
5101       /* We need no conversion, but don't have to skip any data here.
5102          Decoding routine handles them effectively anyway.  */
5103       return;
5104     }
5105
5106   translation_table = coding->translation_table_for_decode;
5107   if (NILP (translation_table) && !NILP (Venable_character_translation))
5108     translation_table = Vstandard_translation_table_for_decode;
5109   if (CHAR_TABLE_P (translation_table))
5110     {
5111       int i;
5112       for (i = 0; i < 128; i++)
5113         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5114           break;
5115       if (i < 128)
5116         /* Some ASCII character should be translated.  We give up
5117            shrinking.  */
5118         return;
5119     }
5120
5121   if (coding->heading_ascii >= 0)
5122     /* Detection routine has already found how much we can skip at the
5123        head.  */
5124     *beg += coding->heading_ascii;
5125
5126   if (str)
5127     {
5128       begp_orig = begp = str + *beg;
5129       endp_orig = endp = str + *end;
5130     }
5131   else
5132     {
5133       begp_orig = begp = BYTE_POS_ADDR (*beg);
5134       endp_orig = endp = begp + *end - *beg;
5135     }
5136
5137   eol_conversion = (coding->eol_type == CODING_EOL_CR
5138                     || coding->eol_type == CODING_EOL_CRLF);
5139
5140   switch (coding->type)
5141     {
5142     case coding_type_sjis:
5143     case coding_type_big5:
5144       /* We can skip all ASCII characters at the head.  */
5145       if (coding->heading_ascii < 0)
5146         {
5147           if (eol_conversion)
5148             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5149           else
5150             while (begp < endp && *begp < 0x80) begp++;
5151         }
5152       /* We can skip all ASCII characters at the tail except for the
5153          second byte of SJIS or BIG5 code.  */
5154       if (eol_conversion)
5155         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5156       else
5157         while (begp < endp && endp[-1] < 0x80) endp--;
5158       /* Do not consider LF as ascii if preceded by CR, since that
5159          confuses eol decoding. */
5160       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5161         endp++;
5162       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5163         endp++;
5164       break;
5165
5166     case coding_type_iso2022:
5167       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5168         /* We can't skip any data.  */
5169         break;
5170       if (coding->heading_ascii < 0)
5171         {
5172           /* We can skip all ASCII characters at the head except for a
5173              few control codes.  */
5174           while (begp < endp && (c = *begp) < 0x80
5175                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5176                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5177                  && (!eol_conversion || c != ISO_CODE_LF))
5178             begp++;
5179         }
5180       switch (coding->category_idx)
5181         {
5182         case CODING_CATEGORY_IDX_ISO_8_1:
5183         case CODING_CATEGORY_IDX_ISO_8_2:
5184           /* We can skip all ASCII characters at the tail.  */
5185           if (eol_conversion)
5186             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5187           else
5188             while (begp < endp && endp[-1] < 0x80) endp--;
5189           /* Do not consider LF as ascii if preceded by CR, since that
5190              confuses eol decoding. */
5191           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5192             endp++;
5193           break;
5194
5195         case CODING_CATEGORY_IDX_ISO_7:
5196         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5197           {
5198             /* We can skip all characters at the tail except for 8-bit
5199                codes and ESC and the following 2-byte at the tail.  */
5200             unsigned char *eight_bit = NULL;
5201
5202             if (eol_conversion)
5203               while (begp < endp
5204                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5205                 {
5206                   if (!eight_bit && c & 0x80) eight_bit = endp;
5207                   endp--;
5208                 }
5209             else
5210               while (begp < endp
5211                      && (c = endp[-1]) != ISO_CODE_ESC)
5212                 {
5213                   if (!eight_bit && c & 0x80) eight_bit = endp;
5214                   endp--;
5215                 }
5216             /* Do not consider LF as ascii if preceded by CR, since that
5217                confuses eol decoding. */
5218             if (begp < endp && endp < endp_orig
5219                 && endp[-1] == '\r' && endp[0] == '\n')
5220               endp++;
5221             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5222               {
5223                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5224                   /* This is an ASCII designation sequence.  We can
5225                      surely skip the tail.  But, if we have
5226                      encountered an 8-bit code, skip only the codes
5227                      after that.  */
5228                   endp = eight_bit ? eight_bit : endp + 2;
5229                 else
5230                   /* Hmmm, we can't skip the tail.  */
5231                   endp = endp_orig;
5232               }
5233             else if (eight_bit)
5234               endp = eight_bit;
5235           }
5236         }
5237       break;
5238
5239     default:
5240       abort ();
5241     }
5242   *beg += begp - begp_orig;
5243   *end += endp - endp_orig;
5244   return;
5245 }
5246
5247 /* Like shrink_decoding_region but for encoding.

        Advance *BEG and pull back *END over the leading and trailing bytes
        below 0x80 of the text that CODING is about to encode, so that the
        caller has to convert only what lies in between.  If STR is
        non-NULL, *BEG and *END are byte offsets into STR; otherwise *BEG
        is resolved with BYTE_POS_ADDR and the region is *END - *BEG bytes
        long.

        *BEG and *END are left untouched when CODING is CCL based, has
        eol-type CR or CRLF, carries composition data, is of a type that
        needs no conversion here, or when the translation table used for
        encoding has a non-nil entry for some ASCII character.  Once past
        those checks, any type other than ISO2022, SJIS or BIG5 aborts.  */
5248
5249 static void
5250 shrink_encoding_region (beg, end, coding, str)
5251      int *beg, *end;
5252      struct coding_system *coding;
5253      unsigned char *str;
5254 {
5255   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5257   Lisp_Object translation_table;
5258
5259   if (coding->type == coding_type_ccl
5260       || coding->eol_type == CODING_EOL_CRLF
5261       || coding->eol_type == CODING_EOL_CR
5262       || (coding->cmp_data && coding->cmp_data->used > 0))
5263     {
5264       /* We can't skip any data.  */
5265       return;
5266     }
5267   if (coding->type == coding_type_no_conversion
5268       || coding->type == coding_type_raw_text
5269       || coding->type == coding_type_emacs_mule
5270       || coding->type == coding_type_undecided)
5271     {
5272       /* We need no conversion, but don't have to skip any data here.
5273          Encoding routine handles them effectively anyway.  */
5274       return;
5275     }
5276
       /* Use the coding system's own table, falling back to the standard
          one only if character translation is enabled.  */
5277   translation_table = coding->translation_table_for_encode;
5278   if (NILP (translation_table) && !NILP (Venable_character_translation))
5279     translation_table = Vstandard_translation_table_for_encode;
5280   if (CHAR_TABLE_P (translation_table))
5281     {
5282       int i;
5283       for (i = 0; i < 128; i++)
5284         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5285           break;
5286       if (i < 128)
5287         /* Some ASCII character should be translated.  We give up
5288            shrinking.  */
5289         return;
5290     }
5291
5292   if (str)
5293     {
5294       begp_orig = begp = str + *beg;
5295       endp_orig = endp = str + *end;
5296     }
5297   else
5298     {
5299       begp_orig = begp = BYTE_POS_ADDR (*beg);
5300       endp_orig = endp = begp + *end - *beg;
5301     }
5302
5306   /* Here, we don't have to check coding->pre_write_conversion because
5307      the caller is expected to have handled it already.  Nor is there
          any eol conversion to care about: eol-types CR and CRLF have
          already returned above, so a newline is skipped like any other
          byte below 0x80.  */
5308   switch (coding->type)
5309     {
5310     case coding_type_iso2022:
5311       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5312         /* We can't skip any data.  */
5313         break;
5314       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5315         {
               /* Skip the head only up to a line boundary: BEGP ends up
                  just after the last newline found among the leading
                  bytes below 0x80, or stays put if there is none.  */
5316           unsigned char *bol = begp;
5317           while (begp < endp && *begp < 0x80)
5318             {
5319               begp++;
5320               if (begp[-1] == '\n')
5321                 bol = begp;
5322             }
5323           begp = bol;
5324           goto label_skip_tail;
5325         }
5326       /* fall down ... */
5327
5328     case coding_type_sjis:
5329     case coding_type_big5:
5330       /* We can skip all ASCII characters at the head and tail.  */
5334       while (begp < endp && *begp < 0x80) begp++;
5335     label_skip_tail:
5339       while (begp < endp && *(endp - 1) < 0x80) endp--;
5340       break;
5341
5342     default:
5343       abort ();
5344     }
5345
       /* Report back how far each end moved.  */
5346   *beg += begp - begp_orig;
5347   *end += endp - endp_orig;
5348   return;
5349 }
5350
5351 /* As shrinking conversion region requires some overhead, we don't try
5352    shrinking unless the length of conversion region is greater than
5353    this value.  */
5354 static int shrink_conversion_region_threshhold = 1024;
5355
     /* Shrink the conversion region delimited by *BEG and *END (BEG and
        END are pointers) for CODING, but only when *END - *BEG exceeds
        shrink_conversion_region_threshhold.  Calls shrink_encoding_region
        if ENCODEP is nonzero, shrink_decoding_region otherwise, passing
        STR through unchanged.  BEG and END appear more than once in the
        expansion, so they should be free of side effects.  */
5356 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
5357   do {                                                                  \
5358     if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
5359       {                                                                 \
5360         if (encodep) shrink_encoding_region (beg, end, coding, str);    \
5361         else shrink_decoding_region (beg, end, coding, str);            \
5362       }                                                                 \
5363   } while (0)
5364
5365 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5366    Vlast_coding_system_used and the remaining elements are buffers to
5367    kill.

        This is the function code_convert_region hands to
        record_unwind_protect around its calls to pre-write-conversion and
        post-read-conversion functions.  It also clears
        inhibit_pre_post_conversion, which those call sites set to 1.
        Always returns Qnil.  */
5368 static Lisp_Object
5369 code_convert_region_unwind (arg)
5370      Lisp_Object arg;
5371 {
5372   struct gcpro gcpro1;
       /* NOTE(review): presumably ARG is protected because Fkill_buffer
          below may trigger garbage collection -- confirm.  */
5373   GCPRO1 (arg);
5374
5375   inhibit_pre_post_conversion = 0;
5376   Vlast_coding_system_used = XCAR (arg);
       /* Kill every buffer listed after the first element.  */
5377   for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5378     Fkill_buffer (XCAR (arg));
5379
5380   UNGCPRO;
5381   return Qnil;
5382 }
5383
5384 /* Store information about all compositions in the range FROM and TO
5385    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5386    buffer or a string, defaults to the current buffer.

        Does nothing if CODING->composing is COMPOSITION_DISABLED.
        Otherwise CODING->cmp_data is allocated if it is still null, even
        when no composition is recorded afterwards.  Positions are stored
        relative to FROM.  When at least one composition has been looked
        at, CODING->composing is set to COMPOSITION_NO and, on the way
        out, CODING->cmp_data is rewound to the first block of the chain
        with CODING->cmp_data_start reset to 0.  */
5387
5388 void
5389 coding_save_composition (coding, from, to, obj)
5390      struct coding_system *coding;
5391      int from, to;
5392      Lisp_Object obj;
5393 {
5394   Lisp_Object prop;
5395   int start, end;
5396
5397   if (coding->composing == COMPOSITION_DISABLED)
5398     return;
5399   if (!coding->cmp_data)
5400     coding_allocate_composition_data (coding, from);
       /* Nothing to save if no composition is found, or if the one found
          extends beyond TO.  */
5401   if (!find_composition (from, to, &start, &end, &prop, obj)
5402       || end > to)
5403     return;
       /* The composition found starts before FROM; retry the search from
          its end, with the same stopping conditions.  */
5404   if (start < from
5405       && (!find_composition (end, to, &start, &end, &prop, obj)
5406           || end > to))
5407     return;
5408   coding->composing = COMPOSITION_NO;
5409   do
5410     {
5411       if (COMPOSITION_VALID_P (start, end, prop))
5412         {
5413           enum composition_method method = COMPOSITION_METHOD (prop);
               /* Ask for more room when the current block might not hold
                  one more record of maximum length.  */
5414           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5415               >= COMPOSITION_DATA_SIZE)
5416             coding_allocate_composition_data (coding, from);
5417           /* For relative composition, we remember start and end
5418              positions, for the other compositions, we also remember
5419              components.  */
5420           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5421           if (method != COMPOSITION_RELATIVE)
5422             {
5423               /* We must store all the components as well.  They come
                      as a list, a vector, a string, or a single integer.  */
5424               Lisp_Object val, ch;
5425
5426               val = COMPOSITION_COMPONENTS (prop);
5427               if (CONSP (val))
5428                 while (CONSP (val))
5429                   {
5430                     ch = XCAR (val), val = XCDR (val);
5431                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5432                   }
5433               else if (VECTORP (val) || STRINGP (val))
5434                 {
5435                   int len = (VECTORP (val)
5436                              ? XVECTOR (val)->size : SCHARS (val));
5437                   int i;
5438                   for (i = 0; i < len; i++)
5439                     {
5440                       ch = (STRINGP (val)
5441                             ? Faref (val, make_number (i))
5442                             : XVECTOR (val)->contents[i]);
5443                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5444                     }
5445                 }
5446               else              /* INTEGERP (val) */
5447                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5448             }
5449           CODING_ADD_COMPOSITION_END (coding, end - from);
5450         }
           /* Continue the search right after this composition.  */
5451       start = end;
5452     }
5453   while (start < to
5454          && find_composition (start, to, &start, &end, &prop, obj)
5455          && end <= to);
5456
5457   /* Make coding->cmp_data point to the first memory block.  */
5458   while (coding->cmp_data->prev)
5459     coding->cmp_data = coding->cmp_data->prev;
5460   coding->cmp_data_start = 0;
5461 }
5462
5463 /* Reflect the saved information about compositions to OBJ.
5464    CODING->cmp_data points to a memory block for the information.  OBJ
5465    is a buffer or a string, defaults to the current buffer.

        Every block of the chain is processed, starting from the first
        one.  As read here, a record occupies data[0] ints: data[1] and
        data[2] are the two positions handed to compose_text, data[3] is
        the composition method, and data[4] onward are the components (none
        are used for COMPOSITION_RELATIVE).  A record that is shorter than
        its four-int header, that runs past the used part of the block, or
        whose component count does not fit the local argument array is
        treated as invalid and ends the processing of that block.  */
5466
5467 void
5468 coding_restore_composition (coding, obj)
5469      struct coding_system *coding;
5470      Lisp_Object obj;
5471 {
5472   struct composition_data *cmp_data = coding->cmp_data;
5473
5474   if (!cmp_data)
5475     return;
5476
       /* Rewind to the first block of the chain.  */
5477   while (cmp_data->prev)
5478     cmp_data = cmp_data->prev;
5479
5480   while (cmp_data)
5481     {
5482       int i;
5483
           /* A non-positive length word also ends the scan of a block.  */
5484       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5485            i += cmp_data->data[i])
5486         {
5487           int *data = cmp_data->data + i;
5488           enum composition_method method;
5489           Lisp_Object components;
5490
               /* Validate the record before reading any of its fields, so
                  that data[1] .. data[3] are known to lie inside the used
                  part of the block.  */
5491           if (data[0] < 4 || i + data[0] > cmp_data->used)
5492             /* Invalid composition data.  */
5493             break;
               method = (enum composition_method) data[3];
5494
5495           if (method == COMPOSITION_RELATIVE)
5496             components = Qnil;
5497           else
5498             {
5499               int len = data[0] - 4, j;
5500               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5501
                   /* With rules, only an odd number of elements is used;
                      drop a trailing extra one.  */
5502               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5503                   && len % 2 == 0)
5504                 len --;
5505               if (len < 1 || len > MAX_COMPOSITION_COMPONENTS * 2 - 1)
5506                 /* Invalid composition data (LEN must fit in ARGS).  */
5507                 break;
5508               for (j = 0; j < len; j++)
5509                 args[j] = make_number (data[4 + j]);
5510               components = (method == COMPOSITION_WITH_ALTCHARS
5511                             ? Fstring (len, args)
5512                             : Fvector (len, args));
5513             }
5514           compose_text (data[1], data[2], components, Qnil, obj);
5515         }
5516       cmp_data = cmp_data->next;
5517     }
5518 }
5519
5520 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5521    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5522    coding system CODING, and return the status code of code conversion
5523    (currently, this value has no meaning; every return below yields 0).
5524
5525    How many characters (and bytes) are converted to how many
5526    characters (and bytes) are recorded in members of the structure
5527    CODING.
5528
5529    If REPLACE is nonzero, we do various things as if the original text
5530    is deleted and a new text is inserted.  See the comments in
5531    replace_range (insdel.c) to know what we are doing.
5532
5533    If REPLACE is zero, it is assumed that the source text is unibyte.
5534    Otherwise, it is assumed that the source text is multibyte.  */
5535
5536 int
5537 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5538      int from, from_byte, to, to_byte, encodep, replace;
5539      struct coding_system *coding;
5540 {
5541   int len = to - from, len_byte = to_byte - from_byte;
5542   int nchars_del = 0, nbytes_del = 0;
5543   int require, inserted, inserted_byte;
       /* HEAD_SKIP and TAIL_SKIP are assigned only together with
          TOTAL_SKIP, and are read afterwards only when TOTAL_SKIP > 0.  */
5544   int head_skip, tail_skip, total_skip = 0;
5545   Lisp_Object saved_coding_symbol;
5546   int first = 1;
5547   unsigned char *src, *dst;
5548   Lisp_Object deletion;
5549   int orig_point = PT, orig_len = len;
5550   int prev_Z;
5551   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5552
5553   deletion = Qnil;
5554   saved_coding_symbol = coding->symbol;
5555
       /* If point is strictly inside the region, park it at FROM.  */
5556   if (from < PT && PT < to)
5557     {
5558       TEMP_SET_PT_BOTH (from, from_byte);
5559       orig_point = from;
5560     }
5561
5562   if (replace)
5563     {
5564       int saved_from = from;
5565       int saved_inhibit_modification_hooks;
5566
           /* FROM is passed by address; if it comes back changed, rebuild
              TO and the byte positions from it.  */
5567       prepare_to_modify_buffer (from, to, &from);
5568       if (saved_from != from)
5569         {
5570           to = from + len;
5571           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5572           len_byte = to_byte - from_byte;
5573         }
5574
5575       /* The code conversion routine can not preserve text properties
5576          for now.  So, we must remove all text properties in the
5577          region.  Here, we must suppress all modification hooks.  */
5578       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5579       inhibit_modification_hooks = 1;
5580       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5581       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5582     }
5583
5584   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5585     {
5586       /* We must detect encoding of text and eol format.  */
5587
           /* The detectors below take a plain pointer and a length, so
              make sure the gap does not lie inside the region.  */
5588       if (from < GPT && to > GPT)
5589         move_gap_both (from, from_byte);
5590       if (coding->type == coding_type_undecided)
5591         {
5592           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5593           if (coding->type == coding_type_undecided)
5594             {
5595               /* It seems that the text contains only ASCII, but we
5596                  should not leave it undecided because the deeper
5597                  decoding routine (decode_coding) tries to detect the
5598                  encodings again in vain.  */
5599               coding->type = coding_type_emacs_mule;
5600               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5601               /* As emacs-mule decoder will handle composition, we
5602                  need this setting to allocate coding->cmp_data
5603                  later.  */
5604               coding->composing = COMPOSITION_NO;
5605             }
5606         }
5607       if (coding->eol_type == CODING_EOL_UNDECIDED
5608           && coding->type != coding_type_ccl)
5609         {
5610           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5611           if (coding->eol_type == CODING_EOL_UNDECIDED)
5612             coding->eol_type = CODING_EOL_LF;
5613           /* We had better recover the original eol format if we
5614              encounter an inconsistent eol format while decoding.  */
5615           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5616         }
5617     }
5618
5619   /* Now we convert the text.  */
5620
5621   /* For encoding, we must process pre-write-conversion in advance.  */
5622   if (! inhibit_pre_post_conversion
5623       && encodep
5624       && SYMBOLP (coding->pre_write_conversion)
5625       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5626     {
5627       /* The function in pre-write-conversion may put a new text in a
5628          new buffer.  */
5629       struct buffer *prev = current_buffer;
5630       Lisp_Object new;
5631
5632       record_unwind_protect (code_convert_region_unwind,
5633                              Fcons (Vlast_coding_system_used, Qnil));
5634       /* We should not call any more pre-write/post-read-conversion
5635          functions while this pre-write-conversion is running.  */
5636       inhibit_pre_post_conversion = 1;
5637       call2 (coding->pre_write_conversion,
5638              make_number (from), make_number (to));
5639       inhibit_pre_post_conversion = 0;
5640       /* Discard the unwind protect.  */
5641       specpdl_ptr--;
5642
5643       if (current_buffer != prev)
5644         {
               /* The function left another buffer current.  Replace the
                  region in the original buffer with LEN characters of that
                  buffer starting at position 1, kill it, then recompute
                  every position that depends on the region's length.  */
5645           len = ZV - BEGV;
5646           new = Fcurrent_buffer ();
5647           set_buffer_internal_1 (prev);
5648           del_range_2 (from, from_byte, to, to_byte, 0);
5649           TEMP_SET_PT_BOTH (from, from_byte);
5650           insert_from_buffer (XBUFFER (new), 1, len, 0);
5651           Fkill_buffer (new);
5652           if (orig_point >= to)
5653             orig_point += len - orig_len;
5654           else if (orig_point > from)
5655             orig_point = from;
5656           orig_len = len;
5657           to = from + len;
5658           from_byte = CHAR_TO_BYTE (from);
5659           to_byte = CHAR_TO_BYTE (to);
5660           len_byte = to_byte - from_byte;
5661           TEMP_SET_PT_BOTH (from, from_byte);
5662         }
5663     }
5664
5665   if (replace)
5666     {
           /* Remember what is being replaced for the adjust_after_replace
              calls near the end: the text itself, or only its size when
              undo_list is t.  */
5667       if (! EQ (current_buffer->undo_list, Qt))
5668         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5669       else
5670         {
5671           nchars_del = to - from;
5672           nbytes_del = to_byte - from_byte;
5673         }
5674     }
5675
5676   if (coding->composing != COMPOSITION_DISABLED)
5677     {
5678       if (encodep)
5679         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5680       else
5681         coding_allocate_composition_data (coding, from);
5682     }
5683
5684   /* Try to skip the heading and trailing ASCIIs.  We can't skip them
5685      if we must run CCL program or there are compositions to
5686      encode.  */
5687   if (coding->type != coding_type_ccl
5688       && (! coding->cmp_data || coding->cmp_data->used == 0))
5689     {
5690       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5691
5692       if (from < GPT && GPT < to)
5693         move_gap_both (from, from_byte);
5694       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* The whole region was skipped and nothing else remains to be
              done: report the text as produced unchanged and return.  */
5695       if (from_byte == to_byte
5696           && (encodep || NILP (coding->post_read_conversion))
5697           && ! CODING_REQUIRE_FLUSHING (coding))
5698         {
5699           coding->produced = len_byte;
5700           coding->produced_char = len;
5701           if (!replace)
5702             /* We must record and adjust for this new text now.  */
5703             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5704           coding_free_composition_data (coding);
5705           return 0;
5706         }
5707
           /* NOTE(review): byte counts are applied to character positions
              below; presumably valid because every skipped byte is a
              one-byte character -- confirm against the shrink routines.  */
5708       head_skip = from_byte - from_byte_orig;
5709       tail_skip = to_byte_orig - to_byte;
5710       total_skip = head_skip + tail_skip;
5711       from += head_skip;
5712       to -= tail_skip;
5713       len -= total_skip; len_byte -= total_skip;
5714     }
5715
5716   /* For conversion, we must put the gap before the text in addition to
5717      making the gap larger for efficient decoding.  The required gap
5718      size starts from 2000 which is the magic number used in make_gap.
5719      But, after one batch of conversion, it will be incremented if we
5720      find that it is not enough.  */
5721   require = 2000;
5722
5723   if (GAP_SIZE  < require)
5724     make_gap (require - GAP_SIZE);
5725   move_gap_both (from, from_byte);
5726
5727   inserted = inserted_byte = 0;
5728
       /* From here on the LEN_BYTE bytes of source text are accounted as
          the tail of the gap instead of as buffer text.  */
5729   GAP_SIZE += len_byte;
5730   ZV -= len;
5731   Z -= len;
5732   ZV_BYTE -= len_byte;
5733   Z_BYTE -= len_byte;
5734
5735   if (GPT - BEG < BEG_UNCHANGED)
5736     BEG_UNCHANGED = GPT - BEG;
5737   if (Z - GPT < END_UNCHANGED)
5738     END_UNCHANGED = Z - GPT;
5739
5740   if (!encodep && coding->src_multibyte)
5741     {
5742       /* Decoding routines expects that the source text is unibyte.
5743          We must convert 8-bit characters of multibyte form to
5744          unibyte.  */
5745       int len_byte_orig = len_byte;
5746       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, slide it up so that it still ends
              at GAP_END_ADDR.  */
5747       if (len_byte < len_byte_orig)
5748         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5749                     len_byte);
5750       coding->src_multibyte = 0;
5751     }
5752
5753   for (;;)
5754     {
5755       int result;
5756
5757       /* The buffer memory is now:
5758          +--------+converted-text+---------+-------original-text-------+---+
5759          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5760                   |<---------------------- GAP ----------------------->|  */
5761       src = GAP_END_ADDR - len_byte;
5762       dst = GPT_ADDR + inserted_byte;
5763
5764       if (encodep)
5765         result = encode_coding (coding, src, dst, len_byte, 0);
5766       else
5767         {
5768           if (coding->composing != COMPOSITION_DISABLED)
5769             coding->cmp_data->char_offset = from + inserted;
5770           result = decode_coding (coding, src, dst, len_byte, 0);
5771         }
5772
5773       /* The buffer memory is now:
5774          +--------+-------converted-text----+--+------original-text----+---+
5775          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5776                   |<---------------------- GAP ----------------------->|  */
5777
5778       inserted += coding->produced_char;
5779       inserted_byte += coding->produced;
5780       len_byte -= coding->consumed;
5781
           /* Out of room for composition data: get another block and
              go around again.  */
5782       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5783         {
5784           coding_allocate_composition_data (coding, from + inserted);
5785           continue;
5786         }
5787
5788       src += coding->consumed;
5789       dst += coding->produced;
5790
5791       if (result == CODING_FINISH_NORMAL)
5792         {
5793           src += len_byte;
5794           break;
5795         }
5796       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5797         {
5798           unsigned char *pend = dst, *p = pend - inserted_byte;
5799           Lisp_Object eol_type;
5800
5801           /* Encode LFs back to the original eol format (CR or CRLF).  */
5802           if (coding->eol_type == CODING_EOL_CR)
5803             {
5804               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5805             }
5806           else
5807             {
5808               int count = 0;
5809
                   /* Each LF grows by one byte, so COUNT extra bytes are
                      needed between DST and SRC.  */
5810               while (p < pend) if (*p++ == '\n') count++;
5811               if (src - dst < count)
5812                 {
5813                   /* We don't have sufficient room for encoding LFs
5814                      back to CRLF.  We must record converted and
5815                      not-yet-converted text back to the buffer
5816                      content, enlarge the gap, then record them out of
5817                      the buffer contents again.  */
5818                   int add = len_byte + inserted_byte;
5819
5820                   GAP_SIZE -= add;
5821                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5822                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5823                   make_gap (count - GAP_SIZE);
5824                   GAP_SIZE += add;
5825                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5826                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5827                   /* Don't forget to update SRC, DST, and PEND.  */
5828                   src = GAP_END_ADDR - len_byte;
5829                   dst = GPT_ADDR + inserted_byte;
5830                   pend = dst;
5831                 }
5832               inserted += count;
5833               inserted_byte += count;
5834               coding->produced += count;
                   /* Copy backwards from the end, putting a CR in front
                      of every LF, until all COUNT CRs are placed.  */
5835               p = dst = pend + count;
5836               while (count)
5837                 {
5838                   *--p = *--pend;
5839                   if (*p == '\n') count--, *--p = '\r';
5840                 }
5841             }
5842
5843           /* Suppress eol-format conversion in the further conversion.  */
5844           coding->eol_type = CODING_EOL_LF;
5845
5846           /* Set the coding system symbol to that for Unix-like EOL.  */
5847           eol_type = Fget (saved_coding_symbol, Qeol_type);
5848           if (VECTORP (eol_type)
5849               && XVECTOR (eol_type)->size == 3
5850               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5851             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5852           else
5853             coding->symbol = saved_coding_symbol;
5854
5855           continue;
5856         }
5857       if (len_byte <= 0)
5858         {
               /* All source is consumed.  A CCL based coding system gets
                  one more pass with CODING_MODE_LAST_BLOCK set.  */
5859           if (coding->type != coding_type_ccl
5860               || coding->mode & CODING_MODE_LAST_BLOCK)
5861             break;
5862           coding->mode |= CODING_MODE_LAST_BLOCK;
5863           continue;
5864         }
5865       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5866         {
5867           /* The source text ends in invalid codes.  Let's just
5868              make them valid buffer contents, and finish conversion.  */
5869           if (multibyte_p)
5870             {
5871               unsigned char *start = dst;
5872
5873               inserted += len_byte;
5874               while (len_byte--)
5875                 {
5876                   int c = *src++;
5877                   dst += CHAR_STRING (c, dst);
5878                 }
5879
5880               inserted_byte += dst - start;
5881             }
5882           else
5883             {
5884               inserted += len_byte;
5885               inserted_byte += len_byte;
5886               while (len_byte--)
5887                 *dst++ = *src++;
5888             }
5889           break;
5890         }
5891       if (result == CODING_FINISH_INTERRUPT)
5892         {
5893           /* The conversion procedure was interrupted by a user.  */
5894           break;
5895         }
5896       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5897       if (coding->consumed < 1)
5898         {
5899           /* It's quite strange to require more memory without
5900              consuming any bytes.  Perhaps CCL program bug.  */
5901           break;
5902         }
5903       if (first)
5904         {
5905           /* We have just done the first batch of conversion which was
5906              stopped because of insufficient gap.  Let's reconsider the
5907              required gap size (i.e. SRC - DST) now.
5908
5909              We have converted ORIG bytes (== coding->consumed) into
5910              NEW bytes (coding->produced).  To convert the remaining
5911              LEN bytes, we may need REQUIRE bytes of gap, where:
5912                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5913                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5914              Here, we are sure that NEW >= ORIG.  */
5915
5916           if (coding->produced <= coding->consumed)
5917             {
5918               /* This happens because of CCL-based coding system with
5919                  eol-type CRLF.  */
5920               require = 0;
5921             }
5922           else
5923             {
5924               float ratio = coding->produced - coding->consumed;
5925               ratio /= coding->consumed;
5926               require = len_byte * ratio;
5927             }
5928           first = 0;
5929         }
5930       if ((src - dst) < (require + 2000))
5931         {
5932           /* See the comment above the previous call of make_gap.  */
5933           int add = len_byte + inserted_byte;
5934
5935           GAP_SIZE -= add;
5936           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5937           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5938           make_gap (require + 2000);
5939           GAP_SIZE += add;
5940           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5941           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5942         }
5943     }
5944   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5945
5946   if (encodep && coding->dst_multibyte)
5947     {
5948       /* The output is unibyte.  We must convert 8-bit characters to
5949          multibyte form.  */
5950       if (inserted_byte * 2 > GAP_SIZE)
5951         {
5952           GAP_SIZE -= inserted_byte;
5953           ZV += inserted_byte; Z += inserted_byte;
5954           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5955           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5956           make_gap (inserted_byte - GAP_SIZE);
5957           GAP_SIZE += inserted_byte;
5958           ZV -= inserted_byte; Z -= inserted_byte;
5959           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5960           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5961         }
5962       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5963     }
5964
5965   /* If we shrank the conversion area, adjust it now.  */
5966   if (total_skip > 0)
5967     {
           /* Bring the skipped tail down next to the converted text, then
              widen all the bookkeeping by the skipped amounts so that it
              covers the original region again.  */
5968       if (tail_skip > 0)
5969         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5970       inserted += total_skip; inserted_byte += total_skip;
5971       GAP_SIZE += total_skip;
5972       GPT -= head_skip; GPT_BYTE -= head_skip;
5973       ZV -= total_skip; ZV_BYTE -= total_skip;
5974       Z -= total_skip; Z_BYTE -= total_skip;
5975       from -= head_skip; from_byte -= head_skip;
5976       to += tail_skip; to_byte += tail_skip;
5977     }
5978
       /* Z is sampled around the call so that INSERTED becomes the change
          in Z that the adjust_after_replace call produced.  */
5979   prev_Z = Z;
5980   if (! EQ (current_buffer->undo_list, Qt))
5981     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5982   else
5983     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5984                                  inserted, inserted_byte);
5985   inserted = Z - prev_Z;
5986
5987   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5988     coding_restore_composition (coding, Fcurrent_buffer ());
5989   coding_free_composition_data (coding);
5990
5991   if (! inhibit_pre_post_conversion
5992       && ! encodep && ! NILP (coding->post_read_conversion))
5993     {
5994       Lisp_Object val;
5995       Lisp_Object saved_coding_system;
5996
5997       if (from != PT)
5998         TEMP_SET_PT_BOTH (from, from_byte);
5999       prev_Z = Z;
6000       record_unwind_protect (code_convert_region_unwind,
6001                              Fcons (Vlast_coding_system_used, Qnil));
           /* Let the function see CODING's symbol through
              Vlast_coding_system_used, and pick up whatever it leaves
              there as CODING's new symbol.  */
6002       saved_coding_system = Vlast_coding_system_used;
6003       Vlast_coding_system_used = coding->symbol;
6004       /* We should not call any more pre-write/post-read-conversion
6005          functions while this post-read-conversion is running.  */
6006       inhibit_pre_post_conversion = 1;
6007       val = call1 (coding->post_read_conversion, make_number (inserted));
6008       inhibit_pre_post_conversion = 0;
6009       coding->symbol = Vlast_coding_system_used;
6010       Vlast_coding_system_used = saved_coding_system;
6011       /* Discard the unwind protect.  */
6012       specpdl_ptr--;
           /* VAL is only type-checked; the size change is taken from Z.  */
6013       CHECK_NUMBER (val);
6014       inserted += Z - prev_Z;
6015     }
6016
       /* Put point back: shifted by the size change if it was at or after
          the end of the region, at FROM if it was inside.  */
6017   if (orig_point >= from)
6018     {
6019       if (orig_point >= from + orig_len)
6020         orig_point += inserted - orig_len;
6021       else
6022         orig_point = from;
6023       TEMP_SET_PT (orig_point);
6024     }
6025
6026   if (replace)
6027     {
6028       signal_after_change (from, to - from, inserted);
6029       update_compositions (from, from + inserted, CHECK_BORDER);
6030     }
6031
       /* Overwrite the per-batch counters with totals for the region.  */
6032   {
6033     coding->consumed = to_byte - from_byte;
6034     coding->consumed_char = to - from;
6035     coding->produced = inserted_byte;
6036     coding->produced_char = inserted;
6037   }
6038
6039   return 0;
6040 }
6041
6042 /* Name (or base name) of work buffer for code conversion.  */
6043 static Lisp_Object Vcode_conversion_workbuf_name;
6044
6045 /* Set the current buffer to the working buffer prepared for
6046    code-conversion.  MULTIBYTE specifies the multibyteness of the
6047    buffer.  Return the buffer we set if it must be killed after use.
6048    Otherwise return Qnil.  */
6049
6050 static Lisp_Object
6051 set_conversion_work_buffer (multibyte)
6052      int multibyte;
6053 {
6054   Lisp_Object buffer, buffer_to_kill;
6055   struct buffer *buf;
6056
6057   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6058   buf = XBUFFER (buffer);
6059   if (buf == current_buffer)
6060     {
6061       /* As we are already in the work buffer, we must generate a new
6062          buffer for the work.  */
6063       Lisp_Object name;
6064
6065       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6066       buffer = buffer_to_kill = Fget_buffer_create (name);
6067       buf = XBUFFER (buffer);
6068     }
6069   else
6070     buffer_to_kill = Qnil;
6071
6072   delete_all_overlays (buf);
6073   buf->directory = current_buffer->directory;
6074   buf->read_only = Qnil;
6075   buf->filename = Qnil;
6076   buf->undo_list = Qt;
6077   eassert (buf->overlays_before == NULL);
6078   eassert (buf->overlays_after == NULL);
6079   set_buffer_internal (buf);
6080   if (BEG != BEGV || Z != ZV)
6081     Fwiden ();
6082   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6083   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6084   return buffer_to_kill;
6085 }
6086
6087 Lisp_Object
6088 run_pre_post_conversion_on_str (str, coding, encodep)
6089      Lisp_Object str;
6090      struct coding_system *coding;
6091      int encodep;
6092 {
6093   int count = SPECPDL_INDEX ();
6094   struct gcpro gcpro1, gcpro2;
6095   int multibyte = STRING_MULTIBYTE (str);
6096   Lisp_Object old_deactivate_mark;
6097   Lisp_Object buffer_to_kill;
6098   Lisp_Object unwind_arg;
6099
6100   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6101   /* It is not crucial to specbind this.  */
6102   old_deactivate_mark = Vdeactivate_mark;
6103   GCPRO2 (str, old_deactivate_mark);
6104
6105   /* We must insert the contents of STR as is without
6106      unibyte<->multibyte conversion.  For that, we adjust the
6107      multibyteness of the working buffer to that of STR.  */
6108   buffer_to_kill = set_conversion_work_buffer (multibyte);
6109   if (NILP (buffer_to_kill))
6110     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6111   else
6112     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6113   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6114
6115   insert_from_string (str, 0, 0,
6116                       SCHARS (str), SBYTES (str), 0);
6117   UNGCPRO;
6118   inhibit_pre_post_conversion = 1;
6119   if (encodep)
6120     {
6121       struct buffer *prev = current_buffer;
6122
6123       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6124       if (prev != current_buffer)
6125         /* We must kill the current buffer too.  */
6126         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6127     }
6128   else
6129     {
6130       Vlast_coding_system_used = coding->symbol;
6131       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6132       call1 (coding->post_read_conversion, make_number (Z - BEG));
6133       coding->symbol = Vlast_coding_system_used;
6134     }
6135   inhibit_pre_post_conversion = 0;
6136   Vdeactivate_mark = old_deactivate_mark;
6137   str = make_buffer_string (BEG, Z, 1);
6138   return unbind_to (count, str);
6139 }
6140
6141
6142 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6143    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6144    is intended that this function is called from encode_terminal_code,
6145    the pre-write-conversion function is run by safe_call and thus
6146    "Error during redisplay: ..." is logged when an error occurs.
6147
6148    Store the resulting text in *STR and set CODING->produced_char and
6149    CODING->produced to the number of characters and bytes
6150    respectively.  If the size of *STR is too small, enlarge it by
6151    xrealloc and update *STR and *SIZE.  */
6152
6153 void
6154 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6155      unsigned char **str;
6156      int *size, nchars, nbytes;
6157      struct coding_system *coding;
6158 {
6159   struct gcpro gcpro1, gcpro2;
6160   struct buffer *cur = current_buffer;
6161   struct buffer *prev;
6162   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6163   Lisp_Object args[3];
6164   Lisp_Object buffer_to_kill;
6165
6166   /* It is not crucial to specbind this.  */
6167   old_deactivate_mark = Vdeactivate_mark;
6168   old_last_coding_system_used = Vlast_coding_system_used;
6169   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6170
6171   /* We must insert the contents of STR as is without
6172      unibyte<->multibyte conversion.  For that, we adjust the
6173      multibyteness of the working buffer to that of STR.  */
6174   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6175   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6176   UNGCPRO;
6177   inhibit_pre_post_conversion = 1;
6178   prev = current_buffer;
6179   args[0] = coding->pre_write_conversion;
6180   args[1] = make_number (BEG);
6181   args[2] = make_number (Z);
6182   safe_call (3, args);
6183   inhibit_pre_post_conversion = 0;
6184   Vdeactivate_mark = old_deactivate_mark;
6185   Vlast_coding_system_used = old_last_coding_system_used;
6186   coding->produced_char = Z - BEG;
6187   coding->produced = Z_BYTE - BEG_BYTE;
6188   if (coding->produced > *size)
6189     {
6190       *size = coding->produced;
6191       *str = xrealloc (*str, *size);
6192     }
6193   if (BEG < GPT && GPT < Z)
6194     move_gap (BEG);
6195   bcopy (BEG_ADDR, *str, coding->produced);
6196   coding->src_multibyte
6197     = ! NILP (current_buffer->enable_multibyte_characters);
6198   if (prev != current_buffer)
6199     Fkill_buffer (Fcurrent_buffer ());
6200   set_buffer_internal (cur);
6201   if (! NILP (buffer_to_kill))
6202     Fkill_buffer (buffer_to_kill);
6203 }
6204
6205
6206 Lisp_Object
6207 decode_coding_string (str, coding, nocopy)
6208      Lisp_Object str;
6209      struct coding_system *coding;
6210      int nocopy;
6211 {
6212   int len;
6213   struct conversion_buffer buf;
6214   int from, to_byte;
6215   Lisp_Object saved_coding_symbol;
6216   int result;
6217   int require_decoding;
6218   int shrinked_bytes = 0;
6219   Lisp_Object newstr;
6220   int consumed, consumed_char, produced, produced_char;
6221
6222   from = 0;
6223   to_byte = SBYTES (str);
6224
6225   saved_coding_symbol = coding->symbol;
6226   coding->src_multibyte = STRING_MULTIBYTE (str);
6227   coding->dst_multibyte = 1;
6228   if (CODING_REQUIRE_DETECTION (coding))
6229     {
6230       /* See the comments in code_convert_region.  */
6231       if (coding->type == coding_type_undecided)
6232         {
6233           detect_coding (coding, SDATA (str), to_byte);
6234           if (coding->type == coding_type_undecided)
6235             {
6236               coding->type = coding_type_emacs_mule;
6237               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6238               /* As emacs-mule decoder will handle composition, we
6239                  need this setting to allocate coding->cmp_data
6240                  later.  */
6241               coding->composing = COMPOSITION_NO;
6242             }
6243         }
6244       if (coding->eol_type == CODING_EOL_UNDECIDED
6245           && coding->type != coding_type_ccl)
6246         {
6247           saved_coding_symbol = coding->symbol;
6248           detect_eol (coding, SDATA (str), to_byte);
6249           if (coding->eol_type == CODING_EOL_UNDECIDED)
6250             coding->eol_type = CODING_EOL_LF;
6251           /* We had better recover the original eol format if we
6252              encounter an inconsistent eol format while decoding.  */
6253           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6254         }
6255     }
6256
6257   if (coding->type == coding_type_no_conversion
6258       || coding->type == coding_type_raw_text)
6259     coding->dst_multibyte = 0;
6260
6261   require_decoding = CODING_REQUIRE_DECODING (coding);
6262
6263   if (STRING_MULTIBYTE (str))
6264     {
6265       /* Decoding routines expect the source text to be unibyte.  */
6266       str = Fstring_as_unibyte (str);
6267       to_byte = SBYTES (str);
6268       nocopy = 1;
6269       coding->src_multibyte = 0;
6270     }
6271
6272   /* Try to skip the heading and tailing ASCIIs.  */
6273   if (require_decoding && coding->type != coding_type_ccl)
6274     {
6275       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6276                                 0);
6277       if (from == to_byte)
6278         require_decoding = 0;
6279       shrinked_bytes = from + (SBYTES (str) - to_byte);
6280     }
6281
6282   if (!require_decoding
6283       && !(SYMBOLP (coding->post_read_conversion)
6284            && !NILP (Ffboundp (coding->post_read_conversion))))
6285     {
6286       coding->consumed = SBYTES (str);
6287       coding->consumed_char = SCHARS (str);
6288       if (coding->dst_multibyte)
6289         {
6290           str = Fstring_as_multibyte (str);
6291           nocopy = 1;
6292         }
6293       coding->produced = SBYTES (str);
6294       coding->produced_char = SCHARS (str);
6295       return (nocopy ? str : Fcopy_sequence (str));
6296     }
6297
6298   if (coding->composing != COMPOSITION_DISABLED)
6299     coding_allocate_composition_data (coding, from);
6300   len = decoding_buffer_size (coding, to_byte - from);
6301   allocate_conversion_buffer (buf, len);
6302
6303   consumed = consumed_char = produced = produced_char = 0;
6304   while (1)
6305     {
6306       result = decode_coding (coding, SDATA (str) + from + consumed,
6307                               buf.data + produced, to_byte - from - consumed,
6308                               buf.size - produced);
6309       consumed += coding->consumed;
6310       consumed_char += coding->consumed_char;
6311       produced += coding->produced;
6312       produced_char += coding->produced_char;
6313       if (result == CODING_FINISH_NORMAL
6314           || result == CODING_FINISH_INTERRUPT
6315           || (result == CODING_FINISH_INSUFFICIENT_SRC
6316               && coding->consumed == 0))
6317         break;
6318       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6319         coding_allocate_composition_data (coding, from + produced_char);
6320       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6321         extend_conversion_buffer (&buf);
6322       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6323         {
6324           Lisp_Object eol_type;
6325
6326           /* Recover the original EOL format.  */
6327           if (coding->eol_type == CODING_EOL_CR)
6328             {
6329               unsigned char *p;
6330               for (p = buf.data; p < buf.data + produced; p++)
6331                 if (*p == '\n') *p = '\r';
6332             }
6333           else if (coding->eol_type == CODING_EOL_CRLF)
6334             {
6335               int num_eol = 0;
6336               unsigned char *p0, *p1;
6337               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6338                 if (*p0 == '\n') num_eol++;
6339               if (produced + num_eol >= buf.size)
6340                 extend_conversion_buffer (&buf);
6341               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6342                 {
6343                   *--p1 = *--p0;
6344                   if (*p0 == '\n') *--p1 = '\r';
6345                 }
6346               produced += num_eol;
6347               produced_char += num_eol;
6348             }
6349           /* Suppress eol-format conversion in the further conversion.  */
6350           coding->eol_type = CODING_EOL_LF;
6351
6352           /* Set the coding system symbol to that for Unix-like EOL.  */
6353           eol_type = Fget (saved_coding_symbol, Qeol_type);
6354           if (VECTORP (eol_type)
6355               && XVECTOR (eol_type)->size == 3
6356               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6357             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6358           else
6359             coding->symbol = saved_coding_symbol;
6360
6361
6362         }
6363     }
6364
6365   coding->consumed = consumed;
6366   coding->consumed_char = consumed_char;
6367   coding->produced = produced;
6368   coding->produced_char = produced_char;
6369
6370   if (coding->dst_multibyte)
6371     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6372                                            produced + shrinked_bytes);
6373   else
6374     newstr = make_uninit_string (produced + shrinked_bytes);
6375   if (from > 0)
6376     STRING_COPYIN (newstr, 0, SDATA (str), from);
6377   STRING_COPYIN (newstr, from, buf.data, produced);
6378   if (shrinked_bytes > from)
6379     STRING_COPYIN (newstr, from + produced,
6380                    SDATA (str) + to_byte,
6381                    shrinked_bytes - from);
6382   free_conversion_buffer (&buf);
6383
6384   coding->consumed += shrinked_bytes;
6385   coding->consumed_char += shrinked_bytes;
6386   coding->produced += shrinked_bytes;
6387   coding->produced_char += shrinked_bytes;
6388
6389   if (coding->cmp_data && coding->cmp_data->used)
6390     coding_restore_composition (coding, newstr);
6391   coding_free_composition_data (coding);
6392
6393   if (SYMBOLP (coding->post_read_conversion)
6394       && !NILP (Ffboundp (coding->post_read_conversion)))
6395     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6396
6397   return newstr;
6398 }
6399
6400 Lisp_Object
6401 encode_coding_string (str, coding, nocopy)
6402      Lisp_Object str;
6403      struct coding_system *coding;
6404      int nocopy;
6405 {
6406   int len;
6407   struct conversion_buffer buf;
6408   int from, to, to_byte;
6409   int result;
6410   int shrinked_bytes = 0;
6411   Lisp_Object newstr;
6412   int consumed, consumed_char, produced, produced_char;
6413
6414   if (SYMBOLP (coding->pre_write_conversion)
6415       && !NILP (Ffboundp (coding->pre_write_conversion)))
6416     {
6417       str = run_pre_post_conversion_on_str (str, coding, 1);
6418       /* As STR is just newly generated, we don't have to copy it
6419          anymore.  */
6420       nocopy = 1;
6421     }
6422
6423   from = 0;
6424   to = SCHARS (str);
6425   to_byte = SBYTES (str);
6426
6427   /* Encoding routines determine the multibyteness of the source text
6428      by coding->src_multibyte.  */
6429   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6430   coding->dst_multibyte = 0;
6431   if (! CODING_REQUIRE_ENCODING (coding))
6432     goto no_need_of_encoding;
6433
6434   if (coding->composing != COMPOSITION_DISABLED)
6435     coding_save_composition (coding, from, to, str);
6436
6437   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6438      if we must run CCL program or there are compositions to
6439      encode.  */
6440   if (coding->type != coding_type_ccl
6441       && (! coding->cmp_data || coding->cmp_data->used == 0))
6442     {
6443       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6444                                 1);
6445       if (from == to_byte)
6446         {
6447           coding_free_composition_data (coding);
6448           goto no_need_of_encoding;
6449         }
6450       shrinked_bytes = from + (SBYTES (str) - to_byte);
6451     }
6452
6453   len = encoding_buffer_size (coding, to_byte - from);
6454   allocate_conversion_buffer (buf, len);
6455
6456   consumed = consumed_char = produced = produced_char = 0;
6457   while (1)
6458     {
6459       result = encode_coding (coding, SDATA (str) + from + consumed,
6460                               buf.data + produced, to_byte - from - consumed,
6461                               buf.size - produced);
6462       consumed += coding->consumed;
6463       consumed_char += coding->consumed_char;
6464       produced += coding->produced;
6465       produced_char += coding->produced_char;
6466       if (result == CODING_FINISH_NORMAL
6467           || result == CODING_FINISH_INTERRUPT
6468           || (result == CODING_FINISH_INSUFFICIENT_SRC
6469               && coding->consumed == 0))
6470         break;
6471       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6472       extend_conversion_buffer (&buf);
6473     }
6474
6475   coding->consumed = consumed;
6476   coding->consumed_char = consumed_char;
6477   coding->produced = produced;
6478   coding->produced_char = produced_char;
6479
6480   newstr = make_uninit_string (produced + shrinked_bytes);
6481   if (from > 0)
6482     STRING_COPYIN (newstr, 0, SDATA (str), from);
6483   STRING_COPYIN (newstr, from, buf.data, produced);
6484   if (shrinked_bytes > from)
6485     STRING_COPYIN (newstr, from + produced,
6486                    SDATA (str) + to_byte,
6487                    shrinked_bytes - from);
6488
6489   free_conversion_buffer (&buf);
6490   coding_free_composition_data (coding);
6491
6492   return newstr;
6493
6494  no_need_of_encoding:
6495   coding->consumed = SBYTES (str);
6496   coding->consumed_char = SCHARS (str);
6497   if (STRING_MULTIBYTE (str))
6498     {
6499       if (nocopy)
6500         /* We are sure that STR doesn't contain a multibyte
6501            character.  */
6502         STRING_SET_UNIBYTE (str);
6503       else
6504         {
6505           str = Fstring_as_unibyte (str);
6506           nocopy = 1;
6507         }
6508     }
6509   coding->produced = SBYTES (str);
6510   coding->produced_char = SCHARS (str);
6511   return (nocopy ? str : Fcopy_sequence (str));
6512 }
6513
6514 \f
6515 #ifdef emacs
6516 /*** 8. Emacs Lisp library functions ***/
6517
6518 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6519        doc: /* Return t if OBJECT is nil or a coding-system.
6520 See the documentation of `make-coding-system' for information
6521 about coding-system objects.  */)
6522      (obj)
6523      Lisp_Object obj;
6524 {
6525   if (NILP (obj))
6526     return Qt;
6527   if (!SYMBOLP (obj))
6528     return Qnil;
6529   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6530     return Qt;
6531   /* Get coding-spec vector for OBJ.  */
6532   obj = Fget (obj, Qcoding_system);
6533   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6534           ? Qt : Qnil);
6535 }
6536
6537 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6538        Sread_non_nil_coding_system, 1, 1, 0,
6539        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6540      (prompt)
6541      Lisp_Object prompt;
6542 {
6543   Lisp_Object val;
6544   do
6545     {
6546       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6547                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6548     }
6549   while (SCHARS (val) == 0);
6550   return (Fintern (val, Qnil));
6551 }
6552
6553 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6554        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6555 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6556      (prompt, default_coding_system)
6557      Lisp_Object prompt, default_coding_system;
6558 {
6559   Lisp_Object val;
6560   if (SYMBOLP (default_coding_system))
6561     default_coding_system = SYMBOL_NAME (default_coding_system);
6562   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6563                           Qt, Qnil, Qcoding_system_history,
6564                           default_coding_system, Qnil);
6565   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6566 }
6567
6568 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6569        1, 1, 0,
6570        doc: /* Check validity of CODING-SYSTEM.
6571 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6572 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6573 The value of this property should be a vector of length 5.  */)
6574      (coding_system)
6575      Lisp_Object coding_system;
6576 {
6577   Lisp_Object define_form;
6578
6579   define_form = Fget (coding_system, Qcoding_system_define_form);
6580   if (! NILP (define_form))
6581     {
6582       Fput (coding_system, Qcoding_system_define_form, Qnil);
6583       safe_eval (define_form);
6584     }
6585   if (!NILP (Fcoding_system_p (coding_system)))
6586     return coding_system;
6587   xsignal1 (Qcoding_system_error, coding_system);
6588 }
6589 \f
6590 Lisp_Object
6591 detect_coding_system (src, src_bytes, highest, multibytep)
6592      const unsigned char *src;
6593      int src_bytes, highest;
6594      int multibytep;
6595 {
6596   int coding_mask, eol_type;
6597   Lisp_Object val, tmp;
6598   int dummy;
6599
6600   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6601   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6602   if (eol_type == CODING_EOL_INCONSISTENT)
6603     eol_type = CODING_EOL_UNDECIDED;
6604
6605   if (!coding_mask)
6606     {
6607       val = Qundecided;
6608       if (eol_type != CODING_EOL_UNDECIDED)
6609         {
6610           Lisp_Object val2;
6611           val2 = Fget (Qundecided, Qeol_type);
6612           if (VECTORP (val2))
6613             val = XVECTOR (val2)->contents[eol_type];
6614         }
6615       return (highest ? val : Fcons (val, Qnil));
6616     }
6617
6618   /* At first, gather possible coding systems in VAL.  */
6619   val = Qnil;
6620   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6621     {
6622       Lisp_Object category_val, category_index;
6623
6624       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6625       category_val = Fsymbol_value (XCAR (tmp));
6626       if (!NILP (category_val)
6627           && NATNUMP (category_index)
6628           && (coding_mask & (1 << XFASTINT (category_index))))
6629         {
6630           val = Fcons (category_val, val);
6631           if (highest)
6632             break;
6633         }
6634     }
6635   if (!highest)
6636     val = Fnreverse (val);
6637
6638   /* Then, replace the elements with subsidiary coding systems.  */
6639   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6640     {
6641       if (eol_type != CODING_EOL_UNDECIDED
6642           && eol_type != CODING_EOL_INCONSISTENT)
6643         {
6644           Lisp_Object eol;
6645           eol = Fget (XCAR (tmp), Qeol_type);
6646           if (VECTORP (eol))
6647             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6648         }
6649     }
6650   return (highest ? XCAR (val) : val);
6651 }
6652
6653 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6654        2, 3, 0,
6655        doc: /* Detect how the byte sequence in the region is encoded.
6656 Return a list of possible coding systems used on decoding a byte
6657 sequence containing the bytes in the region between START and END when
6658 the coding system `undecided' is specified.  The list is ordered by
6659 priority decided in the current language environment.
6660
6661 If only ASCII characters are found, it returns a list of single element
6662 `undecided' or its subsidiary coding system according to a detected
6663 end-of-line format.
6664
6665 If optional argument HIGHEST is non-nil, return the coding system of
6666 highest priority.  */)
6667      (start, end, highest)
6668      Lisp_Object start, end, highest;
6669 {
6670   int from, to;
6671   int from_byte, to_byte;
6672   int include_anchor_byte = 0;
6673
6674   CHECK_NUMBER_COERCE_MARKER (start);
6675   CHECK_NUMBER_COERCE_MARKER (end);
6676
6677   validate_region (&start, &end);
6678   from = XINT (start), to = XINT (end);
6679   from_byte = CHAR_TO_BYTE (from);
6680   to_byte = CHAR_TO_BYTE (to);
6681
6682   if (from < GPT && to >= GPT)
6683     move_gap_both (to, to_byte);
6684   /* If we an anchor byte `\0' follows the region, we include it in
6685      the detecting source.  Then code detectors can handle the tailing
6686      byte sequence more accurately.
6687
6688      Fix me: This is not a perfect solution.  It is better that we
6689      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6690   */
6691   if (to == Z || (to == GPT && GAP_SIZE > 0))
6692     include_anchor_byte = 1;
6693   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6694                                to_byte - from_byte + include_anchor_byte,
6695                                !NILP (highest),
6696                                !NILP (current_buffer
6697                                       ->enable_multibyte_characters));
6698 }
6699
6700 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6701        1, 2, 0,
6702        doc: /* Detect how the byte sequence in STRING is encoded.
6703 Return a list of possible coding systems used on decoding a byte
6704 sequence containing the bytes in STRING when the coding system
6705 `undecided' is specified.  The list is ordered by priority decided in
6706 the current language environment.
6707
6708 If only ASCII characters are found, it returns a list of single element
6709 `undecided' or its subsidiary coding system according to a detected
6710 end-of-line format.
6711
6712 If optional argument HIGHEST is non-nil, return the coding system of
6713 highest priority.  */)
6714      (string, highest)
6715      Lisp_Object string, highest;
6716 {
6717   CHECK_STRING (string);
6718
6719   return detect_coding_system (SDATA (string),
6720                                /* "+ 1" is to include the anchor byte
6721                                   `\0'.  With this, code detectors can
6722                                   handle the tailing bytes more
6723                                   accurately.  */
6724                                SBYTES (string) + 1,
6725                                !NILP (highest),
6726                                STRING_MULTIBYTE (string));
6727 }
6728
6729 /*  Subroutine for Ffind_coding_systems_region_internal.
6730
6731     Return a list of coding systems that safely encode the multibyte
6732     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6733     possible coding systems.  If it is nil, it means that we have not
6734     yet found any coding systems.
6735
6736     WORK_TABLE a char-table of which element is set to t once the
6737     element is looked up.
6738
6739     If a non-ASCII single byte char is found, set
6740     *single_byte_char_found to 1.  */
6741
6742 static Lisp_Object
6743 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6744      unsigned char *p, *pend;
6745      Lisp_Object safe_codings, work_table;
6746      int *single_byte_char_found;
6747 {
6748   int c, len;
6749   Lisp_Object val, ch;
6750   Lisp_Object prev, tail;
6751
6752   if (NILP (safe_codings))
6753     goto done_safe_codings;
6754   while (p < pend)
6755     {
6756       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6757       p += len;
6758       if (ASCII_BYTE_P (c))
6759         /* We can ignore ASCII characters here.  */
6760         continue;
6761       if (SINGLE_BYTE_CHAR_P (c))
6762         *single_byte_char_found = 1;
6763       /* Check the safe coding systems for C.  */
6764       ch = make_number (c);
6765       val = Faref (work_table, ch);
6766       if (EQ (val, Qt))
6767         /* This element was already checked.  Ignore it.  */
6768         continue;
6769       /* Remember that we checked this element.  */
6770       Faset (work_table, ch, Qt);
6771
6772       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6773         {
6774           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6775           int encodable;
6776
6777           elt = XCAR (tail);
6778           if (CONSP (XCDR (elt)))
6779             {
6780               /* This entry has this format now:
6781                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6782                           ACCEPT-LATIN-EXTRA ) */
6783               val = XCDR (elt);
6784               encodable = ! NILP (Faref (XCAR (val), ch));
6785               if (! encodable)
6786                 {
6787                   val = XCDR (val);
6788                   translation_table = XCAR (val);
6789                   hash_table = XCAR (XCDR (val));
6790                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6791                 }
6792             }
6793           else
6794             {
6795               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6796               encodable = ! NILP (Faref (XCDR (elt), ch));
6797               if (! encodable)
6798                 {
6799                   /* Transform the format to:
6800                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6801                        ACCEPT-LATIN-EXTRA )  */
6802                   val = Fget (XCAR (elt), Qcoding_system);
6803                   translation_table
6804                     = Fplist_get (AREF (val, 3),
6805                                   Qtranslation_table_for_encode);
6806                   if (SYMBOLP (translation_table))
6807                     translation_table = Fget (translation_table,
6808                                               Qtranslation_table);
6809                   hash_table
6810                     = (CHAR_TABLE_P (translation_table)
6811                        ? XCHAR_TABLE (translation_table)->extras[1]
6812                        : Qnil);
6813                   accept_latin_extra
6814                     = ((EQ (AREF (val, 0), make_number (2))
6815                         && VECTORP (AREF (val, 4)))
6816                        ? AREF (AREF (val, 4), 16)
6817                        : Qnil);
6818                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6819                                         translation_table, hash_table,
6820                                         accept_latin_extra));
6821                 }
6822             }
6823
6824           if (! encodable
6825               && ((CHAR_TABLE_P (translation_table)
6826                    && ! NILP (Faref (translation_table, ch)))
6827                   || (HASH_TABLE_P (hash_table)
6828                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6829                   || (SINGLE_BYTE_CHAR_P (c)
6830                       && ! NILP (accept_latin_extra)
6831                       && VECTORP (Vlatin_extra_code_table)
6832                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6833             encodable = 1;
6834           if (encodable)
6835             prev = tail;
6836           else
6837             {
6838               /* Exclude this coding system from SAFE_CODINGS.  */
6839               if (EQ (tail, safe_codings))
6840                 {
6841                   safe_codings = XCDR (safe_codings);
6842                   if (NILP (safe_codings))
6843                     goto done_safe_codings;
6844                 }
6845               else
6846                 XSETCDR (prev, XCDR (tail));
6847             }
6848         }
6849     }
6850
6851  done_safe_codings:
6852   /* If the above loop was terminated before P reaches PEND, it means
6853      SAFE_CODINGS was set to nil.  If we have not yet found an
6854      non-ASCII single-byte char, check it now.  */
6855   if (! *single_byte_char_found)
6856     while (p < pend)
6857       {
6858         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6859         p += len;
6860         if (! ASCII_BYTE_P (c)
6861             && SINGLE_BYTE_CHAR_P (c))
6862           {
6863             *single_byte_char_found = 1;
6864             break;
6865           }
6866       }
6867   return safe_codings;
6868 }
6869
6870 DEFUN ("find-coding-systems-region-internal",
6871        Ffind_coding_systems_region_internal,
6872        Sfind_coding_systems_region_internal, 2, 2, 0,
6873        doc: /* Internal use only.  */)
6874      (start, end)
6875      Lisp_Object start, end;
6876 {
6877   Lisp_Object work_table, safe_codings;
6878   int non_ascii_p = 0;
6879   int single_byte_char_found = 0;
6880   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6881
6882   if (STRINGP (start))
6883     {
6884       if (!STRING_MULTIBYTE (start))
6885         return Qt;
6886       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6887       p2 = p2end = p1end;
6888       if (SCHARS (start) != SBYTES (start))
6889         non_ascii_p = 1;
6890     }
6891   else
6892     {
6893       int from, to, stop;
6894
6895       CHECK_NUMBER_COERCE_MARKER (start);
6896       CHECK_NUMBER_COERCE_MARKER (end);
6897       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6898         args_out_of_range (start, end);
6899       if (NILP (current_buffer->enable_multibyte_characters))
6900         return Qt;
6901       from = CHAR_TO_BYTE (XINT (start));
6902       to = CHAR_TO_BYTE (XINT (end));
6903       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6904       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6905       if (stop == to)
6906         p2 = p2end = p1end;
6907       else
6908         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6909       if (XINT (end) - XINT (start) != to - from)
6910         non_ascii_p = 1;
6911     }
6912
6913   if (!non_ascii_p)
6914     {
6915       /* We are sure that the text contains no multibyte character.
6916          Check if it contains eight-bit-graphic.  */
6917       p = p1;
6918       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6919       if (p == p1end)
6920         {
6921           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6922           if (p == p2end)
6923             return Qt;
6924         }
6925     }
6926
6927   /* The text contains non-ASCII characters.  */
6928
6929   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6930   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6931
6932   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6933                                     &single_byte_char_found);
6934   if (p2 < p2end)
6935     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6936                                       &single_byte_char_found);
6937   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6938     safe_codings = Qt;
6939   else
6940     {
6941       /* Turn safe_codings to a list of coding systems... */
6942       Lisp_Object val;
6943
6944       if (single_byte_char_found)
6945         /* ... and append these for eight-bit chars.  */
6946         val = Fcons (Qraw_text,
6947                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6948       else
6949         /* ... and append generic coding systems.  */
6950         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6951
6952       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6953         val = Fcons (XCAR (XCAR (safe_codings)), val);
6954       safe_codings = val;
6955     }
6956
6957   return safe_codings;
6958 }
6959
6960
6961 /* Search from position POS for such characters that are unencodable
6962    accoding to SAFE_CHARS, and return a list of their positions.  P
6963    points where in the memory the character at POS exists.  Limit the
6964    search at PEND or when Nth unencodable characters are found.
6965
6966    If SAFE_CHARS is a char table, an element for an unencodable
6967    character is nil.
6968
6969    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6970
6971    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6972    eight-bit-graphic characters are unencodable.  */
6973
6974 static Lisp_Object
6975 unencodable_char_position (safe_chars, pos, p, pend, n)
6976      Lisp_Object safe_chars;
6977      int pos;
6978      unsigned char *p, *pend;
6979      int n;
6980 {
6981   Lisp_Object pos_list;
6982
6983   pos_list = Qnil;
6984   while (p < pend)
6985     {
6986       int len;
6987       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6988
6989       if (c >= 128
6990           && (CHAR_TABLE_P (safe_chars)
6991               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6992               : (NILP (safe_chars) || c < 256)))
6993         {
6994           pos_list = Fcons (make_number (pos), pos_list);
6995           if (--n <= 0)
6996             break;
6997         }
6998       pos++;
6999       p += len;
7000     }
7001   return Fnreverse (pos_list);
7002 }
7003
7004
7005 DEFUN ("unencodable-char-position", Funencodable_char_position,
7006        Sunencodable_char_position, 3, 5, 0,
7007        doc: /*
7008 Return position of first un-encodable character in a region.
7009 START and END specfiy the region and CODING-SYSTEM specifies the
7010 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7011
7012 If optional 4th argument COUNT is non-nil, it specifies at most how
7013 many un-encodable characters to search.  In this case, the value is a
7014 list of positions.
7015
7016 If optional 5th argument STRING is non-nil, it is a string to search
7017 for un-encodable characters.  In that case, START and END are indexes
7018 to the string.  */)
7019      (start, end, coding_system, count, string)
7020      Lisp_Object start, end, coding_system, count, string;
7021 {
7022   int n;
7023   Lisp_Object safe_chars;
7024   struct coding_system coding;
7025   Lisp_Object positions;
7026   int from, to;
7027   unsigned char *p, *pend;
7028
7029   if (NILP (string))
7030     {
7031       validate_region (&start, &end);
7032       from = XINT (start);
7033       to = XINT (end);
7034       if (NILP (current_buffer->enable_multibyte_characters))
7035         return Qnil;
7036       p = CHAR_POS_ADDR (from);
7037       if (to == GPT)
7038         pend = GPT_ADDR;
7039       else
7040         pend = CHAR_POS_ADDR (to);
7041     }
7042   else
7043     {
7044       CHECK_STRING (string);
7045       CHECK_NATNUM (start);
7046       CHECK_NATNUM (end);
7047       from = XINT (start);
7048       to = XINT (end);
7049       if (from > to
7050           || to > SCHARS (string))
7051         args_out_of_range_3 (string, start, end);
7052       if (! STRING_MULTIBYTE (string))
7053         return Qnil;
7054       p = SDATA (string) + string_char_to_byte (string, from);
7055       pend = SDATA (string) + string_char_to_byte (string, to);
7056     }
7057
7058   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7059
7060   if (NILP (count))
7061     n = 1;
7062   else
7063     {
7064       CHECK_NATNUM (count);
7065       n = XINT (count);
7066     }
7067
7068   if (coding.type == coding_type_no_conversion
7069       || coding.type == coding_type_raw_text)
7070     return Qnil;
7071
7072   if (coding.type == coding_type_undecided)
7073     safe_chars = Qnil;
7074   else
7075     safe_chars = coding_safe_chars (coding_system);
7076
7077   if (STRINGP (string)
7078       || from >= GPT || to <= GPT)
7079     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7080   else
7081     {
7082       Lisp_Object args[2];
7083
7084       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7085       n -= XINT (Flength (args[0]));
7086       if (n <= 0)
7087         positions = args[0];
7088       else
7089         {
7090           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7091                                                pend, n);
7092           positions = Fappend (2, args);
7093         }
7094     }
7095
7096   return  (NILP (count) ? Fcar (positions) : positions);
7097 }
7098
7099
7100 Lisp_Object
7101 code_convert_region1 (start, end, coding_system, encodep)
7102      Lisp_Object start, end, coding_system;
7103      int encodep;
7104 {
7105   struct coding_system coding;
7106   int from, to;
7107
7108   CHECK_NUMBER_COERCE_MARKER (start);
7109   CHECK_NUMBER_COERCE_MARKER (end);
7110   CHECK_SYMBOL (coding_system);
7111
7112   validate_region (&start, &end);
7113   from = XFASTINT (start);
7114   to = XFASTINT (end);
7115
7116   if (NILP (coding_system))
7117     return make_number (to - from);
7118
7119   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7120     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7121
7122   coding.mode |= CODING_MODE_LAST_BLOCK;
7123   coding.src_multibyte = coding.dst_multibyte
7124     = !NILP (current_buffer->enable_multibyte_characters);
7125   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7126                        &coding, encodep, 1);
7127   Vlast_coding_system_used = coding.symbol;
7128   return make_number (coding.produced_char);
7129 }
7130
7131 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7132        3, 3, "r\nzCoding system: ",
7133        doc: /* Decode the current region from the specified coding system.
7134 When called from a program, takes three arguments:
7135 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7136 This function sets `last-coding-system-used' to the precise coding system
7137 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7138 not fully specified.)
7139 It returns the length of the decoded text.  */)
7140      (start, end, coding_system)
7141      Lisp_Object start, end, coding_system;
7142 {
7143   return code_convert_region1 (start, end, coding_system, 0);
7144 }
7145
7146 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7147        3, 3, "r\nzCoding system: ",
7148        doc: /* Encode the current region into the specified coding system.
7149 When called from a program, takes three arguments:
7150 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7151 This function sets `last-coding-system-used' to the precise coding system
7152 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7153 not fully specified.)
7154 It returns the length of the encoded text.  */)
7155      (start, end, coding_system)
7156      Lisp_Object start, end, coding_system;
7157 {
7158   return code_convert_region1 (start, end, coding_system, 1);
7159 }
7160
7161 Lisp_Object
7162 code_convert_string1 (string, coding_system, nocopy, encodep)
7163      Lisp_Object string, coding_system, nocopy;
7164      int encodep;
7165 {
7166   struct coding_system coding;
7167
7168   CHECK_STRING (string);
7169   CHECK_SYMBOL (coding_system);
7170
7171   if (NILP (coding_system))
7172     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7173
7174   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7175     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7176
7177   coding.mode |= CODING_MODE_LAST_BLOCK;
7178   string = (encodep
7179             ? encode_coding_string (string, &coding, !NILP (nocopy))
7180             : decode_coding_string (string, &coding, !NILP (nocopy)));
7181   Vlast_coding_system_used = coding.symbol;
7182
7183   return string;
7184 }
7185
7186 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7187        2, 3, 0,
7188        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7189 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7190 if the decoding operation is trivial.
7191 This function sets `last-coding-system-used' to the precise coding system
7192 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7193 not fully specified.)  */)
7194      (string, coding_system, nocopy)
7195      Lisp_Object string, coding_system, nocopy;
7196 {
7197   return code_convert_string1 (string, coding_system, nocopy, 0);
7198 }
7199
7200 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7201        2, 3, 0,
7202        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7203 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7204 if the encoding operation is trivial.
7205 This function sets `last-coding-system-used' to the precise coding system
7206 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7207 not fully specified.)  */)
7208      (string, coding_system, nocopy)
7209      Lisp_Object string, coding_system, nocopy;
7210 {
7211   return code_convert_string1 (string, coding_system, nocopy, 1);
7212 }
7213
7214 /* Encode or decode STRING according to CODING_SYSTEM.
7215    Do not set Vlast_coding_system_used.
7216
7217    This function is called only from macros DECODE_FILE and
7218    ENCODE_FILE, thus we ignore character composition.  */
7219
7220 Lisp_Object
7221 code_convert_string_norecord (string, coding_system, encodep)
7222      Lisp_Object string, coding_system;
7223      int encodep;
7224 {
7225   struct coding_system coding;
7226
7227   CHECK_STRING (string);
7228   CHECK_SYMBOL (coding_system);
7229
7230   if (NILP (coding_system))
7231     return string;
7232
7233   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7234     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7235
7236   coding.composing = COMPOSITION_DISABLED;
7237   coding.mode |= CODING_MODE_LAST_BLOCK;
7238   return (encodep
7239           ? encode_coding_string (string, &coding, 1)
7240           : decode_coding_string (string, &coding, 1));
7241 }
7242 \f
7243 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7244        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7245 Return the corresponding character.  */)
7246      (code)
7247      Lisp_Object code;
7248 {
7249   unsigned char c1, c2, s1, s2;
7250   Lisp_Object val;
7251
7252   CHECK_NUMBER (code);
7253   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7254   if (s1 == 0)
7255     {
7256       if (s2 < 0x80)
7257         XSETFASTINT (val, s2);
7258       else if (s2 >= 0xA0 || s2 <= 0xDF)
7259         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7260       else
7261         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7262     }
7263   else
7264     {
7265       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7266           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7267         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7268       DECODE_SJIS (s1, s2, c1, c2);
7269       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7270     }
7271   return val;
7272 }
7273
7274 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7275        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7276 Return the corresponding code in SJIS.  */)
7277      (ch)
7278      Lisp_Object ch;
7279 {
7280   int charset, c1, c2, s1, s2;
7281   Lisp_Object val;
7282
7283   CHECK_NUMBER (ch);
7284   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7285   if (charset == CHARSET_ASCII)
7286     {
7287       val = ch;
7288     }
7289   else if (charset == charset_jisx0208
7290            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7291     {
7292       ENCODE_SJIS (c1, c2, s1, s2);
7293       XSETFASTINT (val, (s1 << 8) | s2);
7294     }
7295   else if (charset == charset_katakana_jisx0201
7296            && c1 > 0x20 && c2 < 0xE0)
7297     {
7298       XSETFASTINT (val, c1 | 0x80);
7299     }
7300   else
7301     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7302   return val;
7303 }
7304
7305 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7306        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7307 Return the corresponding character.  */)
7308      (code)
7309      Lisp_Object code;
7310 {
7311   int charset;
7312   unsigned char b1, b2, c1, c2;
7313   Lisp_Object val;
7314
7315   CHECK_NUMBER (code);
7316   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7317   if (b1 == 0)
7318     {
7319       if (b2 >= 0x80)
7320         error ("Invalid BIG5 code: %x", XFASTINT (code));
7321       val = code;
7322     }
7323   else
7324     {
7325       if ((b1 < 0xA1 || b1 > 0xFE)
7326           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7327         error ("Invalid BIG5 code: %x", XFASTINT (code));
7328       DECODE_BIG5 (b1, b2, charset, c1, c2);
7329       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7330     }
7331   return val;
7332 }
7333
7334 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7335        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7336 Return the corresponding character code in Big5.  */)
7337      (ch)
7338      Lisp_Object ch;
7339 {
7340   int charset, c1, c2, b1, b2;
7341   Lisp_Object val;
7342
7343   CHECK_NUMBER (ch);
7344   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7345   if (charset == CHARSET_ASCII)
7346     {
7347       val = ch;
7348     }
7349   else if ((charset == charset_big5_1
7350             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7351            || (charset == charset_big5_2
7352                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7353     {
7354       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7355       XSETFASTINT (val, (b1 << 8) | b2);
7356     }
7357   else
7358     error ("Can't encode to Big5: %d", XFASTINT (ch));
7359   return val;
7360 }
7361 \f
7362 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7363        Sset_terminal_coding_system_internal, 1, 1, 0,
7364        doc: /* Internal use only.  */)
7365      (coding_system)
7366      Lisp_Object coding_system;
7367 {
7368   CHECK_SYMBOL (coding_system);
7369   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7370   /* We had better not send unsafe characters to terminal.  */
7371   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7372   /* Character composition should be disabled.  */
7373   terminal_coding.composing = COMPOSITION_DISABLED;
7374   /* Error notification should be suppressed.  */
7375   terminal_coding.suppress_error = 1;
7376   terminal_coding.src_multibyte = 1;
7377   terminal_coding.dst_multibyte = 0;
7378   return Qnil;
7379 }
7380
7381 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7382        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7383        doc: /* Internal use only.  */)
7384      (coding_system)
7385      Lisp_Object coding_system;
7386 {
7387   CHECK_SYMBOL (coding_system);
7388   setup_coding_system (Fcheck_coding_system (coding_system),
7389                        &safe_terminal_coding);
7390   /* Character composition should be disabled.  */
7391   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7392   /* Error notification should be suppressed.  */
7393   safe_terminal_coding.suppress_error = 1;
7394   safe_terminal_coding.src_multibyte = 1;
7395   safe_terminal_coding.dst_multibyte = 0;
7396   return Qnil;
7397 }
7398
7399 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7400        Sterminal_coding_system, 0, 0, 0,
7401        doc: /* Return coding system specified for terminal output.  */)
7402      ()
7403 {
7404   return terminal_coding.symbol;
7405 }
7406
7407 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7408        Sset_keyboard_coding_system_internal, 1, 1, 0,
7409        doc: /* Internal use only.  */)
7410      (coding_system)
7411      Lisp_Object coding_system;
7412 {
7413   CHECK_SYMBOL (coding_system);
7414   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7415   /* Character composition should be disabled.  */
7416   keyboard_coding.composing = COMPOSITION_DISABLED;
7417   return Qnil;
7418 }
7419
7420 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7421        Skeyboard_coding_system, 0, 0, 0,
7422        doc: /* Return coding system specified for decoding keyboard input.  */)
7423      ()
7424 {
7425   return keyboard_coding.symbol;
7426 }
7427
7428 \f
7429 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7430        Sfind_operation_coding_system,  1, MANY, 0,
7431        doc: /* Choose a coding system for an operation based on the target name.
7432 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7433 DECODING-SYSTEM is the coding system to use for decoding
7434 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7435 for encoding (in case OPERATION does encoding).
7436
7437 The first argument OPERATION specifies an I/O primitive:
7438   For file I/O, `insert-file-contents' or `write-region'.
7439   For process I/O, `call-process', `call-process-region', or `start-process'.
7440   For network I/O, `open-network-stream'.
7441
7442 The remaining arguments should be the same arguments that were passed
7443 to the primitive.  Depending on which primitive, one of those arguments
7444 is selected as the TARGET.  For example, if OPERATION does file I/O,
7445 whichever argument specifies the file name is TARGET.
7446
7447 TARGET has a meaning which depends on OPERATION:
7448   For file I/O, TARGET is a file name (except for the special case below).
7449   For process I/O, TARGET is a process name.
7450   For network I/O, TARGET is a service name or a port number
7451
7452 This function looks up what specified for TARGET in,
7453 `file-coding-system-alist', `process-coding-system-alist',
7454 or `network-coding-system-alist' depending on OPERATION.
7455 They may specify a coding system, a cons of coding systems,
7456 or a function symbol to call.
7457 In the last case, we call the function with one argument,
7458 which is a list of all the arguments given to this function.
7459
7460 If OPERATION is `insert-file-contents', the argument corresponding to
7461 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
7462 file name to look up, and BUFFER is a buffer that contains the file's
7463 contents (not yet decoded).  If `file-coding-system-alist' specifies a
7464 function to call for FILENAME, that function should examine the
7465 contents of BUFFER instead of reading the file.
7466
7467 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7468      (nargs, args)
7469      int nargs;
7470      Lisp_Object *args;
7471 {
7472   Lisp_Object operation, target_idx, target, val;
7473   register Lisp_Object chain;
7474
7475   if (nargs < 2)
7476     error ("Too few arguments");
7477   operation = args[0];
7478   if (!SYMBOLP (operation)
7479       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7480     error ("Invalid first argument");
7481   if (nargs < 1 + XINT (target_idx))
7482     error ("Too few arguments for operation: %s",
7483            SDATA (SYMBOL_NAME (operation)));
7484   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7485      argument to write-region) is string, it must be treated as a
7486      target file name.  */
7487   if (EQ (operation, Qwrite_region)
7488       && nargs > 5
7489       && STRINGP (args[5]))
7490     target_idx = make_number (4);
7491   target = args[XINT (target_idx) + 1];
7492   if (!(STRINGP (target)
7493         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7494             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7495         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7496     error ("Invalid argument %d", XINT (target_idx) + 1);
7497   if (CONSP (target))
7498     target = XCAR (target);
7499
7500   chain = ((EQ (operation, Qinsert_file_contents)
7501             || EQ (operation, Qwrite_region))
7502            ? Vfile_coding_system_alist
7503            : (EQ (operation, Qopen_network_stream)
7504               ? Vnetwork_coding_system_alist
7505               : Vprocess_coding_system_alist));
7506   if (NILP (chain))
7507     return Qnil;
7508
7509   for (; CONSP (chain); chain = XCDR (chain))
7510     {
7511       Lisp_Object elt;
7512       elt = XCAR (chain);
7513
7514       if (CONSP (elt)
7515           && ((STRINGP (target)
7516                && STRINGP (XCAR (elt))
7517                && fast_string_match (XCAR (elt), target) >= 0)
7518               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7519         {
7520           val = XCDR (elt);
7521           /* Here, if VAL is both a valid coding system and a valid
7522              function symbol, we return VAL as a coding system.  */
7523           if (CONSP (val))
7524             return val;
7525           if (! SYMBOLP (val))
7526             return Qnil;
7527           if (! NILP (Fcoding_system_p (val)))
7528             return Fcons (val, val);
7529           if (! NILP (Ffboundp (val)))
7530             {
7531               /* We use call1 rather than safe_call1
7532                  so as to get bug reports about functions called here
7533                  which don't handle the current interface.  */
7534               val = call1 (val, Flist (nargs, args));
7535               if (CONSP (val))
7536                 return val;
7537               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7538                 return Fcons (val, val);
7539             }
7540           return Qnil;
7541         }
7542     }
7543   return Qnil;
7544 }
7545
7546 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7547        Supdate_coding_systems_internal, 0, 0, 0,
7548        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7549 When values of any coding categories are changed, you must
7550 call this function.  */)
7551      ()
7552 {
7553   int i;
7554
7555   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7556     {
7557       Lisp_Object val;
7558
7559       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7560       if (!NILP (val))
7561         {
7562           if (! coding_system_table[i])
7563             coding_system_table[i] = ((struct coding_system *)
7564                                       xmalloc (sizeof (struct coding_system)));
7565           setup_coding_system (val, coding_system_table[i]);
7566         }
7567       else if (coding_system_table[i])
7568         {
7569           xfree (coding_system_table[i]);
7570           coding_system_table[i] = NULL;
7571         }
7572     }
7573
7574   return Qnil;
7575 }
7576
7577 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7578        Sset_coding_priority_internal, 0, 0, 0,
7579        doc: /* Update internal database for the current value of `coding-category-list'.
7580 This function is internal use only.  */)
7581      ()
7582 {
7583   int i = 0, idx;
7584   Lisp_Object val;
7585
7586   val = Vcoding_category_list;
7587
7588   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7589     {
7590       if (! SYMBOLP (XCAR (val)))
7591         break;
7592       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7593       if (idx >= CODING_CATEGORY_IDX_MAX)
7594         break;
7595       coding_priorities[i++] = (1 << idx);
7596       val = XCDR (val);
7597     }
7598   /* If coding-category-list is valid and contains all coding
7599      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7600      the following code saves Emacs from crashing.  */
7601   while (i < CODING_CATEGORY_IDX_MAX)
7602     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7603
7604   return Qnil;
7605 }
7606
7607 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7608        Sdefine_coding_system_internal, 1, 1, 0,
7609        doc: /* Register CODING-SYSTEM as a base coding system.
7610 This function is internal use only.  */)
7611      (coding_system)
7612      Lisp_Object coding_system;
7613 {
7614   Lisp_Object safe_chars, slot;
7615
7616   if (NILP (Fcheck_coding_system (coding_system)))
7617     xsignal1 (Qcoding_system_error, coding_system);
7618
7619   safe_chars = coding_safe_chars (coding_system);
7620   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7621     error ("No valid safe-chars property for %s",
7622            SDATA (SYMBOL_NAME (coding_system)));
7623
7624   if (EQ (safe_chars, Qt))
7625     {
7626       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7627         XSETCAR (Vcoding_system_safe_chars,
7628                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7629     }
7630   else
7631     {
7632       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7633       if (NILP (slot))
7634         XSETCDR (Vcoding_system_safe_chars,
7635                  nconc2 (XCDR (Vcoding_system_safe_chars),
7636                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7637       else
7638         XSETCDR (slot, safe_chars);
7639     }
7640   return Qnil;
7641 }
7642
7643 #endif /* emacs */
7644
7645 \f
7646 /*** 9. Post-amble ***/
7647
7648 void
7649 init_coding_once ()
7650 {
7651   int i;
7652
7653   /* Emacs' internal format specific initialize routine.  */
7654   for (i = 0; i <= 0x20; i++)
7655     emacs_code_class[i] = EMACS_control_code;
7656   emacs_code_class[0x0A] = EMACS_linefeed_code;
7657   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7658   for (i = 0x21 ; i < 0x7F; i++)
7659     emacs_code_class[i] = EMACS_ascii_code;
7660   emacs_code_class[0x7F] = EMACS_control_code;
7661   for (i = 0x80; i < 0xFF; i++)
7662     emacs_code_class[i] = EMACS_invalid_code;
7663   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7664   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7665   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7666   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7667
7668   /* ISO2022 specific initialize routine.  */
7669   for (i = 0; i < 0x20; i++)
7670     iso_code_class[i] = ISO_control_0;
7671   for (i = 0x21; i < 0x7F; i++)
7672     iso_code_class[i] = ISO_graphic_plane_0;
7673   for (i = 0x80; i < 0xA0; i++)
7674     iso_code_class[i] = ISO_control_1;
7675   for (i = 0xA1; i < 0xFF; i++)
7676     iso_code_class[i] = ISO_graphic_plane_1;
7677   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7678   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7679   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7680   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7681   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7682   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7683   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7684   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7685   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7686   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7687
7688   setup_coding_system (Qnil, &keyboard_coding);
7689   setup_coding_system (Qnil, &terminal_coding);
7690   setup_coding_system (Qnil, &safe_terminal_coding);
7691   setup_coding_system (Qnil, &default_buffer_file_coding);
7692
7693   bzero (coding_system_table, sizeof coding_system_table);
7694
7695   bzero (ascii_skip_code, sizeof ascii_skip_code);
7696   for (i = 0; i < 128; i++)
7697     ascii_skip_code[i] = 1;
7698
7699 #if defined (MSDOS) || defined (WINDOWSNT)
7700   system_eol_type = CODING_EOL_CRLF;
7701 #else
7702   system_eol_type = CODING_EOL_LF;
7703 #endif
7704
7705   inhibit_pre_post_conversion = 0;
7706 }
7707
7708 #ifdef emacs
7709
7710 void
7711 syms_of_coding ()
7712 {
7713   staticpro (&Vcode_conversion_workbuf_name);
7714   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7715
7716   Qtarget_idx = intern ("target-idx");
7717   staticpro (&Qtarget_idx);
7718
7719   Qcoding_system_history = intern ("coding-system-history");
7720   staticpro (&Qcoding_system_history);
7721   Fset (Qcoding_system_history, Qnil);
7722
7723   /* Target FILENAME is the first argument.  */
7724   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7725   /* Target FILENAME is the third argument.  */
7726   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7727
7728   Qcall_process = intern ("call-process");
7729   staticpro (&Qcall_process);
7730   /* Target PROGRAM is the first argument.  */
7731   Fput (Qcall_process, Qtarget_idx, make_number (0));
7732
7733   Qcall_process_region = intern ("call-process-region");
7734   staticpro (&Qcall_process_region);
7735   /* Target PROGRAM is the third argument.  */
7736   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7737
7738   Qstart_process = intern ("start-process");
7739   staticpro (&Qstart_process);
7740   /* Target PROGRAM is the third argument.  */
7741   Fput (Qstart_process, Qtarget_idx, make_number (2));
7742
7743   Qopen_network_stream = intern ("open-network-stream");
7744   staticpro (&Qopen_network_stream);
7745   /* Target SERVICE is the fourth argument.  */
7746   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7747
7748   Qcoding_system = intern ("coding-system");
7749   staticpro (&Qcoding_system);
7750
7751   Qeol_type = intern ("eol-type");
7752   staticpro (&Qeol_type);
7753
7754   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7755   staticpro (&Qbuffer_file_coding_system);
7756
7757   Qpost_read_conversion = intern ("post-read-conversion");
7758   staticpro (&Qpost_read_conversion);
7759
7760   Qpre_write_conversion = intern ("pre-write-conversion");
7761   staticpro (&Qpre_write_conversion);
7762
7763   Qno_conversion = intern ("no-conversion");
7764   staticpro (&Qno_conversion);
7765
7766   Qundecided = intern ("undecided");
7767   staticpro (&Qundecided);
7768
7769   Qcoding_system_p = intern ("coding-system-p");
7770   staticpro (&Qcoding_system_p);
7771
7772   Qcoding_system_error = intern ("coding-system-error");
7773   staticpro (&Qcoding_system_error);
7774
7775   Fput (Qcoding_system_error, Qerror_conditions,
7776         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7777   Fput (Qcoding_system_error, Qerror_message,
7778         build_string ("Invalid coding system"));
7779
7780   Qcoding_category = intern ("coding-category");
7781   staticpro (&Qcoding_category);
7782   Qcoding_category_index = intern ("coding-category-index");
7783   staticpro (&Qcoding_category_index);
7784
7785   Vcoding_category_table
7786     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7787   staticpro (&Vcoding_category_table);
7788   {
7789     int i;
7790     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7791       {
7792         XVECTOR (Vcoding_category_table)->contents[i]
7793           = intern (coding_category_name[i]);
7794         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7795               Qcoding_category_index, make_number (i));
7796       }
7797   }
7798
7799   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7800   staticpro (&Vcoding_system_safe_chars);
7801
7802   Qtranslation_table = intern ("translation-table");
7803   staticpro (&Qtranslation_table);
7804   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7805
7806   Qtranslation_table_id = intern ("translation-table-id");
7807   staticpro (&Qtranslation_table_id);
7808
7809   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7810   staticpro (&Qtranslation_table_for_decode);
7811
7812   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7813   staticpro (&Qtranslation_table_for_encode);
7814
7815   Qsafe_chars = intern ("safe-chars");
7816   staticpro (&Qsafe_chars);
7817
7818   Qchar_coding_system = intern ("char-coding-system");
7819   staticpro (&Qchar_coding_system);
7820
7821   /* Intern this now in case it isn't already done.
7822      Setting this variable twice is harmless.
7823      But don't staticpro it here--that is done in alloc.c.  */
7824   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7825   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7826   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7827
7828   Qvalid_codes = intern ("valid-codes");
7829   staticpro (&Qvalid_codes);
7830
7831   Qascii_incompatible = intern ("ascii-incompatible");
7832   staticpro (&Qascii_incompatible);
7833
7834   Qemacs_mule = intern ("emacs-mule");
7835   staticpro (&Qemacs_mule);
7836
7837   Qraw_text = intern ("raw-text");
7838   staticpro (&Qraw_text);
7839
7840   Qutf_8 = intern ("utf-8");
7841   staticpro (&Qutf_8);
7842
7843   Qcoding_system_define_form = intern ("coding-system-define-form");
7844   staticpro (&Qcoding_system_define_form);
7845
7846   defsubr (&Scoding_system_p);
7847   defsubr (&Sread_coding_system);
7848   defsubr (&Sread_non_nil_coding_system);
7849   defsubr (&Scheck_coding_system);
7850   defsubr (&Sdetect_coding_region);
7851   defsubr (&Sdetect_coding_string);
7852   defsubr (&Sfind_coding_systems_region_internal);
7853   defsubr (&Sunencodable_char_position);
7854   defsubr (&Sdecode_coding_region);
7855   defsubr (&Sencode_coding_region);
7856   defsubr (&Sdecode_coding_string);
7857   defsubr (&Sencode_coding_string);
7858   defsubr (&Sdecode_sjis_char);
7859   defsubr (&Sencode_sjis_char);
7860   defsubr (&Sdecode_big5_char);
7861   defsubr (&Sencode_big5_char);
7862   defsubr (&Sset_terminal_coding_system_internal);
7863   defsubr (&Sset_safe_terminal_coding_system_internal);
7864   defsubr (&Sterminal_coding_system);
7865   defsubr (&Sset_keyboard_coding_system_internal);
7866   defsubr (&Skeyboard_coding_system);
7867   defsubr (&Sfind_operation_coding_system);
7868   defsubr (&Supdate_coding_systems_internal);
7869   defsubr (&Sset_coding_priority_internal);
7870   defsubr (&Sdefine_coding_system_internal);
7871
7872   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7873                doc: /* List of coding systems.
7874
7875 Do not alter the value of this variable manually.  This variable should be
7876 updated by the functions `make-coding-system' and
7877 `define-coding-system-alias'.  */);
7878   Vcoding_system_list = Qnil;
7879
7880   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7881                doc: /* Alist of coding system names.
7882 Each element is one element list of coding system name.
7883 This variable is given to `completing-read' as TABLE argument.
7884
7885 Do not alter the value of this variable manually.  This variable should be
7886 updated by the functions `make-coding-system' and
7887 `define-coding-system-alias'.  */);
7888   Vcoding_system_alist = Qnil;
7889
7890   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7891                doc: /* List of coding-categories (symbols) ordered by priority.
7892
7893 On detecting a coding system, Emacs tries code detection algorithms
7894 associated with each coding-category one by one in this order.  When
7895 one algorithm agrees with a byte sequence of source text, the coding
7896 system bound to the corresponding coding-category is selected.
7897
7898 Don't modify this variable directly, but use `set-coding-priority'.  */);
7899   {
7900     int i;
7901
7902     Vcoding_category_list = Qnil;
7903     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7904       Vcoding_category_list
7905         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7906                  Vcoding_category_list);
7907   }
7908
7909   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7910                doc: /* Specify the coding system for read operations.
7911 It is useful to bind this variable with `let', but do not set it globally.
7912 If the value is a coding system, it is used for decoding on read operation.
7913 If not, an appropriate element is used from one of the coding system alists:
7914 There are three such tables, `file-coding-system-alist',
7915 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7916   Vcoding_system_for_read = Qnil;
7917
7918   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7919                doc: /* Specify the coding system for write operations.
7920 Programs bind this variable with `let', but you should not set it globally.
7921 If the value is a coding system, it is used for encoding of output,
7922 when writing it to a file and when sending it to a file or subprocess.
7923
7924 If this does not specify a coding system, an appropriate element
7925 is used from one of the coding system alists:
7926 There are three such tables, `file-coding-system-alist',
7927 `process-coding-system-alist', and `network-coding-system-alist'.
7928 For output to files, if the above procedure does not specify a coding system,
7929 the value of `buffer-file-coding-system' is used.  */);
7930   Vcoding_system_for_write = Qnil;
7931
7932   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7933                doc: /* Coding system used in the latest file or process I/O.
7934 Also set by `encode-coding-region', `decode-coding-region',
7935 `encode-coding-string' and `decode-coding-string'.  */);
7936   Vlast_coding_system_used = Qnil;
7937
7938   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7939                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7940 See info node `Coding Systems' and info node `Text and Binary' concerning
7941 such conversion.  */);
7942   inhibit_eol_conversion = 0;
7943
7944   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7945                doc: /* Non-nil means process buffer inherits coding system of process output.
7946 Bind it to t if the process output is to be treated as if it were a file
7947 read from some filesystem.  */);
7948   inherit_process_coding_system = 0;
7949
7950   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7951                doc: /* Alist to decide a coding system to use for a file I/O operation.
7952 The format is ((PATTERN . VAL) ...),
7953 where PATTERN is a regular expression matching a file name,
7954 VAL is a coding system, a cons of coding systems, or a function symbol.
7955 If VAL is a coding system, it is used for both decoding and encoding
7956 the file contents.
7957 If VAL is a cons of coding systems, the car part is used for decoding,
7958 and the cdr part is used for encoding.
7959 If VAL is a function symbol, the function must return a coding system
7960 or a cons of coding systems which are used as above.  The function is
7961 called with an argument that is a list of the arguments with which
7962 `find-operation-coding-system' was called.
7963
7964 See also the function `find-operation-coding-system'
7965 and the variable `auto-coding-alist'.  */);
7966   Vfile_coding_system_alist = Qnil;
7967
7968   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7969     doc: /* Alist to decide a coding system to use for a process I/O operation.
7970 The format is ((PATTERN . VAL) ...),
7971 where PATTERN is a regular expression matching a program name,
7972 VAL is a coding system, a cons of coding systems, or a function symbol.
7973 If VAL is a coding system, it is used for both decoding what received
7974 from the program and encoding what sent to the program.
7975 If VAL is a cons of coding systems, the car part is used for decoding,
7976 and the cdr part is used for encoding.
7977 If VAL is a function symbol, the function must return a coding system
7978 or a cons of coding systems which are used as above.
7979
7980 See also the function `find-operation-coding-system'.  */);
7981   Vprocess_coding_system_alist = Qnil;
7982
7983   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7984     doc: /* Alist to decide a coding system to use for a network I/O operation.
7985 The format is ((PATTERN . VAL) ...),
7986 where PATTERN is a regular expression matching a network service name
7987 or is a port number to connect to,
7988 VAL is a coding system, a cons of coding systems, or a function symbol.
7989 If VAL is a coding system, it is used for both decoding what received
7990 from the network stream and encoding what sent to the network stream.
7991 If VAL is a cons of coding systems, the car part is used for decoding,
7992 and the cdr part is used for encoding.
7993 If VAL is a function symbol, the function must return a coding system
7994 or a cons of coding systems which are used as above.
7995
7996 See also the function `find-operation-coding-system'.  */);
7997   Vnetwork_coding_system_alist = Qnil;
7998
7999   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8000                doc: /* Coding system to use with system messages.
8001 Also used for decoding keyboard input on X Window system.  */);
8002   Vlocale_coding_system = Qnil;
8003
8004   /* The eol mnemonics are reset in startup.el system-dependently.  */
8005   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8006                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8007   eol_mnemonic_unix = build_string (":");
8008
8009   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8010                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8011   eol_mnemonic_dos = build_string ("\\");
8012
8013   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8014                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8015   eol_mnemonic_mac = build_string ("/");
8016
8017   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8018                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8019   eol_mnemonic_undecided = build_string (":");
8020
8021   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8022                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8023   Venable_character_translation = Qt;
8024
8025   DEFVAR_LISP ("standard-translation-table-for-decode",
8026                &Vstandard_translation_table_for_decode,
8027                doc: /* Table for translating characters while decoding.  */);
8028   Vstandard_translation_table_for_decode = Qnil;
8029
8030   DEFVAR_LISP ("standard-translation-table-for-encode",
8031                &Vstandard_translation_table_for_encode,
8032                doc: /* Table for translating characters while encoding.  */);
8033   Vstandard_translation_table_for_encode = Qnil;
8034
8035   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8036                doc: /* Alist of charsets vs revision numbers.
8037 While encoding, if a charset (car part of an element) is found,
8038 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8039   Vcharset_revision_alist = Qnil;
8040
8041   DEFVAR_LISP ("default-process-coding-system",
8042                &Vdefault_process_coding_system,
8043                doc: /* Cons of coding systems used for process I/O by default.
8044 The car part is used for decoding a process output,
8045 the cdr part is used for encoding a text to be sent to a process.  */);
8046   Vdefault_process_coding_system = Qnil;
8047
8048   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8049                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8050 This is a vector of length 256.
8051 If Nth element is non-nil, the existence of code N in a file
8052 \(or output of subprocess) doesn't prevent it to be detected as
8053 a coding system of ISO 2022 variant which has a flag
8054 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8055 or reading output of a subprocess.
8056 Only 128th through 159th elements has a meaning.  */);
8057   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8058
8059   DEFVAR_LISP ("select-safe-coding-system-function",
8060                &Vselect_safe_coding_system_function,
8061                doc: /* Function to call to select safe coding system for encoding a text.
8062
8063 If set, this function is called to force a user to select a proper
8064 coding system which can encode the text in the case that a default
8065 coding system used in each operation can't encode the text.
8066
8067 The default value is `select-safe-coding-system' (which see).  */);
8068   Vselect_safe_coding_system_function = Qnil;
8069
8070   DEFVAR_BOOL ("coding-system-require-warning",
8071                &coding_system_require_warning,
8072                doc: /* Internal use only.
8073 If non-nil, on writing a file, `select-safe-coding-system-function' is
8074 called even if `coding-system-for-write' is non-nil.  The command
8075 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8076   coding_system_require_warning = 0;
8077
8078
8079   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8080                &inhibit_iso_escape_detection,
8081                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8082
8083 By default, on reading a file, Emacs tries to detect how the text is
8084 encoded.  This code detection is sensitive to escape sequences.  If
8085 the sequence is valid as ISO2022, the code is determined as one of
8086 the ISO2022 encodings, and the file is decoded by the corresponding
8087 coding system (e.g. `iso-2022-7bit').
8088
8089 However, there may be a case that you want to read escape sequences in
8090 a file as is.  In such a case, you can set this variable to non-nil.
8091 Then, as the code detection ignores any escape sequences, no file is
8092 detected as encoded in some ISO2022 encoding.  The result is that all
8093 escape sequences become visible in a buffer.
8094
8095 The default value is nil, and it is strongly recommended not to change
8096 it.  That is because many Emacs Lisp source files that contain
8097 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8098 in Emacs's distribution, and they won't be decoded correctly on
8099 reading if you suppress escape sequence detection.
8100
8101 The other way to read escape sequences in a file without decoding is
8102 to explicitly specify some coding system that doesn't use ISO2022's
8103 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8104   inhibit_iso_escape_detection = 0;
8105
8106   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8107                doc: /* Char table for translating self-inserting characters.
8108 This is applied to the result of input methods, not their input.  See also
8109 `keyboard-translate-table'.  */);
8110     Vtranslation_table_for_input = Qnil;
8111 }
8112
8113 char *
8114 emacs_strerror (error_number)
8115      int error_number;
8116 {
8117   char *str;
8118
8119   synchronize_system_messages_locale ();
8120   str = strerror (error_number);
8121
8122   if (! NILP (Vlocale_coding_system))
8123     {
8124       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8125                                                       Vlocale_coding_system,
8126                                                       0);
8127       str = (char *) SDATA (dec);
8128     }
8129
8130   return str;
8131 }
8132
8133 #endif /* emacs */
8134
8135 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8136    (do not change this comment) */