src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998, 2002 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
 170   DST_BYTES zero means that the source area and destination area are
 171   overlapped, which means that we can produce encoded text until it
 172   reaches the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
 188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 189    get one and two bytes from the source text respectively.  If there
 190    are not enough bytes in the source, they set coding->result to
 191    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.  The
 192    caller should set variables `coding', `src', and `src_end' to
 193    appropriate pointers in advance.  These macros are called from
 194    decoding routines `decode_coding_XXX', so the source is assumed unibyte.  */
 195
 196 #define ONE_MORE_BYTE(c1)                                       \
 197   do {                                                          \
 198     if (src >= src_end)                                         \
 199       {                                                         \
 200         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 201         goto label_end_of_loop;                                 \
 202       }                                                         \
 203     c1 = *src++;                                                \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src + 1 >= src_end)    /* fewer than 2 bytes left */    \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = *src++;                                                \
 214     c2 = *src++;                                                \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but if MULTIBYTEP is nonzero, a byte following
 219    LEADING_CODE_8_BIT_CONTROL is also read and C1 is set to it minus 0x20.  */
 220
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 230       c1 = *src++ - 0x20;   /* 2nd byte; no src_end check */    \
 231   } while (0)
 232
 233 /* Set C to the next character at the source text pointed by `src'.
 234    If there are not enough characters in the source, jump to
 235    `label_end_of_loop'.  The caller should set variables `coding',
 236    `src', `src_end', and `translation_table' to appropriate values
 237    in advance.  This macro is used in encoding routines
 238    `encode_coding_XXX', thus it assumes that the source text is in
 239    multibyte form except for 8-bit characters.  8-bit characters are
 240    in multibyte form if coding->src_multibyte is nonzero, else they
 241    are represented by a single byte.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;        /* take one raw byte */       \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  Jump to
 264    `label_end_of_loop' if there's not enough space at `dst'; the limit
 265    is `dst_end', or `src' when `dst_bytes' is zero (overlapping areas).
 266
 267    If we are now in the middle of a composition sequence, the decoded
 268    character may be ALTCHAR (for the current composition).  In that case,
 269    the character goes to coding->cmp_data->data instead of `dst'.
 270
 271    This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298 /* Store one byte C at `dst'; jump to `label_end_of_loop' if no room.  */
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {  /* Store C1 then C2 at `dst', only if both fit.  */     \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {  /* Copy bytes FROM..TO (exclusive) to `dst'.  */        \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)     /* FROM itself is advanced to TO */   \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348
 349 #else  /* not emacs */
 350
 351 #include "mulelib.h"
 352
 353 #endif /* not emacs */
 354
 355 Lisp_Object Qcoding_system, Qeol_type;
 356 Lisp_Object Qbuffer_file_coding_system;
 357 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 358 Lisp_Object Qno_conversion, Qundecided;
 359 Lisp_Object Qcoding_system_history;
 360 Lisp_Object Qsafe_chars;
 361 Lisp_Object Qvalid_codes;
 362
 363 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 364 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 365 Lisp_Object Qstart_process, Qopen_network_stream;
 366 Lisp_Object Qtarget_idx;
 367
 368 Lisp_Object Vselect_safe_coding_system_function;
 369
 370 /* Mnemonic string for each format of end-of-line.  */
 371 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 372 /* Mnemonic string to indicate format of end-of-line is not yet
 373    decided.  */
 374 Lisp_Object eol_mnemonic_undecided;
 375
 376 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 377    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 378 int system_eol_type;
 379
 380 #ifdef emacs
 381
 382 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 383
 384 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 385
 386 /* Coding system emacs-mule and raw-text are for converting only
 387    end-of-line format.  */
 388 Lisp_Object Qemacs_mule, Qraw_text;
 389
 390 /* Coding-systems are handed between Emacs Lisp programs and C internal
 391    routines by the following three variables.  */
 392 /* Coding-system for reading files and receiving data from process.  */
 393 Lisp_Object Vcoding_system_for_read;
 394 /* Coding-system for writing files and sending data to process.  */
 395 Lisp_Object Vcoding_system_for_write;
 396 /* Coding-system actually used in the latest I/O.  */
 397 Lisp_Object Vlast_coding_system_used;
 398
 399 /* A vector of length 256 which contains information about special
 400    Latin codes (especially for dealing with Microsoft codes).  */
 401 Lisp_Object Vlatin_extra_code_table;
 402
 403 /* Flag to inhibit code conversion of end-of-line format.  */
 404 int inhibit_eol_conversion;
 405
 406 /* Flag to inhibit ISO2022 escape sequence detection.  */
 407 int inhibit_iso_escape_detection;
 408
 409 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 410 int inherit_process_coding_system;
 411
 412 /* Coding system to be used to encode text for terminal display.  */
 413 struct coding_system terminal_coding;
 414
 415 /* Coding system to be used to encode text for terminal display when
 416    terminal coding system is nil.  */
 417 struct coding_system safe_terminal_coding;
 418
 419 /* Coding system of what is sent from terminal keyboard.  */
 420 struct coding_system keyboard_coding;
 421
 422 /* Default coding system to be used to write a file.  */
 423 struct coding_system default_buffer_file_coding;
 424
 425 Lisp_Object Vfile_coding_system_alist;
 426 Lisp_Object Vprocess_coding_system_alist;
 427 Lisp_Object Vnetwork_coding_system_alist;
 428
 429 Lisp_Object Vlocale_coding_system;
 430
 431 #endif /* emacs */
 432
 433 Lisp_Object Qcoding_category, Qcoding_category_index;
 434
 435 /* List of symbols `coding-category-xxx' ordered by priority.  */
 436 Lisp_Object Vcoding_category_list;
 437
 438 /* Table of coding categories (Lisp symbols).  */
 439 Lisp_Object Vcoding_category_table;
 440
 441 /* Table of the symbol name of each coding category.  */
 442 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 443   "coding-category-emacs-mule",
 444   "coding-category-sjis",
 445   "coding-category-iso-7",
 446   "coding-category-iso-7-tight",
 447   "coding-category-iso-8-1",
 448   "coding-category-iso-8-2",
 449   "coding-category-iso-7-else",
 450   "coding-category-iso-8-else",
 451   "coding-category-ccl",
 452   "coding-category-big5",
 453   "coding-category-utf-8",
 454   "coding-category-utf-16-be",
 455   "coding-category-utf-16-le",
 456   "coding-category-raw-text",
 457   "coding-category-binary"
 458 };
 459
 460 /* Table of pointers to coding systems corresponding to each coding
 461    categories.  */
 462 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 463
 464 /* Table of coding category masks.  Nth element is a mask for a coding
 465    category of which priority is Nth.  */
 466 static
 467 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 468
 469 /* Flag to tell if we look up translation table on character code
 470    conversion.  */
 471 Lisp_Object Venable_character_translation;
 472 /* Standard translation table to look up on decoding (reading).  */
 473 Lisp_Object Vstandard_translation_table_for_decode;
 474 /* Standard translation table to look up on encoding (writing).  */
 475 Lisp_Object Vstandard_translation_table_for_encode;
 476
 477 Lisp_Object Qtranslation_table;
 478 Lisp_Object Qtranslation_table_id;
 479 Lisp_Object Qtranslation_table_for_decode;
 480 Lisp_Object Qtranslation_table_for_encode;
 481
 482 /* Alist of charsets vs revision number.  */
 483 Lisp_Object Vcharset_revision_alist;
 484
 485 /* Default coding systems used for process I/O.  */
 486 Lisp_Object Vdefault_process_coding_system;
 487
 488 /* Char table for translating Quail and self-inserting input.  */
 489 Lisp_Object Vtranslation_table_for_input;
 490
 491 /* Global flag to tell that we can't call post-read-conversion and
 492    pre-write-conversion functions.  Usually the value is zero, but it
 493    is set to 1 temporarily while such functions are running.  This is
 494    to avoid infinite recursive call.  */
 495 static int inhibit_pre_post_conversion;
 496
 497 /* Char-table containing safe coding systems of each character.  */
 498 Lisp_Object Vchar_coding_system_table;
 499 Lisp_Object Qchar_coding_system;
 500
 501 /* Return the `safe-chars' char-table of coding system CODING, or t
 502    if that property is not a char-table.  Don't check validity of CODING.  */
 503
 504 Lisp_Object
 505 coding_safe_chars (coding)
 506      struct coding_system *coding;
 507 {
 508   Lisp_Object coding_spec, plist, safe_chars;
 509
 510   coding_spec = Fget (coding->symbol, Qcoding_system);
 511   plist = XVECTOR (coding_spec)->contents[3];
 512   safe_chars = Fplist_get (plist, Qsafe_chars);
 513   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 514 }
 515
 516 #define CODING_SAFE_CHAR_P(safe_chars, c) /* t: every C is safe */ \
 517   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 518
 519 \f
 520 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 521
 522 /* Emacs' internal format for representation of multiple character
 523    sets is a kind of multi-byte encoding, i.e. characters are
 524    represented by variable-length sequences of one-byte codes.
 525
 526    ASCII characters and control characters (e.g. `tab', `newline') are
 527    represented by one-byte sequences which are their ASCII codes, in
 528    the range 0x00 through 0x7F.
 529
 530    8-bit characters of the range 0x80..0x9F are represented by
 531    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 532    code + 0x20).
 533
 534    8-bit characters of the range 0xA0..0xFF are represented by
 535    one-byte sequences which are their 8-bit code.
 536
 537    The other characters are represented by a sequence of `base
 538    leading-code', optional `extended leading-code', and one or two
 539    `position-code's.  The length of the sequence is determined by the
 540    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 541    whereas extended leading-code and position-code take the range 0xA0
 542    through 0xFF.  See `charset.h' for more details about leading-code
 543    and position-code.
 544
 545    --- CODE RANGE of Emacs' internal format ---
 546    character set        range
 547    -------------        -----
 548    ascii                0x00..0x7F
 549    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 550    eight-bit-graphic    0xA0..0xFF
 551    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 552    ---------------------------------------------
 553
 554    As this is the internal character representation, the format is
 555    usually not used externally (i.e. in a file or in a data sent to a
 556    process).  But, it is possible to have a text externally in this
 557    format (i.e. by encoding by the coding system `emacs-mule').
 558
 559    In that case, a sequence of one-byte codes has a slightly different
 560    form.
 561
 562    Firstly, all characters in eight-bit-control are represented by
 563    one-byte sequences which are their 8-bit code.
 564
 565    Next, character composition data are represented by the byte
 566    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 567    where,
 568         METHOD is 0xF0 plus one of composition method (enum
 569         composition_method),
 570
 571         BYTES is 0xA0 plus the byte length of these composition data,
 572
 573         CHARS is 0xA0 plus the number of characters composed by these
 574         data,
 575
 576         COMPONENTs are characters of multibyte form or composition
 577         rules encoded by two-byte of ASCII codes.
 578
 579    In addition, for backward compatibility, the following formats are
 580    also recognized as composition data on decoding.
 581
 582    0x80 MSEQ ...
 583    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 584
 585    Here,
 586         MSEQ is a multibyte form but in this special format:
 587           ASCII: 0xA0 ASCII_CODE+0x80,
 588           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 589         RULE is a one byte code of the range 0xA0..0xF0 that
 590         represents a composition rule.
 591   */
 592
 593 enum emacs_code_class_type emacs_code_class[256];
 594
 595 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 596    Check if a text is encoded in Emacs' internal format.  If it is,
 597    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 598
 599 static int
 600 detect_coding_emacs_mule (src, src_end, multibytep)
 601       unsigned char *src, *src_end;
 602       int multibytep;
 603 {
 604   unsigned char c;
 605   int composing = 0;
 606   /* Dummy; ONE_MORE_BYTE_CHECK_MULTIBYTE stores into coding->result.  */
 607   struct coding_system dummy_coding;
 608   struct coding_system *coding = &dummy_coding;
 609
 610   while (1)
 611     {
 612       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); /* Exits loop at end.  */
 613
 614       if (composing)
 615         {
 616           if (c < 0xA0)
 617             composing = 0;      /* Not a component byte.  */
 618           else if (c == 0xA0)
 619             {
 620               /* 0xA0 introduces an ASCII component (ASCII_CODE+0x80).  */
 621               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 622               c &= 0x7F;
 623             }
 624           else
 625             c -= 0x20;          /* Component leading code is stored +0x20.  */
 626         }
 627
 628       if (c < 0x20)
 629         {
 630           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 631             return 0;           /* These controls rule out emacs-mule.  */
 632         }
 633       else if (c >= 0x80 && c < 0xA0)
 634         {
 635           if (c == 0x80)
 636             /* Old leading code for a composite character.  */
 637             composing = 1;
 638           else
 639             {
 640               unsigned char *src_base = src - 1; /* Start of C if 1 byte.  */
 641               int bytes;
 642
 643               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 644                                                bytes))
 645                 return 0;
 646               src = src_base + bytes;   /* Skip the BYTES-long sequence.  */
 647             }
 648         }
 649     }
 650  label_end_of_loop:
 651   return CODING_CATEGORY_MASK_EMACS_MULE;
 652 }
 652
 653
 654 /* Record the starting position START and METHOD of one composition.  */
 655
 656 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 657   do {                                                          \
 658     struct composition_data *cmp_data = coding->cmp_data;       \
 659     int *data = cmp_data->data + cmp_data->used;                \
 660     coding->cmp_data_start = cmp_data->used;                    \
 661     data[0] = -1;                                               \
 662     data[1] = cmp_data->char_offset + start;                    \
 663     data[3] = (int) method;                                     \
 664     cmp_data->used += 4;                                        \
 665   } while (0)
 666
 667 /* Record the ending position END of the current composition.  */
 668
 669 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 670   do {                                                          \
 671     struct composition_data *cmp_data = coding->cmp_data;       \
 672     int *data = cmp_data->data + coding->cmp_data_start;        \
 673     data[0] = cmp_data->used - coding->cmp_data_start;          \
 674     data[2] = cmp_data->char_offset + end;                      \
 675   } while (0)
 676
 677 /* Record one COMPONENT (alternate character or composition rule).  */
 678
 679 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
 680   (coding->cmp_data->data[coding->cmp_data->used++] = component)
 681
 682
 683 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 684    is not less than SRC_END, return -1 without incrementing Src.  */
 685
 686 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 687
 688
 689 /* Decode a character represented as a component of composition
 690    sequence of Emacs 20 style at SRC.  Set C to that character, store
 691    its multibyte form sequence at P, and set P to the end of that
 692    sequence.  If no valid character is found, set C to -1.  */
 693
     /* Implementation notes for DECODE_EMACS_MULE_COMPOSITION_CHAR.
        Bytes are fetched with SAFE_ONE_MORE_BYTE.  If the first fetch
        yields a negative value the macro stops at once and C keeps
        that negative value.  Otherwise the first byte selects a case:
        - a byte for which CHAR_HEAD_P holds is rejected (C = -1);
        - 0xA0 introduces one more byte, which must be >= 0xA0; that
          byte minus 0xA0 becomes C and is also stored at P;
        - a byte for which BASE_LEADING_CODE_P holds after 0x20 is
          subtracted is stored at P in that reduced form, followed by
          up to BYTES_BY_CHAR_HEAD (C) - 1 further bytes (copying
          stops early on a negative fetch).  The copied bytes are
          accepted only if UNIBYTE_STR_AS_MULTIBYTE_P holds for them,
          and then C is set from STRING_CHAR; otherwise C = -1;
        - any other byte gives C = -1.
        P is advanced past every byte stored, even when C ends up -1.
        C and P are evaluated several times, so both must be plain
        lvalues without side effects.  */
 694 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 695   do {                                                          \
 696     int bytes;                                                  \
 697                                                                 \
 698     c = SAFE_ONE_MORE_BYTE ();                                  \
 699     if (c < 0)                                                  \
 700       break;                                                    \
 701     if (CHAR_HEAD_P (c))                                        \
 702       c = -1;                                                   \
 703     else if (c == 0xA0)                                         \
 704       {                                                         \
 705         c = SAFE_ONE_MORE_BYTE ();                              \
 706         if (c < 0xA0)                                           \
 707           c = -1;                                               \
 708         else                                                    \
 709           {                                                     \
 710             c -= 0xA0;                                          \
 711             *p++ = c;                                           \
 712           }                                                     \
 713       }                                                         \
 714     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 715       {                                                         \
 716         unsigned char *p0 = p;                                  \
 717                                                                 \
 718         c -= 0x20;                                              \
 719         *p++ = c;                                               \
 720         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 721         while (--bytes)                                         \
 722           {                                                     \
 723             c = SAFE_ONE_MORE_BYTE ();                          \
 724             if (c < 0)                                          \
 725               break;                                            \
 726             *p++ = c;                                           \
 727           }                                                     \
 728         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes))     \
 729           c = STRING_CHAR (p0, bytes);                          \
 730         else                                                    \
 731           c = -1;                                               \
 732       }                                                         \
 733     else                                                        \
 734       c = -1;                                                   \
 735   } while (0)
 736
 737
 738 /* Read one composition rule, a component of an Emacs 20 style
 739    composition sequence, at SRC.  Set C to the encoded rule.  If no
 740    valid rule is found, set C to -1.  The variables GREF and NREF of
        the enclosing scope are overwritten when a rule is decoded.  */
 741
 742 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 743   do {                                                  \
 744     c = SAFE_ONE_MORE_BYTE ();                          \
 745     c -= 0xA0;                                          \
 746     if (0 <= c && c <= 80)                              \
 747       {                                                 \
 748         /* The rule byte packs GREF * 9 + NREF.  */     \
 749         gref = c / 9;                                   \
             nref = c % 9;                                   \
 750         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 751       }                                                 \
 752     else                                                \
 753       c = -1;                                           \
     } while (0)
 754
 755
 756 /* Decode composition sequence encoded by `emacs-mule' at the source
 757    pointed by SRC.  SRC_END is the end of source.  Store information
 758    of the composition in CODING->cmp_data.
 759
 760    For backward compatibility, decode also a composition sequence of
 761    Emacs 20 style.  In that case, the composition sequence contains
 762    characters that should be extracted into a buffer or string.  Store
 763    those characters at *DESTINATION in multibyte form, and advance
        *DESTINATION past them.  DST_END is the end of the destination;
        when DST_BYTES is zero, the current source position is used as
        the destination limit instead of DST_END.
 764
 765    If we encounter an invalid byte sequence, return 0.
 766    If we encounter an insufficient source or destination, or
 767    insufficient space in CODING->cmp_data, return -1.
 768    Otherwise, return consumed bytes in the source.
 769
 770 */
 771 static INLINE int
 772 decode_composition_emacs_mule (coding, src, src_end,
 773                                destination, dst_end, dst_bytes)
 774      struct coding_system *coding;
 775      unsigned char *src, *src_end, **destination, *dst_end;
 776      int dst_bytes;
 777 {
 778   unsigned char *dst = *destination;
 779   int method, data_len, nchars;
       /* SRC_BASE stays on the first byte of the sequence (the caller
          passes the position of the 0x80 byte); SRC steps past it.  */
 780   unsigned char *src_base = src++;
 781   /* Store components of composition.  */
 782   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 783   int ncomponent;
 784   /* Store multibyte form of characters to be composed.  This is for
 785      Emacs 20 style composition sequence.  */
 786   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 787   unsigned char *bufp = buf;
 788   int c, i, gref, nref;
 789
       /* Refuse to start unless CODING->cmp_data can still take one
          bunch of maximum length.  */
 790   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 791       >= COMPOSITION_DATA_SIZE)
 792     {
 793       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 794       return -1;
 795     }
 796
       /* NOTE(review): ONE_MORE_BYTE presumably jumps to
          label_end_of_loop when the source is exhausted -- confirm.  */
 797   ONE_MORE_BYTE (c);
 798   if (c - 0xF0 >= COMPOSITION_RELATIVE
 799            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 800     {
           /* Current format: 0x80, 0xF0 + METHOD, 0xA0 + byte length of
              the whole sequence counted from the 0x80 byte, 0xA0 +
              number of composed characters, then the components.  */
 801       int with_rule;
 802
 803       method = c - 0xF0;
 804       with_rule = (method == COMPOSITION_WITH_RULE
 805                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 806       ONE_MORE_BYTE (c);
 807       data_len = c - 0xA0;
 808       if (data_len < 4
 809           || src_base + data_len > src_end)
 810         return 0;
 811       ONE_MORE_BYTE (c);
 812       nchars = c - 0xA0;
           /* A composition covers at least one character.  Test the
              decoded count, not the raw byte: the raw byte is below 1
              only when it is 0, so a byte of 0xA0 or less would let a
              zero or negative count through.  */
 813       if (nchars < 1)
 814         return 0;
 815       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 816         {
 817           /* If it is longer than this, it can't be valid.  */
 818           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 819             return 0;
 820
               /* With a rule-based method the odd-numbered components
                  are rules, stored as the two bytes 0x20 + GREF and
                  0x20 + NREF.  */
 821           if (ncomponent % 2 && with_rule)
 822             {
 823               ONE_MORE_BYTE (gref);
 824               gref -= 32;
 825               ONE_MORE_BYTE (nref);
 826               nref -= 32;
 827               c = COMPOSITION_ENCODE_RULE (gref, nref);
 828             }
 829           else
 830             {
                   /* A character component; a byte that does not start
                      an accepted multibyte sequence is taken as is.  */
 831               int bytes;
 832               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 833                 c = STRING_CHAR (src, bytes);
 834               else
 835                 c = *src, bytes = 1;
 836               src += bytes;
 837             }
 838           component[ncomponent] = c;
 839         }
 840     }
 841   else
 842     {
 843       /* This may be an old Emacs 20 style format.  See the comment at
 844          the section 2 of this file.  */
           /* The sequence extends up to the next character head.  */
 845       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
           /* No character head in this block and more input may follow:
              report insufficient source.  */
 846       if (src == src_end
 847           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 848         goto label_end_of_loop;
 849
           /* Re-read the sequence from just after the 0x80 byte, with the
              position found above acting as the end of source.  */
 850       src_end = src;
 851       src = src_base + 1;
 852       if (c < 0xC0)
 853         {
 854           method = COMPOSITION_RELATIVE;
 855           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 856             {
 857               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 858               if (c < 0)
 859                 break;
 860               component[ncomponent++] = c;
 861             }
 862           if (ncomponent < 2)
 863             return 0;
 864           nchars = ncomponent;
 865         }
 866       else if (c == 0xFF)
 867         {
               /* 0xFF marks a rule-based sequence: char, then pairs of
                  rule and char.  Skip the marker byte itself.  */
 868           method = COMPOSITION_WITH_RULE;
 869           src++;
 870           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 871           if (c < 0)
 872             return 0;
 873           component[0] = c;
 874           for (ncomponent = 1;
 875                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 876             {
 877               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 878               if (c < 0)
 879                 break;
 880               component[ncomponent++] = c;
 881               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 882               if (c < 0)
 883                 break;
 884               component[ncomponent++] = c;
 885             }
 886           if (ncomponent < 3)
 887             return 0;
 888           nchars = (ncomponent + 1) / 2;
 889         }
 890       else
 891         return 0;
 892     }
 893
       /* Record the composition only if the characters extracted from an
          Emacs 20 style sequence (if any) fit in the destination.  */
 894   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 895     {
 896       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 897       for (i = 0; i < ncomponent; i++)
 898         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 899       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 900       if (buf < bufp)
 901         {
               /* NOTE(review): EMIT_BYTES presumably stores through the
                  local DST; the caller's pointer is advanced separately
                  on the line after it -- confirm.  */
 902           unsigned char *p = buf;
 903           EMIT_BYTES (p, bufp);
 904           *destination += bufp - buf;
 905           coding->produced_char += nchars;
 906         }
 907       return (src - src_base);
 908     }
 909  label_end_of_loop:
 910   return -1;
 911 }
 912
 913 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode the SRC_BYTES bytes of emacs-mule text at SOURCE into
        DESTINATION, converting end-of-line according to
        CODING->eol_type and handing sequences that start with 0x80 to
        decode_composition_emacs_mule when CODING->cmp_data is set.
        On return CODING->consumed and CODING->consumed_char hold the
        source offset of the item being handled when the loop stopped,
        CODING->produced holds the number of bytes stored, and
        CODING->produced_char the number of characters stored.
        CODING->result is set here when an inconsistent end-of-line or
        a lack of destination space stops the conversion.  */
 914
 915 static void
 916 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 917      struct coding_system *coding;
 918      unsigned char *source, *destination;
 919      int src_bytes, dst_bytes;
 920 {
 921   unsigned char *src = source;
 922   unsigned char *src_end = source + src_bytes;
 923   unsigned char *dst = destination;
 924   unsigned char *dst_end = destination + dst_bytes;
 925   /* SRC_BASE remembers the start position in source in each loop.
 926      The loop will be exited when there's not enough source code, or
 927      when there's not enough destination area to produce a
 928      character.  */
 929   unsigned char *src_base;
 930
 931   coding->produced_char = 0;
 932   while ((src_base = src) < src_end)
 933     {
 934       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 935       int bytes;
 936
 937       if (*src == '\r')
 938         {
               /* CR: with the CR convention it becomes LF; with CRLF it
                  is dropped when LF follows; otherwise it is kept.  */
 939           int c = *src++;
 940
 941           if (coding->eol_type == CODING_EOL_CR)
 942             c = '\n';
 943           else if (coding->eol_type == CODING_EOL_CRLF)
 944             {
 945               ONE_MORE_BYTE (c);
 946               if (c != '\n')
 947                 {
                       /* Not a CRLF pair: put the byte back and emit
                          the CR itself.  */
 948                   src--;
 949                   c = '\r';
 950                 }
 951             }
 952           *dst++ = c;
 953           coding->produced_char++;
 954           continue;
 955         }
 956       else if (*src == '\n')
 957         {
               /* A bare LF contradicts a CR or CRLF convention; stop if
                  the caller asked for that to be reported.  */
 958           if ((coding->eol_type == CODING_EOL_CR
 959                || coding->eol_type == CODING_EOL_CRLF)
 960               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 961             {
 962               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 963               goto label_end_of_loop;
 964             }
 965           *dst++ = *src++;
 966           coding->produced_char++;
 967           continue;
 968         }
 969       else if (*src == 0x80 && coding->cmp_data)
 970         {
 971           /* Start of composition data.  */
 972           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
 973                                                          &dst, dst_end,
 974                                                          dst_bytes);
               /* Negative: insufficient source, destination or
                  composition space, so stop here.  Positive: that many
                  source bytes were handled.  Zero: the sequence is
                  invalid, so the 0x80 byte is converted by CHAR_STRING
                  like any other stray byte.  */
 975           if (consumed < 0)
 976             goto label_end_of_loop;
 977           else if (consumed > 0)
 978             {
 979               src += consumed;
 980               continue;
 981             }
 982           bytes = CHAR_STRING (*src, tmp);
 983           p = tmp;
 984           src++;
 985         }
 986       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 987         {
               /* The macro accepted BYTES bytes at SRC: copy them
                  unchanged.  */
 988           p = src;
 989           src += bytes;
 990         }
 991       else
 992         {
               /* Otherwise convert the single byte with CHAR_STRING.  */
 993           bytes = CHAR_STRING (*src, tmp);
 994           p = tmp;
 995           src++;
 996         }
           /* With DST_BYTES zero the limit is SRC rather than DST_END.
              NOTE(review): presumably source and destination then share
              one buffer -- confirm.  */
 997       if (dst + bytes >= (dst_bytes ? dst_end : src))
 998         {
 999           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1000           break;
1001         }
1002       while (bytes--) *dst++ = *p++;
1003       coding->produced_char++;
1004     }
1005  label_end_of_loop:
       /* SRC_BASE, not SRC, is reported: an item that was only partly
          handled when the loop stopped is not counted as consumed.  */
1006   coding->consumed = coding->consumed_char = src_base - source;
1007   coding->produced = dst - destination;
1008 }
1009
1010
1011 /* Encode composition data stored at DATA into a special byte sequence
1012    starting by 0x80.  Update CODING->cmp_data_start and maybe
1013    CODING->cmp_data for the next call.
        As used here, DATA[0] is the number of elements in this entry,
        DATA[3] the composition method, DATA[2] - DATA[1] the number of
        composed characters, and DATA[4] onward the components (for the
        rule-based methods: char, rule, char, ...).
        The sequence produced is 0x80, 0xF0 + METHOD, 0xA0 + total byte
        length, 0xA0 + number of composed characters, then each
        component: a character in CHAR_STRING form, a rule as the two
        bytes 0x20 + GREF and 0x20 + NREF.
        The macro uses DST, DST_END, DST_BYTES and SRC of the enclosing
        function, and jumps to its label_end_of_loop after setting
        CODING->result to CODING_FINISH_INSUFFICIENT_DST when the
        sequence does not fit.  CODING and DATA are evaluated several
        times.  */
1014
1015 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1016   do {                                                                  \
1017     unsigned char buf[1024], *p0 = buf, *p;                             \
1018     int len = data[0];                                                  \
1019     int i;                                                              \
1020                                                                         \
1021     buf[0] = 0x80;                                                      \
1022     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1023     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1024     p = buf + 4;                                                        \
1025     if (data[3] == COMPOSITION_WITH_RULE                                \
1026         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1027       {                                                                 \
1028         p += CHAR_STRING (data[4], p);                                  \
1029         for (i = 5; i < len; i += 2)                                    \
1030           {                                                             \
1031             int gref, nref;                                             \
1032              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1033             *p++ = 0x20 + gref;                                         \
1034             *p++ = 0x20 + nref;                                         \
1035             p += CHAR_STRING (data[i + 1], p);                          \
1036           }                                                             \
1037       }                                                                 \
1038     else                                                                \
1039       {                                                                 \
1040         for (i = 4; i < len; i++)                                       \
1041           p += CHAR_STRING (data[i], p);                                \
1042       }                                                                 \
1043     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1044                                                                         \
1045     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1046       {                                                                 \
1047         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1048         goto label_end_of_loop;                                         \
1049       }                                                                 \
1050     while (p0 < p)                                                      \
1051       *dst++ = *p0++;                                                   \
1052     coding->cmp_data_start += data[0];                                  \
1053     if (coding->cmp_data_start == coding->cmp_data->used                \
1054         && coding->cmp_data->next)                                      \
1055       {                                                                 \
1056         coding->cmp_data = coding->cmp_data->next;                      \
1057         coding->cmp_data_start = 0;                                     \
1058       }                                                                 \
1059   } while (0)
1060
1061
     /* Forward declaration: encode_coding_emacs_mule below hands the
        whole conversion to encode_eol when there is no composition
        data.  NOTE(review): the definition is presumably later in this
        file -- confirm.  */
1062 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1063                             unsigned char *, int, int));
1064
     /* Encode the SRC_BYTES bytes at SOURCE into DESTINATION in
        emacs-mule form.  When CODING carries no composition data the
        work is delegated to encode_eol.  Otherwise each character is
        copied with LF converted according to CODING->eol_type, and a
        byte sequence describing a composition is emitted in front of
        the character at which that composition starts.  On return
        CODING->consumed holds the number of source bytes handled, and
        CODING->produced and CODING->produced_char both hold the number
        of bytes stored.  */
1065 static void
1066 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1067      struct coding_system *coding;
1068      unsigned char *source, *destination;
1069      int src_bytes, dst_bytes;
1070 {
1071   unsigned char *src = source;
1072   unsigned char *src_end = source + src_bytes;
1073   unsigned char *dst = destination;
1074   unsigned char *dst_end = destination + dst_bytes;
1075   unsigned char *src_base;
1076   int c;
1077   int char_offset;
1078   int *data;
1079
       /* NOTE(review): not referenced directly in this function;
          presumably required by ONE_MORE_CHAR -- confirm before
          removing.  */
1080   Lisp_Object translation_table;
1081
1082   translation_table = Qnil;
1083
1084   /* Optimization for the case that there's no composition.  */
1085   if (!coding->cmp_data || coding->cmp_data->used == 0)
1086     {
1087       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1088       return;
1089     }
1090
1091   char_offset = coding->cmp_data->char_offset;
1092   data = coding->cmp_data->data + coding->cmp_data_start;
       /* The loop has no exit test of its own.  NOTE(review):
          ONE_MORE_CHAR and the EMIT_* macros presumably jump to
          label_end_of_loop when source or destination runs out, as
          ENCODE_COMPOSITION_EMACS_MULE does -- confirm.  */
1093   while (1)
1094     {
1095       src_base = src;
1096
1097       /* If SRC starts a composition, encode the information about the
1098          composition in advance.  */
1099       if (coding->cmp_data_start < coding->cmp_data->used
1100           && char_offset + coding->consumed_char == data[1])
1101         {
1102           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have moved on to the next cmp_data
                  block, so reload both values.  */
1103           char_offset = coding->cmp_data->char_offset;
1104           data = coding->cmp_data->data + coding->cmp_data_start;
1105         }
1106
1107       ONE_MORE_CHAR (c);
1108       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1109                         || coding->eol_type == CODING_EOL_CR))
1110         {
               /* LF becomes CR LF or a lone CR.  */
1111           if (coding->eol_type == CODING_EOL_CRLF)
1112             EMIT_TWO_BYTES ('\r', c);
1113           else
1114             EMIT_ONE_BYTE ('\r');
1115         }
1116       else if (SINGLE_BYTE_CHAR_P (c))
1117         EMIT_ONE_BYTE (c);
1118       else
             /* Any other character: copy its source bytes as they are.  */
1119         EMIT_BYTES (src_base, src);
1120       coding->consumed_char++;
1121     }
1122  label_end_of_loop:
1123   coding->consumed = src_base - source;
1124   coding->produced = coding->produced_char = dst - destination;
1125   return;
1126 }
1127
1128 \f
1129 /*** 3. ISO2022 handlers ***/
1130
1131 /* The following note describes the coding system ISO2022 briefly.
1132    Since the intention of this note is to help understand the
1133    functions in this file, some parts are NOT ACCURATE or are OVERLY
1134    SIMPLIFIED.  For thorough understanding, please refer to the
1135    original document of ISO2022.  This is equivalent to the standard
1136    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1137
1138    ISO2022 provides many mechanisms to encode several character sets
1139    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1140    is encoded using bytes less than 128.  This may make the encoded
1141    text a little bit longer, but the text passes more easily through
1142    several types of gateway, some of which strip off the MSB (Most
1143    Significant Bit).
1144
1145    There are two kinds of character sets: control character sets and
1146    graphic character sets.  The former contain control characters such
1147    as `newline' and `escape' to provide control functions (control
1148    functions are also provided by escape sequences).  The latter
1149    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1150    two control character sets and many graphic character sets.
1151
1152    Graphic character sets are classified into one of the following
1153    four classes, according to the number of bytes (DIMENSION) and
1154    number of characters in one dimension (CHARS) of the set:
1155    - DIMENSION1_CHARS94
1156    - DIMENSION1_CHARS96
1157    - DIMENSION2_CHARS94
1158    - DIMENSION2_CHARS96
1159
1160    In addition, each character set is assigned an identification tag,
1161    unique for each set, called the "final character" (denoted as <F>
1162    hereafter).  The <F> of each character set is decided by ECMA(*)
1163    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1164    (0x30..0x3F are for private use only).
1165
1166    Note (*): ECMA = European Computer Manufacturers Association
1167
1168    Here are examples of graphic character sets [NAME(<F>)]:
1169         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1170         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1171         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1172         o DIMENSION2_CHARS96 -- none for the moment
1173
1174    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1175         C0 [0x00..0x1F] -- control character plane 0
1176         GL [0x20..0x7F] -- graphic character plane 0
1177         C1 [0x80..0x9F] -- control character plane 1
1178         GR [0xA0..0xFF] -- graphic character plane 1
1179
1180    A control character set is directly designated and invoked to C0 or
1181    C1 by an escape sequence.  The most common case is that:
1182    - ISO646's  control character set is designated/invoked to C0, and
1183    - ISO6429's control character set is designated/invoked to C1,
1184    and usually these designations/invocations are omitted in encoded
1185    text.  In a 7-bit environment, only C0 can be used, and a control
1186    character for C1 is encoded by an appropriate escape sequence to
1187    fit into the environment.  All control characters for C1 are
1188    defined to have corresponding escape sequences.
1189
1190    A graphic character set is at first designated to one of four
1191    graphic registers (G0 through G3), then these graphic registers are
1192    invoked to GL or GR.  These designations and invocations can be
1193    done independently.  The most common case is that G0 is invoked to
1194    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1195    these invocations and designations are omitted in encoded text.
1196    In a 7-bit environment, only GL can be used.
1197
1198    When a graphic character set of CHARS94 is invoked to GL, codes
1199    0x20 and 0x7F of the GL area work as control characters SPACE and
1200    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1201    be used.
1202
1203    There are two ways of invocation: locking-shift and single-shift.
1204    With locking-shift, the invocation lasts until the next different
1205    invocation, whereas with single-shift, the invocation affects the
1206    following character only and doesn't affect the locking-shift
1207    state.  Invocations are done by the following control characters or
1208    escape sequences:
1209
1210    ----------------------------------------------------------------------
1211    abbrev  function                  cntrl escape seq   description
1212    ----------------------------------------------------------------------
1213    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1214    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1215    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1216    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1217    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1218    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1219    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1220    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1221    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1222    ----------------------------------------------------------------------
1223    (*) These are not used by any known coding system.
1224
1225    Control characters for these functions are defined by macros
1226    ISO_CODE_XXX in `coding.h'.
1227
1228    Designations are done by the following escape sequences:
1229    ----------------------------------------------------------------------
1230    escape sequence      description
1231    ----------------------------------------------------------------------
1232    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1233    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1234    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1235    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1236    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1237    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1238    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1239    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1240    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1241    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1242    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1243    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1244    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1245    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1246    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1247    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1248    ----------------------------------------------------------------------
1249
1250    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1251    of dimension 1, chars 94, and final character <F>, etc...
1252
1253    Note (*): Although these designations are not allowed in ISO2022,
1254    Emacs accepts them on decoding, and produces them on encoding
1255    CHARS96 character sets in a coding system which is characterized as
1256    7-bit environment, non-locking-shift, and non-single-shift.
1257
1258    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1259    '(' can be omitted.  We refer to this as "short-form" hereafter.
1260
1261    Now you may notice that there are a lot of ways of encoding the
1262    same multilingual text in ISO2022.  Actually, there exist many
1263    coding systems such as Compound Text (used in X11's inter-client
1264    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1265    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1266    localized platforms), and all of these are variants of ISO2022.
1267
1268    In addition to the above, Emacs handles two more kinds of escape
1269    sequences: ISO6429's direction specification and Emacs' private
1270    sequence for specifying character composition.
1271
1272    ISO6429's direction specification takes the following form:
1273         o CSI ']'      -- end of the current direction
1274         o CSI '0' ']'  -- end of the current direction
1275         o CSI '1' ']'  -- start of left-to-right text
1276         o CSI '2' ']'  -- start of right-to-left text
1277    The control character CSI (0x9B: control sequence introducer) is
1278    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1279
1280    Character composition specification takes the following form:
1281         o ESC '0' -- start relative composition
1282         o ESC '1' -- end composition
1283         o ESC '2' -- start rule-base composition (*)
1284         o ESC '3' -- start relative composition with alternate chars  (**)
1285         o ESC '4' -- start rule-base composition with alternate chars  (**)
1286   Since these are not standard escape sequences of any ISO standard,
1287   the use of them with these meanings is restricted to Emacs only.
1288
1289   (*) This form is used only in Emacs 20.5 and older versions,
1290   but the newer versions can safely decode it.
1291   (**) This form is used only in Emacs 21.1 and newer versions,
1292   and the older versions can't decode it.
1293
1294   Here's a list of example usages of these composition escape
1295   sequences (categorized by `enum composition_method').
1296
1297   COMPOSITION_RELATIVE:
1298         ESC 0 CHAR [ CHAR ] ESC 1
1299   COMPOSITION_WITH_RULE:
1300         ESC 2 CHAR [ RULE CHAR ] ESC 1
1301   COMPOSITION_WITH_ALTCHARS:
1302         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1303   COMPOSITION_WITH_RULE_ALTCHARS:
1304         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1305
     /* Table of 256 `enum iso_code_class_type' values.  NOTE(review):
        presumably indexed by byte value and filled in elsewhere in this
        file -- confirm.  */
1306 enum iso_code_class_type iso_code_class[256];
1307
     /* Nonzero if all of the following hold:
        - coding_system_table[IDX] is set;
        - CHARSET is CHARSET_ASCII, or CODING_SAFE_CHAR_P accepts C
          against the result of coding_safe_chars for that coding
          system;
        - CODING_SPEC_ISO_REQUESTED_DESIGNATION for CHARSET in that
          coding system is not CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.
        Side effect: when CHARSET is not ASCII, the variable SAFE_CHARS
        of the enclosing scope is assigned.  IDX and CHARSET are
        evaluated more than once.  */
1308 #define CHARSET_OK(idx, charset, c)                                     \
1309   (coding_system_table[idx]                                             \
1310    && (charset == CHARSET_ASCII                                         \
1311        || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
1312            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1313    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1314                                               charset)                  \
1315        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1316
     /* Nonzero if CODING_SPEC_ISO_INITIAL_DESIGNATION, applied to the
        coding system at coding_system_table[IDX] with 1 as its second
        argument, is not negative.  NOTE(review): the 1 presumably
        selects graphic register G1 -- confirm.  */
1317 #define SHIFT_OUT_OK(idx)                                               \
1318   (0 <= CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1))
1319
1320 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1321    Check if a text is encoded in ISO2022.  If it is, return an
1322    integer in which appropriate flag bits any of:
1323         CODING_CATEGORY_MASK_ISO_7
1324         CODING_CATEGORY_MASK_ISO_7_TIGHT
1325         CODING_CATEGORY_MASK_ISO_8_1
1326         CODING_CATEGORY_MASK_ISO_8_2
1327         CODING_CATEGORY_MASK_ISO_7_ELSE
1328         CODING_CATEGORY_MASK_ISO_8_ELSE
1329    are set.  If a code which should never appear in ISO2022 is found,
1330    returns 0.  */
1331
1332 static int
1333 detect_coding_iso2022 (src, src_end, multibytep)
1334      unsigned char *src, *src_end;
1335      int multibytep;
1336 {
1337   int mask = CODING_CATEGORY_MASK_ISO;
1338   int mask_found = 0;
1339   int reg[4], shift_out = 0, single_shifting = 0;
1340   int c, c1, charset;
1341   /* Dummy for ONE_MORE_BYTE.  */
1342   struct coding_system dummy_coding;
1343   struct coding_system *coding = &dummy_coding;
1344   Lisp_Object safe_chars;
1345
1346   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1347   while (mask && src < src_end)
1348     {
1349       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1350     retry:
1351       switch (c)
1352         {
1353         case ISO_CODE_ESC:
1354           if (inhibit_iso_escape_detection)
1355             break;
1356           single_shifting = 0;
1357           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1358           if (c >= '(' && c <= '/')
1359             {
1360               /* Designation sequence for a charset of dimension 1.  */
1361               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1362               if (c1 < ' ' || c1 >= 0x80
1363                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1364                 /* Invalid designation sequence.  Just ignore.  */
1365                 break;
1366               reg[(c - '(') % 4] = charset;
1367             }
1368           else if (c == '$')
1369             {
1370               /* Designation sequence for a charset of dimension 2.  */
1371               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1372               if (c >= '@' && c <= 'B')
1373                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1374                 reg[0] = charset = iso_charset_table[1][0][c];
1375               else if (c >= '(' && c <= '/')
1376                 {
1377                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1378                   if (c1 < ' ' || c1 >= 0x80
1379                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1380                     /* Invalid designation sequence.  Just ignore.  */
1381                     break;
1382                   reg[(c - '(') % 4] = charset;
1383                 }
1384               else
1385                 /* Invalid designation sequence.  Just ignore.  */
1386                 break;
1387             }
1388           else if (c == 'N' || c == 'O')
1389             {
1390               /* ESC <Fe> for SS2 or SS3.  */
1391               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1392               break;
1393             }
1394           else if (c >= '0' && c <= '4')
1395             {
1396               /* ESC <Fp> for start/end composition.  */
1397               mask_found |= CODING_CATEGORY_MASK_ISO;
1398               break;
1399             }
1400           else
1401             /* Invalid escape sequence.  Just ignore.  */
1402             break;
1403
1404           /* We found a valid designation sequence for CHARSET.  */
1405           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1406           c = MAKE_CHAR (charset, 0, 0);
1407           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1408             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1409           else
1410             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1411           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1412             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1413           else
1414             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1415           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1416             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1417           else
1418             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1419           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1420             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1421           else
1422             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1423           break;
1424
1425         case ISO_CODE_SO:
1426           if (inhibit_iso_escape_detection)
1427             break;
1428           single_shifting = 0;
1429           if (shift_out == 0
1430               && (reg[1] >= 0
1431                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1432                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1433             {
1434               /* Locking shift out.  */
1435               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1436               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1437             }
1438           break;
1439
1440         case ISO_CODE_SI:
1441           if (inhibit_iso_escape_detection)
1442             break;
1443           single_shifting = 0;
1444           if (shift_out == 1)
1445             {
1446               /* Locking shift in.  */
1447               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1448               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1449             }
1450           break;
1451
1452         case ISO_CODE_CSI:
1453           single_shifting = 0;
1454         case ISO_CODE_SS2:
1455         case ISO_CODE_SS3:
1456           {
1457             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1458
1459             if (inhibit_iso_escape_detection)
1460               break;
1461             if (c != ISO_CODE_CSI)
1462               {
1463                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1464                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1465                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1466                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1467                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1468                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1469                 single_shifting = 1;
1470               }
1471             if (VECTORP (Vlatin_extra_code_table)
1472                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1473               {
1474                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1475                     & CODING_FLAG_ISO_LATIN_EXTRA)
1476                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1477                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1478                     & CODING_FLAG_ISO_LATIN_EXTRA)
1479                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1480               }
1481             mask &= newmask;
1482             mask_found |= newmask;
1483           }
1484           break;
1485
1486         default:
1487           if (c < 0x80)
1488             {
1489               single_shifting = 0;
1490               break;
1491             }
1492           else if (c < 0xA0)
1493             {
1494               single_shifting = 0;
1495               if (VECTORP (Vlatin_extra_code_table)
1496                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1497                 {
1498                   int newmask = 0;
1499
1500                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1501                       & CODING_FLAG_ISO_LATIN_EXTRA)
1502                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1503                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1504                       & CODING_FLAG_ISO_LATIN_EXTRA)
1505                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1506                   mask &= newmask;
1507                   mask_found |= newmask;
1508                 }
1509               else
1510                 return 0;
1511             }
1512           else
1513             {
1514               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1515                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1516               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1517               /* Check the length of succeeding codes of the range
1518                  0xA0..0xFF.  If the byte length is odd, we exclude
1519                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1520                  when we are not single shifting.  */
1521               if (!single_shifting
1522                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1523                 {
1524                   int i = 1;
1525
1526                   c = -1;
1527                   while (src < src_end)
1528                     {
1529                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1530                       if (c < 0xA0)
1531                         break;
1532                       i++;
1533                     }
1534
1535                   if (i & 1 && src < src_end)
1536                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1537                   else
1538                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1539                   if (c >= 0)
1540                     /* This means that we have read one extra byte.  */
1541                     goto retry;
1542                 }
1543             }
1544           break;
1545         }
1546     }
1547  label_end_of_loop:
1548   return (mask & mask_found);
1549 }
1550
/* Return the character code for the character of CHARSET whose 1st
   position code is C1 and whose 2nd position code is C2.  If the
   variable `translation_table' (taken from the expansion site) is
   nil, the code is built directly with MAKE_CHAR; otherwise the value
   of translate_char for that table is returned instead.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (!NILP (translation_table)                                    \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1560
/* Set designation state into CODING.

   Look up, with ISO_CHARSET_TABLE, the charset selected by DIMENSION,
   CHARS and FINAL_CHAR, and designate it to graphic register REG of
   CODING.

   The designation is accepted only when the charset is known (>= 0)
   and either CODING requested that charset for REG or the charset's
   first character satisfies CODING_SAFE_CHAR_P for `safe_chars'.  In
   every other case -- including a FINAL_CHAR outside '0'..127 -- the
   macro jumps to `label_invalid_code'.

   The macro expects `coding', `safe_chars' and the label
   `label_invalid_code' to exist at the expansion site.

   REG and FINAL_CHAR are evaluated exactly once, and the temporaries
   carry a `desig_' prefix so that they cannot hide an identifier
   (such as the caller's own `charset' or `c') used in an argument.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int desig_reg = (reg);                                                 \
    int desig_final = (final_char);                                        \
    int desig_charset, desig_char;                                         \
                                                                           \
    if (desig_final < '0' || desig_final >= 128)                           \
      goto label_invalid_code;                                             \
    desig_charset                                                          \
      = ISO_CHARSET_TABLE (make_number (dimension),                        \
                           make_number (chars),                            \
                           make_number (desig_final));                     \
    desig_char = MAKE_CHAR (desig_charset, 0, 0);                          \
    if (desig_charset >= 0                                                 \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, desig_charset)  \
            == desig_reg                                                   \
            || CODING_SAFE_CHAR_P (safe_chars, desig_char)))               \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && desig_reg == 0                                              \
            && desig_charset == CHARSET_ASCII)                             \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        /* While decoding in the reverse direction, designate the          \
           reverse charset if one exists.  */                              \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (desig_charset) >= 0)               \
          desig_charset = CHARSET_REVERSE_CHARSET (desig_charset);         \
        CODING_SPEC_ISO_DESIGNATION (coding, desig_reg) = desig_charset;   \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* Remember which register received the rejected designation.  */  \
        coding->spec.iso2022.last_invalid_designation_register             \
          = desig_reg;                                                     \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1597
1598 /* Allocate a memory block for storing information about compositions.
1599    The block is chained to the already allocated blocks.  */
1600
1601 void
1602 coding_allocate_composition_data (coding, char_offset)
1603      struct coding_system *coding;
1604      int char_offset;
1605 {
1606   struct composition_data *cmp_data
1607     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1608
1609   cmp_data->char_offset = char_offset;
1610   cmp_data->used = 0;
1611   cmp_data->prev = coding->cmp_data;
1612   cmp_data->next = NULL;
1613   if (coding->cmp_data)
1614     coding->cmp_data->next = cmp_data;
1615   coding->cmp_data = cmp_data;
1616   coding->cmp_data_start = 0;
1617 }
1618
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC; it is evaluated exactly once.
   The macro expects `coding', `dst' and the label `label_end_of_loop'
   to exist at the expansion site.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int cmp_start_code = (c1);                                             \
                                                                           \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Compositions are not decoded; store the two bytes of the        \
           escape sequence at DST as they are.  */                         \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = cmp_start_code & 0x7f;                                    \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        /* Map the introducer byte to the composition method.  */          \
        switch (cmp_start_code)                                            \
          {                                                                \
          case '0':                                                        \
            coding->composing = COMPOSITION_RELATIVE;                      \
            break;                                                         \
          case '2':                                                        \
            coding->composing = COMPOSITION_WITH_RULE;                     \
            break;                                                         \
          case '3':                                                        \
            coding->composing = COMPOSITION_WITH_ALTCHARS;                 \
            break;                                                         \
          default:                                                         \
            coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;            \
            break;                                                         \
          }                                                                \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           the following two, the codes following the current escape       \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1671
/* Handle composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  While a composition is in progress, its end is
   recorded at the current value of coding->produced_char and CODING
   goes back to the non-composing state.  Otherwise ESC and C1 are
   stored at DST as two bytes.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* No composition is in progress; emit the sequence as is.  */  \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = (c1);                                                  \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1688
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC) and store one encoded composition rule in
   coding->cmp_data.

   C1 must be a modifiable lvalue: 32 is subtracted from it in place.
   A resulting value below 81 is an old-format rule held in that
   single byte; a value from 81 to 92 is a new-format rule whose
   second byte is read into the caller's `c2' with ONE_MORE_BYTE.  For
   any other value the rule stored is 0.  In all cases
   coding->composition_rule_follows is cleared afterwards.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if ((c1) < 81)              /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
        if (gref == 4) gref = 10;                                       \
        if (nref == 4) nref = 10;                                       \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    else if ((c1) < 93)         /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);            \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1713
1714
1715 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1716
1717 static void
1718 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1719      struct coding_system *coding;
1720      unsigned char *source, *destination;
1721      int src_bytes, dst_bytes;
1722 {
1723   unsigned char *src = source;
1724   unsigned char *src_end = source + src_bytes;
1725   unsigned char *dst = destination;
1726   unsigned char *dst_end = destination + dst_bytes;
1727   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1728   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1729   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1730   /* SRC_BASE remembers the start position in source in each loop.
1731      The loop will be exited when there's not enough source code
1732      (within macro ONE_MORE_BYTE), or when there's not enough
1733      destination area to produce a character (within macro
1734      EMIT_CHAR).  */
1735   unsigned char *src_base;
1736   int c, charset;
1737   Lisp_Object translation_table;
1738   Lisp_Object safe_chars;
1739
1740   safe_chars = coding_safe_chars (coding);
1741
1742   if (NILP (Venable_character_translation))
1743     translation_table = Qnil;
1744   else
1745     {
1746       translation_table = coding->translation_table_for_decode;
1747       if (NILP (translation_table))
1748         translation_table = Vstandard_translation_table_for_decode;
1749     }
1750
1751   coding->result = CODING_FINISH_NORMAL;
1752
1753   while (1)
1754     {
1755       int c1, c2;
1756
1757       src_base = src;
1758       ONE_MORE_BYTE (c1);
1759
1760       /* We produce no character or one character.  */
1761       switch (iso_code_class [c1])
1762         {
1763         case ISO_0x20_or_0x7F:
1764           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1765             {
1766               DECODE_COMPOSITION_RULE (c1);
1767               continue;
1768             }
1769           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1770             {
1771               /* This is SPACE or DEL.  */
1772               charset = CHARSET_ASCII;
1773               break;
1774             }
1775           /* This is a graphic character, we fall down ...  */
1776
1777         case ISO_graphic_plane_0:
1778           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1779             {
1780               DECODE_COMPOSITION_RULE (c1);
1781               continue;
1782             }
1783           charset = charset0;
1784           break;
1785
1786         case ISO_0xA0_or_0xFF:
1787           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1788               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1789             goto label_invalid_code;
1790           /* This is a graphic character, we fall down ... */
1791
1792         case ISO_graphic_plane_1:
1793           if (charset1 < 0)
1794             goto label_invalid_code;
1795           charset = charset1;
1796           break;
1797
1798         case ISO_control_0:
1799           if (COMPOSING_P (coding))
1800             DECODE_COMPOSITION_END ('1');
1801
1802           /* All ISO2022 control characters in this class have the
1803              same representation in Emacs internal format.  */
1804           if (c1 == '\n'
1805               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1806               && (coding->eol_type == CODING_EOL_CR
1807                   || coding->eol_type == CODING_EOL_CRLF))
1808             {
1809               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1810               goto label_end_of_loop;
1811             }
1812           charset = CHARSET_ASCII;
1813           break;
1814
1815         case ISO_control_1:
1816           if (COMPOSING_P (coding))
1817             DECODE_COMPOSITION_END ('1');
1818           goto label_invalid_code;
1819
1820         case ISO_carriage_return:
1821           if (COMPOSING_P (coding))
1822             DECODE_COMPOSITION_END ('1');
1823
1824           if (coding->eol_type == CODING_EOL_CR)
1825             c1 = '\n';
1826           else if (coding->eol_type == CODING_EOL_CRLF)
1827             {
1828               ONE_MORE_BYTE (c1);
1829               if (c1 != ISO_CODE_LF)
1830                 {
1831                   src--;
1832                   c1 = '\r';
1833                 }
1834             }
1835           charset = CHARSET_ASCII;
1836           break;
1837
1838         case ISO_shift_out:
1839           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1840               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1841             goto label_invalid_code;
1842           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1843           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1844           continue;
1845
1846         case ISO_shift_in:
1847           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1848             goto label_invalid_code;
1849           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1850           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1851           continue;
1852
1853         case ISO_single_shift_2_7:
1854         case ISO_single_shift_2:
1855           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1856             goto label_invalid_code;
1857           /* SS2 is handled as an escape sequence of ESC 'N' */
1858           c1 = 'N';
1859           goto label_escape_sequence;
1860
1861         case ISO_single_shift_3:
1862           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1863             goto label_invalid_code;
1864           /* SS2 is handled as an escape sequence of ESC 'O' */
1865           c1 = 'O';
1866           goto label_escape_sequence;
1867
1868         case ISO_control_sequence_introducer:
1869           /* CSI is handled as an escape sequence of ESC '[' ...  */
1870           c1 = '[';
1871           goto label_escape_sequence;
1872
1873         case ISO_escape:
1874           ONE_MORE_BYTE (c1);
1875         label_escape_sequence:
1876           /* Escape sequences handled by Emacs are invocation,
1877              designation, direction specification, and character
1878              composition specification.  */
1879           switch (c1)
1880             {
1881             case '&':           /* revision of following character set */
1882               ONE_MORE_BYTE (c1);
1883               if (!(c1 >= '@' && c1 <= '~'))
1884                 goto label_invalid_code;
1885               ONE_MORE_BYTE (c1);
1886               if (c1 != ISO_CODE_ESC)
1887                 goto label_invalid_code;
1888               ONE_MORE_BYTE (c1);
1889               goto label_escape_sequence;
1890
1891             case '$':           /* designation of 2-byte character set */
1892               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1893                 goto label_invalid_code;
1894               ONE_MORE_BYTE (c1);
1895               if (c1 >= '@' && c1 <= 'B')
1896                 {       /* designation of JISX0208.1978, GB2312.1980,
1897                            or JISX0208.1980 */
1898                   DECODE_DESIGNATION (0, 2, 94, c1);
1899                 }
1900               else if (c1 >= 0x28 && c1 <= 0x2B)
1901                 {       /* designation of DIMENSION2_CHARS94 character set */
1902                   ONE_MORE_BYTE (c2);
1903                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1904                 }
1905               else if (c1 >= 0x2C && c1 <= 0x2F)
1906                 {       /* designation of DIMENSION2_CHARS96 character set */
1907                   ONE_MORE_BYTE (c2);
1908                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1909                 }
1910               else
1911                 goto label_invalid_code;
1912               /* We must update these variables now.  */
1913               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1914               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1915               continue;
1916
1917             case 'n':           /* invocation of locking-shift-2 */
1918               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1919                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1920                 goto label_invalid_code;
1921               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1922               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1923               continue;
1924
1925             case 'o':           /* invocation of locking-shift-3 */
1926               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1927                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1928                 goto label_invalid_code;
1929               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1930               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1931               continue;
1932
1933             case 'N':           /* invocation of single-shift-2 */
1934               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1935                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1936                 goto label_invalid_code;
1937               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1938               ONE_MORE_BYTE (c1);
1939               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1940                 goto label_invalid_code;
1941               break;
1942
1943             case 'O':           /* invocation of single-shift-3 */
1944               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1945                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1946                 goto label_invalid_code;
1947               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1948               ONE_MORE_BYTE (c1);
1949               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1950                 goto label_invalid_code;
1951               break;
1952
1953             case '0': case '2': case '3': case '4': /* start composition */
1954               DECODE_COMPOSITION_START (c1);
1955               continue;
1956
1957             case '1':           /* end composition */
1958               DECODE_COMPOSITION_END (c1);
1959               continue;
1960
1961             case '[':           /* specification of direction */
1962               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1963                 goto label_invalid_code;
1964               /* For the moment, nested direction is not supported.
1965                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1966                  left-to-right, and nonzero means right-to-left.  */
1967               ONE_MORE_BYTE (c1);
1968               switch (c1)
1969                 {
1970                 case ']':       /* end of the current direction */
1971                   coding->mode &= ~CODING_MODE_DIRECTION;
1972
1973                 case '0':       /* end of the current direction */
1974                 case '1':       /* start of left-to-right direction */
1975                   ONE_MORE_BYTE (c1);
1976                   if (c1 == ']')
1977                     coding->mode &= ~CODING_MODE_DIRECTION;
1978                   else
1979                     goto label_invalid_code;
1980                   break;
1981
1982                 case '2':       /* start of right-to-left direction */
1983                   ONE_MORE_BYTE (c1);
1984                   if (c1 == ']')
1985                     coding->mode |= CODING_MODE_DIRECTION;
1986                   else
1987                     goto label_invalid_code;
1988                   break;
1989
1990                 default:
1991                   goto label_invalid_code;
1992                 }
1993               continue;
1994
1995             default:
1996               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1997                 goto label_invalid_code;
1998               if (c1 >= 0x28 && c1 <= 0x2B)
1999                 {       /* designation of DIMENSION1_CHARS94 character set */
2000                   ONE_MORE_BYTE (c2);
2001                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2002                 }
2003               else if (c1 >= 0x2C && c1 <= 0x2F)
2004                 {       /* designation of DIMENSION1_CHARS96 character set */
2005                   ONE_MORE_BYTE (c2);
2006                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2007                 }
2008               else
2009                 goto label_invalid_code;
2010               /* We must update these variables now.  */
2011               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2012               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2013               continue;
2014             }
2015         }
2016
2017       /* Now we know CHARSET and 1st position code C1 of a character.
2018          Produce a multibyte sequence for that character while getting
2019          2nd position code C2 if necessary.  */
2020       if (CHARSET_DIMENSION (charset) == 2)
2021         {
2022           ONE_MORE_BYTE (c2);
2023           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2024             /* C2 is not in a valid range.  */
2025             goto label_invalid_code;
2026         }
2027       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2028       EMIT_CHAR (c);
2029       continue;
2030
2031     label_invalid_code:
2032       coding->errors++;
2033       if (COMPOSING_P (coding))
2034         DECODE_COMPOSITION_END ('1');
2035       src = src_base;
2036       c = *src++;
2037       EMIT_CHAR (c);
2038     }
2039
2040  label_end_of_loop:
2041   coding->consumed = coding->consumed_char = src_base - source;
2042   coding->produced = dst - destination;
2043   return;
2044 }
2045
2046
2047 /* ISO2022 encoding stuff.  */
2048
2049 /*
2050    It is not enough to say just "ISO2022" on encoding, we have to
2051    specify more details.  In Emacs, each ISO2022 coding system
2052    variant has the following specifications:
2053         1. Initial designation to G0 through G3.
2054         2. Allows short-form designation?
2055         3. ASCII should be designated to G0 before control characters?
2056         4. ASCII should be designated to G0 at end of line?
2057         5. 7-bit environment or 8-bit environment?
2058         6. Use locking-shift?
2059         7. Use Single-shift?
2060    And the following two are only for Japanese:
2061         8. Use ASCII in place of JIS0201-1976-Roman?
2062         9. Use JISX0208-1983 in place of JISX0208-1978?
2063    These specifications are encoded in `coding->flags' as flag bits
2064    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2065    details.
2066 */
2067
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  When CODING gives a revision number smaller than 255 for
   CHARSET, the sequence ESC & <'@' + revision> is emitted first.  For
   a multibyte 94-charset designated to register 0 whose <final-char>
   is '@', 'A', or 'B', the intermediate character is omitted
   (short form) if CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL_CHAR (charset);       \
    char *desig_inter_94 = "()*+";                                      \
    char *desig_inter_96 = ",-./";                                      \
    int desig_revision                                                  \
      = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);              \
    int desig_multibyte = CHARSET_DIMENSION (charset) != 1;             \
    int desig_is_94 = CHARSET_CHARS (charset) == 94;                    \
                                                                        \
    if (desig_revision < 255)                                           \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + desig_revision;                                  \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (desig_multibyte)                                                \
      *dst++ = '$';                                                     \
    /* Intermediate character selecting REG; it may be dropped only    \
       for the short form of a multibyte 94-charset.  */               \
    if (! desig_is_94)                                                  \
      *dst++ = (unsigned char) (desig_inter_96[reg]);                   \
    else if (! desig_multibyte                                          \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || desig_final < '@' || desig_final > 'B')                 \
      *dst++ = (unsigned char) (desig_inter_94[reg]);                   \
    *dst++ = desig_final;                                               \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2110
2111 /* The following two macros produce codes (control character or escape
2112    sequence) for ISO2022 single-shift functions (single-shift-2 and
2113    single-shift-3).  */
2114
/* Emit single-shift-2 (ESC N when CODING_FLAG_ISO_SEVEN_BITS is set,
   otherwise ISO_CODE_SS2) and flag that a single shift is pending.  */
#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
  } while (0)
2123
/* Emit single-shift-3 (ESC O when CODING_FLAG_ISO_SEVEN_BITS is set,
   otherwise ISO_CODE_SS3) and flag that a single shift is pending.  */
#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
  } while (0)
2132
2133 /* The following four macros produce codes (control character or
2134    escape sequence) for ISO2022 locking-shift functions (shift-in,
2135    shift-out, locking-shift-2, and locking-shift-3).  */
2136
/* Emit shift-in (ISO_CODE_SI); graphic plane 0 then holds graphic
   register 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)
2142
/* Emit shift-out (ISO_CODE_SO); graphic plane 0 then holds graphic
   register 1.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)
2148
/* Emit locking-shift-2 (ESC n); graphic plane 0 then holds graphic
   register 2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)
2154
/* Emit locking-shift-3 (ESC o); graphic plane 0 then holds graphic
   register 3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2160
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The loop runs until the character has been emitted: if a single
   shift is pending, C1 is emitted for it (7-bit or 8-bit form
   according to CODING_FLAG_ISO_SEVEN_BITS); if CHARSET is on graphic
   plane 0 or 1, C1 is emitted with the 8th bit cleared or set
   respectively; otherwise encode_invocation_designation is called
   and the loop repeats.

   CHARSET and C1 are parenthesized in the expansion so that
   expressions such as `a | b' may be passed safely.  Both may be
   evaluated more than once, so they must not have side effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F;                                         \
        else                                                            \
          *dst++ = (c1) | 0x80;                                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
2193
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   The loop runs until the character has been emitted: if a single
   shift is pending, C1 and C2 are emitted for it (7-bit or 8-bit
   form according to CODING_FLAG_ISO_SEVEN_BITS); if CHARSET is on
   graphic plane 0 or 1, both bytes are emitted with the 8th bit
   cleared or set respectively; otherwise
   encode_invocation_designation is called and the loop repeats.

   CHARSET, C1 and C2 are parenthesized in the expansion so that
   expressions such as `a | b' may be passed safely.  They may be
   evaluated more than once, so they must not have side effects.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                   \
        else                                                            \
          *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                   \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))    \
      {                                                                 \
        *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                     \
        break;                                                          \
      }                                                                 \
    else if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))    \
      {                                                                 \
        *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                     \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation ((charset), coding, dst);     \
  } while (1)
2226
/* Produce codes for character C.  C is split into its charset and
   position-codes.  If the charset is not defined, the position-codes
   are emitted as they are (the second one only when it is not
   negative).  Otherwise the character is encoded through
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2,
   after replacing ASCII by latin-jisx0201 when
   CODING_FLAG_ISO_USE_ROMAN is set, or jisx0208 by jisx0208-1978 when
   CODING_FLAG_ISO_USE_OLDJIS is set.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
  } while (0)
2256
2257
/* Instead of encoding character C itself, encode
   CODING_INHIBIT_CHARACTER_SUBSTITUTION in its place: once, or twice
   when the charset of C has a width greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int unsafe_emitted = 0;                                             \
                                                                        \
    do                                                                  \
      {                                                                 \
        ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);   \
      }                                                                 \
    while (++unsafe_emitted < 2                                         \
           && CHARSET_WIDTH (CHAR_CHARSET (c)) > 1);                    \
  } while (0)
2266
2267
2268 /* Produce designation and invocation codes at a place pointed by DST
2269    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2270    Return new DST.  */
2271
2272 unsigned char *
2273 encode_invocation_designation (charset, coding, dst)
2274      int charset;
2275      struct coding_system *coding;
2276      unsigned char *dst;
2277 {
2278   int reg;                      /* graphic register number */
2279
2280   /* At first, check designations.  */
2281   for (reg = 0; reg < 4; reg++)
2282     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2283       break;
2284
2285   if (reg >= 4)
2286     {
2287       /* CHARSET is not yet designated to any graphic registers.  */
2288       /* At first check the requested designation.  */
2289       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2290       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2291         /* Since CHARSET requests no special designation, designate it
2292            to graphic register 0.  */
2293         reg = 0;
2294
2295       ENCODE_DESIGNATION (charset, reg, coding);
2296     }
2297
2298   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2299       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2300     {
2301       /* Since the graphic register REG is not invoked to any graphic
2302          planes, invoke it to graphic plane 0.  */
2303       switch (reg)
2304         {
2305         case 0:                 /* graphic register 0 */
2306           ENCODE_SHIFT_IN;
2307           break;
2308
2309         case 1:                 /* graphic register 1 */
2310           ENCODE_SHIFT_OUT;
2311           break;
2312
2313         case 2:                 /* graphic register 2 */
2314           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2315             ENCODE_SINGLE_SHIFT_2;
2316           else
2317             ENCODE_LOCKING_SHIFT_2;
2318           break;
2319
2320         case 3:                 /* graphic register 3 */
2321           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2322             ENCODE_SINGLE_SHIFT_3;
2323           else
2324             ENCODE_LOCKING_SHIFT_3;
2325           break;
2326         }
2327     }
2328
2329   return dst;
2330 }
2331
/* Produce the 2-byte code for encoded composition rule RULE.  RULE is
   decoded by COMPOSITION_DECODE_RULE into a global and a new
   reference point, which are emitted as 32 + 81 + GREF and
   32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
2341
/* Produce the code that starts a composition sequence.  DATA points
   to an array of integers describing the composition (see the
   comment in coding.h for its format); DATA[3] becomes
   CODING->composing.  ESC 0 is emitted for COMPOSITION_RELATIVE,
   ESC 3 for COMPOSITION_WITH_ALTCHARS, and ESC 4 for any other
   method.  In the latter two cases CODING->cmp_data_index is set to
   CODING->cmp_data_start + 4 and CODING->composition_rule_follows is
   cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    dst[0] = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          dst[1] = '3';                                                 \
        else                                                            \
          dst[1] = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      dst[1] = '0';                                                     \
    dst += 2;                                                           \
  } while (0)
2361
/* Produce the code (ESC 1) that ends the current composition.
   CODING->cmp_data_start is advanced by DATA[0] and CODING->composing
   is reset to COMPOSITION_NO.  If that exhausts the current
   composition data block and another one follows, CODING switches to
   the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    if (coding->cmp_data->next                                  \
        && coding->cmp_data_start == coding->cmp_data->used)    \
      {                                                         \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
2377
/* Produce the composition start sequence ESC 0 and switch CODING to
   COMPOSITION_RELATIVE.  Here the sequence does not start a new
   composition; it marks that the components (alternate chars and
   composition rules) of the composition have just been produced and
   that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
2389
2390 /* The following three macros produce codes for indicating direction
2391    of text.  */
/* Emit a control sequence introducer: ESC [ when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, otherwise the single byte
   ISO_CODE_CSI.  The flag is tested as a bit with `&', as the
   single-shift macros do; comparing the whole flag word with `=='
   would select the 7-bit form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
2399
/* Emit the control sequence CSI 2 ] indicating right-to-left text.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro that
   expands to a statement, so it is used as a statement here rather
   than being called with an argument inside a comma expression.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)
2402
/* Emit the control sequence CSI 0 ] indicating left-to-right text.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro that
   expands to a statement, so it is used as a statement here rather
   than being called with an argument inside a comma expression.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2405
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: shift-in if graphic plane 0 does not hold
   graphic register 0, then a designation for every register whose
   initial designation is non-negative and differs from the current
   one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                 \
  do {                                                                  \
    int reset_reg = 0;                                                  \
                                                                        \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                    \
      ENCODE_SHIFT_IN;                                                  \
    while (reset_reg < 4)                                               \
      {                                                                 \
        int reset_initial                                               \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg);    \
                                                                        \
        if (reset_initial >= 0                                          \
            && (reset_initial                                           \
                != CODING_SPEC_ISO_DESIGNATION (coding, reset_reg)))    \
          ENCODE_DESIGNATION (reset_initial, reset_reg, coding);        \
        reset_reg++;                                                    \
      }                                                                 \
  } while (0)
2420
2421 /* Produce designation sequences of charsets in the line started from
2422    SRC to a place pointed by DST, and return updated DST.
2423
2424    If the current block ends before any end-of-line, we may fail to
2425    find all the necessary designations.  */
2426
2427 static unsigned char *
2428 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2429      struct coding_system *coding;
2430      Lisp_Object translation_table;
2431      unsigned char *src, *src_end, *dst;
2432 {
2433   int charset, c, found = 0, reg;
2434   /* Table of charsets to be designated to each graphic register.  */
2435   int r[4];
2436
2437   for (reg = 0; reg < 4; reg++)
2438     r[reg] = -1;
2439
2440   while (found < 4)
2441     {
2442       ONE_MORE_CHAR (c);
2443       if (c == '\n')
2444         break;
2445
2446       charset = CHAR_CHARSET (c);
2447       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2448       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2449         {
2450           found++;
2451           r[reg] = charset;
2452         }
2453     }
2454
2455  label_end_of_loop:
2456   if (found)
2457     {
2458       for (reg = 0; reg < 4; reg++)
2459         if (r[reg] >= 0
2460             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2461           ENCODE_DESIGNATION (r[reg], reg, coding);
2462     }
2463
2464   return dst;
2465 }
2466
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the SRC_BYTES bytes of text at SOURCE into ISO2022 codes at
   DESTINATION (DST_BYTES bytes long) according to CODING.  On return,
   CODING->consumed, CODING->consumed_char, CODING->produced,
   CODING->produced_char and CODING->errors have been updated, and
   CODING->result is CODING_FINISH_INSUFFICIENT_DST if the loop
   stopped for lack of destination space.  */

static void
encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes;
{
  unsigned char *src = source;
  unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Since the maximum bytes produced by each loop is 20, we subtract 19
     from DST_END to assure overflow checking is necessary only at the
     head of loop.  */
  unsigned char *adjusted_dst_end = dst_end - 19;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source text to
     analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
     there's not enough destination area to produce encoded codes
     (checked at the head of the loop).  */
  unsigned char *src_base;
  int c;
  Lisp_Object translation_table;
  Lisp_Object safe_chars;

  safe_chars = coding_safe_chars (coding);

  /* Pick the translation table: none when character translation is
     disabled, else the one of CODING, falling back to the standard
     one.  */
  if (NILP (Venable_character_translation))
    translation_table = Qnil;
  else
    {
      translation_table = coding->translation_table_for_encode;
      if (NILP (translation_table))
        translation_table = Vstandard_translation_table_for_encode;
    }

  coding->consumed_char = 0;
  coding->errors = 0;
  while (1)
    {
      src_base = src;

      /* Stop if fewer than 20 bytes of room may remain.  When
         DST_BYTES is zero the limit is taken relative to SRC instead
         of the destination end.
         NOTE(review): presumably DST_BYTES == 0 means source and
         destination share one buffer -- confirm against callers.  */
      if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
        {
          coding->result = CODING_FINISH_INSUFFICIENT_DST;
          break;
        }

      if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
          && CODING_SPEC_ISO_BOL (coding))
        {
          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, translation_table,
                                           src, src_end, dst);
          CODING_SPEC_ISO_BOL (coding) = 0;
        }

      /* Check composition start and end.  */
      if (coding->composing != COMPOSITION_DISABLED
          && coding->cmp_data_start < coding->cmp_data->used)
        {
          struct composition_data *cmp_data = coding->cmp_data;
          int *data = cmp_data->data + coding->cmp_data_start;
          /* Character position of the character about to be encoded.  */
          int this_pos = cmp_data->char_offset + coding->consumed_char;

          if (coding->composing == COMPOSITION_RELATIVE)
            {
              /* DATA[2] is compared with the current position to find
                 the end of the composition.  */
              if (this_pos == data[2])
                {
                  ENCODE_COMPOSITION_END (coding, data);
                  /* ENCODE_COMPOSITION_END may have moved on to the
                     next composition data; reload the pointers.  */
                  cmp_data = coding->cmp_data;
                  data = cmp_data->data + coding->cmp_data_start;
                }
            }
          else if (COMPOSING_P (coding))
            {
              /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHARS  */
              if (coding->cmp_data_index == coding->cmp_data_start + data[0])
                /* We have consumed components of the composition.
                   What follows in SRC is the composition's base
                   text.  */
                ENCODE_COMPOSITION_FAKE_START (coding);
              else
                {
                  /* Encode the next component, taken from the
                     composition data rather than from SRC.  */
                  int c = cmp_data->data[coding->cmp_data_index++];
                  if (coding->composition_rule_follows)
                    {
                      ENCODE_COMPOSITION_RULE (c);
                      coding->composition_rule_follows = 0;
                    }
                  else
                    {
                      if (coding->flags & CODING_FLAG_ISO_SAFE
                          && ! CODING_SAFE_CHAR_P (safe_chars, c))
                        ENCODE_UNSAFE_CHARACTER (c);
                      else
                        ENCODE_ISO_CHARACTER (c);
                      if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
                        coding->composition_rule_follows = 1;
                    }
                  /* No source character was read in this iteration.  */
                  continue;
                }
            }
          if (!COMPOSING_P (coding))
            {
              /* DATA[1] is compared with the current position to find
                 the start of the next composition.  */
              if (this_pos == data[1])
                {
                  ENCODE_COMPOSITION_START (coding, data);
                  continue;
                }
            }
        }

      ONE_MORE_CHAR (c);

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          if (c == '\r')
            {
              if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
                {
                  if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                    ENCODE_RESET_PLANE_AND_REGISTER;
                  *dst++ = c;
                  /* NOTE(review): this `continue' skips the
                     `coding->consumed_char++' at the bottom of the
                     loop although ONE_MORE_CHAR above presumably
                     consumed a source character -- confirm whether
                     that is intended.  */
                  continue;
                }
              /* fall through to treat '\r' as '\n' ...  */
              c = '\n';
            }
          if (c == '\n')
            {
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              /* Forget the current designations so that each line
                 starts from the initial ones.  */
              if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
                bcopy (coding->spec.iso2022.initial_designation,
                       coding->spec.iso2022.current_designation,
                       sizeof coding->spec.iso2022.initial_designation);
              /* Emit the end-of-line in the format CODING asks for.  */
              if (coding->eol_type == CODING_EOL_LF
                  || coding->eol_type == CODING_EOL_UNDECIDED)
                *dst++ = ISO_CODE_LF;
              else if (coding->eol_type == CODING_EOL_CRLF)
                *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
              else
                *dst++ = ISO_CODE_CR;
              CODING_SPEC_ISO_BOL (coding) = 1;
            }
          else
            {
              /* Any other control character is passed through.  */
              if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
                ENCODE_RESET_PLANE_AND_REGISTER;
              *dst++ = c;
            }
        }
      else if (ASCII_BYTE_P (c))
        ENCODE_ISO_CHARACTER (c);
      else if (SINGLE_BYTE_CHAR_P (c))
        {
          /* A non-ASCII single-byte character is emitted as is and
             counted as an error.  */
          *dst++ = c;
          coding->errors++;
        }
      else if (coding->flags & CODING_FLAG_ISO_SAFE
               && ! CODING_SAFE_CHAR_P (safe_chars, c))
        ENCODE_UNSAFE_CHARACTER (c);
      else
        ENCODE_ISO_CHARACTER (c);

      coding->consumed_char++;
    }

 label_end_of_loop:
  /* SRC_BASE, not SRC, is reported, so that whatever the interrupted
     iteration had read is not counted as consumed.  */
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
}
2642
2643 \f
2644 /*** 4. SJIS and BIG5 handlers ***/
2645
2646 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2647    quite widely.  So, for the moment, Emacs supports them in the bare
2648    C code.  But, in the future, they may be supported only by CCL.  */
2649
2650 /* SJIS is a coding system encoding three character sets: ASCII, right
2651    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2652    as is.  A character of charset katakana-jisx0201 is encoded by
2653    "position-code + 0x80".  A character of charset japanese-jisx0208
2654    is encoded in 2-byte but two position-codes are divided and shifted
2655    so that it fits in the range below.
2656
2657    --- CODE RANGE of SJIS ---
2658    (character set)      (range)
2659    ASCII                0x00 .. 0x7F
2660    KATAKANA-JISX0201    0xA1 .. 0xDF
2661    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2662             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2663    -------------------------------
2664
2665 */
2666
2667 /* BIG5 is a coding system encoding two character sets: ASCII and
2668    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2669    character set and is encoded in two bytes.
2670
2671    --- CODE RANGE of BIG5 ---
2672    (character set)      (range)
2673    ASCII                0x00 .. 0x7F
2674    Big5 (1st byte)      0xA1 .. 0xFE
2675         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2676    --------------------------
2677
2678    Since the number of characters in Big5 is larger than maximum
2679    characters in Emacs' charset (96x96), it can't be handled as one
2680    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2681    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2682    contains frequently used characters and the latter contains less
2683    frequently used characters.  */
2684
2685 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2686    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2687    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2688    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2689
/* Number of Big5 characters sharing the same 1st byte: the 2nd byte
   ranges 0x40..0x7E and 0xA1..0xFE taken together.  */
#define BIG5_SAME_ROW ((0x7F - 0x40) + (0xFF - 0xA1))
2692
/* Decode the Big5 code whose bytes are B1 and B2: set CHARSET to
   charset_big5_1 (B1 < 0xC9) or charset_big5_2, and C1 and C2 to the
   position-codes (0x21-based, 0xFF - 0xA1 codes per row) within that
   charset.  B1 and B2 are parenthesized in the expansion so that
   expressions such as `x & 0xFF' may be passed; they are evaluated
   more than once and must not have side effects.  CHARSET, C1 and C2
   must be lvalues.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)
2707
/* Encode the character of CHARSET (charset_big5_1 or charset_big5_2)
   whose position-codes are C1 and C2 into the two Big5 bytes B1 and
   B2.  This is the inverse of DECODE_BIG5.  CHARSET, C1 and C2 are
   parenthesized in the expansion so that expressions may be passed;
   B1 and B2 must be lvalues.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Offsets below 0x3F map to 0x40..0x7E, the rest to 0xA1..0xFE.  */ \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2717
2718 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2719    Check if a text is encoded in SJIS.  If it is, return
2720    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2721
2722 static int
2723 detect_coding_sjis (src, src_end, multibytep)
2724      unsigned char *src, *src_end;
2725      int multibytep;
2726 {
2727   int c;
2728   /* Dummy for ONE_MORE_BYTE.  */
2729   struct coding_system dummy_coding;
2730   struct coding_system *coding = &dummy_coding;
2731
2732   while (1)
2733     {
2734       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2735       if (c < 0x80)
2736         continue;
2737       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2738         return 0;
2739       if (c <= 0x9F || c >= 0xE0)
2740         {
2741           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2742           if (c < 0x40 || c == 0x7F || c > 0xFC)
2743             return 0;
2744         }
2745     }
2746  label_end_of_loop:
2747   return CODING_CATEGORY_MASK_SJIS;
2748 }
2749
2750 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2751    Check if a text is encoded in BIG5.  If it is, return
2752    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2753
2754 static int
2755 detect_coding_big5 (src, src_end, multibytep)
2756      unsigned char *src, *src_end;
2757      int multibytep;
2758 {
2759   int c;
2760   /* Dummy for ONE_MORE_BYTE.  */
2761   struct coding_system dummy_coding;
2762   struct coding_system *coding = &dummy_coding;
2763
2764   while (1)
2765     {
2766       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2767       if (c < 0x80)
2768         continue;
2769       if (c < 0xA1 || c > 0xFE)
2770         return 0;
2771       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2772       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2773         return 0;
2774     }
2775  label_end_of_loop:
2776   return CODING_CATEGORY_MASK_BIG5;
2777 }
2778
2779 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2780    Check if a text is encoded in UTF-8.  If it is, return
2781    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2782
2783 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2784 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2785 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2786 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2787 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2788 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2789 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2790
2791 static int
2792 detect_coding_utf_8 (src, src_end, multibytep)
2793      unsigned char *src, *src_end;
2794      int multibytep;
2795 {
2796   unsigned char c;
2797   int seq_maybe_bytes;
2798   /* Dummy for ONE_MORE_BYTE.  */
2799   struct coding_system dummy_coding;
2800   struct coding_system *coding = &dummy_coding;
2801
2802   while (1)
2803     {
2804       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2805       if (UTF_8_1_OCTET_P (c))
2806         continue;
2807       else if (UTF_8_2_OCTET_LEADING_P (c))
2808         seq_maybe_bytes = 1;
2809       else if (UTF_8_3_OCTET_LEADING_P (c))
2810         seq_maybe_bytes = 2;
2811       else if (UTF_8_4_OCTET_LEADING_P (c))
2812         seq_maybe_bytes = 3;
2813       else if (UTF_8_5_OCTET_LEADING_P (c))
2814         seq_maybe_bytes = 4;
2815       else if (UTF_8_6_OCTET_LEADING_P (c))
2816         seq_maybe_bytes = 5;
2817       else
2818         return 0;
2819
2820       do
2821         {
2822           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2823           if (!UTF_8_EXTRA_OCTET_P (c))
2824             return 0;
2825           seq_maybe_bytes--;
2826         }
2827       while (seq_maybe_bytes > 0);
2828     }
2829
2830  label_end_of_loop:
2831   return CODING_CATEGORY_MASK_UTF_8;
2832 }
2833
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking for a byte order
   mark in its first two bytes.  If they are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are 0xFF 0xFE, return
   CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
2839
/* Nonzero if VAL is one of the two values 0xFFFE and 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   The six high-order bits are compared, so that neither low
   surrogates nor unrelated values such as 0xF900 match.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2849
2850 static int
2851 detect_coding_utf_16 (src, src_end, multibytep)
2852      unsigned char *src, *src_end;
2853      int multibytep;
2854 {
2855   unsigned char c1, c2;
2856   /* Dummy for TWO_MORE_BYTES.  */
2857   struct coding_system dummy_coding;
2858   struct coding_system *coding = &dummy_coding;
2859
2860   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2861   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2862
2863   if ((c1 == 0xFF) && (c2 == 0xFE))
2864     return CODING_CATEGORY_MASK_UTF_16_LE;
2865   else if ((c1 == 0xFE) && (c2 == 0xFF))
2866     return CODING_CATEGORY_MASK_UTF_16_BE;
2867
2868  label_end_of_loop:
2869   return 0;
2870 }
2871
2872 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2873    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2874
2875 static void
2876 decode_coding_sjis_big5 (coding, source, destination,
2877                          src_bytes, dst_bytes, sjis_p)
2878      struct coding_system *coding;
2879      unsigned char *source, *destination;
2880      int src_bytes, dst_bytes;
2881      int sjis_p;
2882 {
2883   unsigned char *src = source;
2884   unsigned char *src_end = source + src_bytes;
2885   unsigned char *dst = destination;
2886   unsigned char *dst_end = destination + dst_bytes;
2887   /* SRC_BASE remembers the start position in source in each loop.
2888      The loop will be exited when there's not enough source code
2889      (within macro ONE_MORE_BYTE), or when there's not enough
2890      destination area to produce a character (within macro
2891      EMIT_CHAR).  */
2892   unsigned char *src_base;
2893   Lisp_Object translation_table;
2894
2895   if (NILP (Venable_character_translation))
2896     translation_table = Qnil;
2897   else
2898     {
2899       translation_table = coding->translation_table_for_decode;
2900       if (NILP (translation_table))
2901         translation_table = Vstandard_translation_table_for_decode;
2902     }
2903
2904   coding->produced_char = 0;
2905   while (1)
2906     {
2907       int c, charset, c1, c2;
2908
2909       src_base = src;
2910       ONE_MORE_BYTE (c1);
2911
2912       if (c1 < 0x80)
2913         {
2914           charset = CHARSET_ASCII;
2915           if (c1 < 0x20)
2916             {
2917               if (c1 == '\r')
2918                 {
2919                   if (coding->eol_type == CODING_EOL_CRLF)
2920                     {
2921                       ONE_MORE_BYTE (c2);
2922                       if (c2 == '\n')
2923                         c1 = c2;
2924                       else
2925                         /* To process C2 again, SRC is subtracted by 1.  */
2926                         src--;
2927                     }
2928                   else if (coding->eol_type == CODING_EOL_CR)
2929                     c1 = '\n';
2930                 }
2931               else if (c1 == '\n'
2932                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2933                        && (coding->eol_type == CODING_EOL_CR
2934                            || coding->eol_type == CODING_EOL_CRLF))
2935                 {
2936                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2937                   goto label_end_of_loop;
2938                 }
2939             }
2940         }
2941       else
2942         {
2943           if (sjis_p)
2944             {
2945               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
2946                 goto label_invalid_code;
2947               if (c1 <= 0x9F || c1 >= 0xE0)
2948                 {
2949                   /* SJIS -> JISX0208 */
2950                   ONE_MORE_BYTE (c2);
2951                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2952                     goto label_invalid_code;
2953                   DECODE_SJIS (c1, c2, c1, c2);
2954                   charset = charset_jisx0208;
2955                 }
2956               else
2957                 /* SJIS -> JISX0201-Kana */
2958                 charset = charset_katakana_jisx0201;
2959             }
2960           else
2961             {
2962               /* BIG5 -> Big5 */
2963               if (c1 < 0xA0 || c1 > 0xFE)
2964                 goto label_invalid_code;
2965               ONE_MORE_BYTE (c2);
2966               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2967                 goto label_invalid_code;
2968               DECODE_BIG5 (c1, c2, charset, c1, c2);
2969             }
2970         }
2971
2972       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2973       EMIT_CHAR (c);
2974       continue;
2975
2976     label_invalid_code:
2977       coding->errors++;
2978       src = src_base;
2979       c = *src++;
2980       EMIT_CHAR (c);
2981     }
2982
2983  label_end_of_loop:
2984   coding->consumed = coding->consumed_char = src_base - source;
2985   coding->produced = dst - destination;
2986   return;
2987 }
2988
2989 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2990    This function can encode charsets `ascii', `katakana-jisx0201',
2991    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2992    are sure that all these charsets are registered as official charset
2993    (i.e. do not have extended leading-codes).  Characters of other
2994    charsets are produced without any encoding.  If SJIS_P is 1, encode
2995    SJIS text, else encode BIG5 text.  */
2996
2997 static void
2998 encode_coding_sjis_big5 (coding, source, destination,
2999                          src_bytes, dst_bytes, sjis_p)
3000      struct coding_system *coding;
3001      unsigned char *source, *destination;
3002      int src_bytes, dst_bytes;
3003      int sjis_p;
3004 {
3005   unsigned char *src = source;
3006   unsigned char *src_end = source + src_bytes;
3007   unsigned char *dst = destination;
3008   unsigned char *dst_end = destination + dst_bytes;
3009   /* SRC_BASE remembers the start position in source in each loop.
3010      The loop will be exited when there's not enough source text to
3011      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3012      there's not enough destination area to produce encoded codes
3013      (within macro EMIT_BYTES).  */
3014   unsigned char *src_base;
3015   Lisp_Object translation_table;
3016
3017   if (NILP (Venable_character_translation))
3018     translation_table = Qnil;
3019   else
3020     {
3021       translation_table = coding->translation_table_for_encode;
3022       if (NILP (translation_table))
3023         translation_table = Vstandard_translation_table_for_encode;
3024     }
3025
3026   while (1)
3027     {
3028       int c, charset, c1, c2;
3029
3030       src_base = src;
3031       ONE_MORE_CHAR (c);
3032
3033       /* Now encode the character C.  */
3034       if (SINGLE_BYTE_CHAR_P (c))
3035         {
3036           switch (c)
3037             {
3038             case '\r':
3039               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3040                 {
3041                   EMIT_ONE_BYTE (c);
3042                   break;
3043                 }
3044               c = '\n';
3045             case '\n':
3046               if (coding->eol_type == CODING_EOL_CRLF)
3047                 {
3048                   EMIT_TWO_BYTES ('\r', c);
3049                   break;
3050                 }
3051               else if (coding->eol_type == CODING_EOL_CR)
3052                 c = '\r';
3053             default:
3054               EMIT_ONE_BYTE (c);
3055             }
3056         }
3057       else
3058         {
3059           SPLIT_CHAR (c, charset, c1, c2);
3060           if (sjis_p)
3061             {
3062               if (charset == charset_jisx0208
3063                   || charset == charset_jisx0208_1978)
3064                 {
3065                   ENCODE_SJIS (c1, c2, c1, c2);
3066                   EMIT_TWO_BYTES (c1, c2);
3067                 }
3068               else if (charset == charset_katakana_jisx0201)
3069                 EMIT_ONE_BYTE (c1 | 0x80);
3070               else if (charset == charset_latin_jisx0201)
3071                 EMIT_ONE_BYTE (c1);
3072               else
3073                 /* There's no way other than producing the internal
3074                    codes as is.  */
3075                 EMIT_BYTES (src_base, src);
3076             }
3077           else
3078             {
3079               if (charset == charset_big5_1 || charset == charset_big5_2)
3080                 {
3081                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3082                   EMIT_TWO_BYTES (c1, c2);
3083                 }
3084               else
3085                 /* There's no way other than producing the internal
3086                    codes as is.  */
3087                 EMIT_BYTES (src_base, src);
3088             }
3089         }
3090       coding->consumed_char++;
3091     }
3092
3093  label_end_of_loop:
3094   coding->consumed = src_base - source;
3095   coding->produced = coding->produced_char = dst - destination;
3096 }
3097
3098 \f
3099 /*** 5. CCL handlers ***/
3100
3101 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3102    Check if a text is encoded in a coding system of which
3103    encoder/decoder are written in CCL program.  If it is, return
3104    CODING_CATEGORY_MASK_CCL, else return 0.  */
3105
3106 static int
3107 detect_coding_ccl (src, src_end, multibytep)
3108      unsigned char *src, *src_end;
3109      int multibytep;
3110 {
3111   unsigned char *valid;
3112   int c;
3113   /* Dummy for ONE_MORE_BYTE.  */
3114   struct coding_system dummy_coding;
3115   struct coding_system *coding = &dummy_coding;
3116
3117   /* No coding system is assigned to coding-category-ccl.  */
3118   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3119     return 0;
3120
3121   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3122   while (1)
3123     {
3124       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3125       if (! valid[c])
3126         return 0;
3127     }
3128  label_end_of_loop:
3129   return CODING_CATEGORY_MASK_CCL;
3130 }
3131
3132 \f
3133 /*** 6. End-of-line handlers ***/
3134
3135 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3136
3137 static void
3138 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3139      struct coding_system *coding;
3140      unsigned char *source, *destination;
3141      int src_bytes, dst_bytes;
3142 {
3143   unsigned char *src = source;
3144   unsigned char *dst = destination;
3145   unsigned char *src_end = src + src_bytes;
3146   unsigned char *dst_end = dst + dst_bytes;
3147   Lisp_Object translation_table;
3148   /* SRC_BASE remembers the start position in source in each loop.
3149      The loop will be exited when there's not enough source code
3150      (within macro ONE_MORE_BYTE), or when there's not enough
3151      destination area to produce a character (within macro
3152      EMIT_CHAR).  */
3153   unsigned char *src_base;
3154   int c;
3155
3156   translation_table = Qnil;
3157   switch (coding->eol_type)
3158     {
3159     case CODING_EOL_CRLF:
3160       while (1)
3161         {
3162           src_base = src;
3163           ONE_MORE_BYTE (c);
3164           if (c == '\r')
3165             {
3166               ONE_MORE_BYTE (c);
3167               if (c != '\n')
3168                 {
3169                   src--;
3170                   c = '\r';
3171                 }
3172             }
3173           else if (c == '\n'
3174                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3175             {
3176               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3177               goto label_end_of_loop;
3178             }
3179           EMIT_CHAR (c);
3180         }
3181       break;
3182
3183     case CODING_EOL_CR:
3184       while (1)
3185         {
3186           src_base = src;
3187           ONE_MORE_BYTE (c);
3188           if (c == '\n')
3189             {
3190               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3191                 {
3192                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3193                   goto label_end_of_loop;
3194                 }
3195             }
3196           else if (c == '\r')
3197             c = '\n';
3198           EMIT_CHAR (c);
3199         }
3200       break;
3201
3202     default:                    /* no need for EOL handling */
3203       while (1)
3204         {
3205           src_base = src;
3206           ONE_MORE_BYTE (c);
3207           EMIT_CHAR (c);
3208         }
3209     }
3210
3211  label_end_of_loop:
3212   coding->consumed = coding->consumed_char = src_base - source;
3213   coding->produced = dst - destination;
3214   return;
3215 }
3216
3217 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3218    format of end-of-line according to `coding->eol_type'.  It also
3219    convert multibyte form 8-bit characters to unibyte if
3220    CODING->src_multibyte is nonzero.  If `coding->mode &
3221    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3222    also means end-of-line.  */
3223
3224 static void
3225 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3226      struct coding_system *coding;
3227      const unsigned char *source;
3228      unsigned char *destination;
3229      int src_bytes, dst_bytes;
3230 {
3231   const unsigned char *src = source;
3232   unsigned char *dst = destination;
3233   const unsigned char *src_end = src + src_bytes;
3234   unsigned char *dst_end = dst + dst_bytes;
3235   Lisp_Object translation_table;
3236   /* SRC_BASE remembers the start position in source in each loop.
3237      The loop will be exited when there's not enough source text to
3238      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3239      there's not enough destination area to produce encoded codes
3240      (within macro EMIT_BYTES).  */
3241   const unsigned char *src_base;
3242   unsigned char *tmp;
3243   int c;
3244   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3245
3246   translation_table = Qnil;
3247   if (coding->src_multibyte
3248       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3249     {
3250       src_end--;
3251       src_bytes--;
3252       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3253     }
3254
3255   if (coding->eol_type == CODING_EOL_CRLF)
3256     {
3257       while (src < src_end)
3258         {
3259           src_base = src;
3260           c = *src++;
3261           if (c >= 0x20)
3262             EMIT_ONE_BYTE (c);
3263           else if (c == '\n' || (c == '\r' && selective_display))
3264             EMIT_TWO_BYTES ('\r', '\n');
3265           else
3266             EMIT_ONE_BYTE (c);
3267         }
3268       src_base = src;
3269     label_end_of_loop:
3270       ;
3271     }
3272   else
3273     {
3274       if (!dst_bytes || src_bytes <= dst_bytes)
3275         {
3276           safe_bcopy (src, dst, src_bytes);
3277           src_base = src_end;
3278           dst += src_bytes;
3279         }
3280       else
3281         {
3282           if (coding->src_multibyte
3283               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3284             dst_bytes--;
3285           safe_bcopy (src, dst, dst_bytes);
3286           src_base = src + dst_bytes;
3287           dst = destination + dst_bytes;
3288           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3289         }
3290       if (coding->eol_type == CODING_EOL_CR)
3291         {
3292           for (tmp = destination; tmp < dst; tmp++)
3293             if (*tmp == '\n') *tmp = '\r';
3294         }
3295       else if (selective_display)
3296         {
3297           for (tmp = destination; tmp < dst; tmp++)
3298             if (*tmp == '\r') *tmp = '\n';
3299         }
3300     }
3301   if (coding->src_multibyte)
3302     dst = destination + str_as_unibyte (destination, dst - destination);
3303
3304   coding->consumed = src_base - source;
3305   coding->produced = dst - destination;
3306   coding->produced_char = coding->produced;
3307 }
3308
3309 \f
3310 /*** 7. C library functions ***/
3311
3312 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3313    has a property `coding-system'.  The value of this property is a
3314    vector of length 5 (called the coding-vector).  Among elements of
3315    this vector, the first (element[0]) and the fifth (element[4])
3316    carry important information for decoding/encoding.  Before
3317    decoding/encoding, this information should be set in fields of a
3318    structure of type `coding_system'.
3319
3320    The value of the property `coding-system' can be a symbol of another
3321    subsidiary coding-system.  In that case, Emacs gets coding-vector
3322    from that symbol.
3323
   `element[0]' contains information to be set in `coding->type'.  The
   values and their meanings are as follows:
3326
3327    0 -- coding_type_emacs_mule
3328    1 -- coding_type_sjis
3329    2 -- coding_type_iso2022
3330    3 -- coding_type_big5
3331    4 -- coding_type_ccl encoder/decoder written in CCL
3332    nil -- coding_type_no_conversion
3333    t -- coding_type_undecided (automatic conversion on decoding,
3334                                no-conversion on encoding)
3335
3336    `element[4]' contains information to be set in `coding->flags' and
3337    `coding->spec'.  The meaning varies by `coding->type'.
3338
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
   Meanings of these sub-elements are:
3342
3343    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3344         If the value is an integer of valid charset, the charset is
3345         assumed to be designated to graphic register N initially.
3346
3347         If the value is minus, it is a minus value of charset which
3348         reserves graphic register N, which means that the charset is
3349         not designated initially but should be designated to graphic
3350         register N just before encoding a character in that charset.
3351
3352         If the value is nil, graphic register N is never used on
3353         encoding.
3354
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
        Each value takes t or nil.  See the section ISO2022 of
        `coding.h' for more information.
3358
3359    If `coding->type' is `coding_type_big5', element[4] is t to denote
3360    BIG5-ETen or nil to denote BIG5-HKU.
3361
3362    If `coding->type' takes the other value, element[4] is ignored.
3363
3364    Emacs Lisp's coding systems also carry information about format of
3365    end-of-line in a value of property `eol-type'.  If the value is
3366    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3367    means CODING_EOL_CR.  If it is not integer, it should be a vector
3368    of subsidiary coding systems of which property `eol-type' has one
3369    of the above values.
3370
3371 */
3372
3373 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3374    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3375    is setup so that no conversion is necessary and return -1, else
3376    return 0.  */
3377
3378 int
3379 setup_coding_system (coding_system, coding)
3380      Lisp_Object coding_system;
3381      struct coding_system *coding;
3382 {
3383   Lisp_Object coding_spec, coding_type, eol_type, plist;
3384   Lisp_Object val;
3385
3386   /* At first, zero clear all members.  */
3387   bzero (coding, sizeof (struct coding_system));
3388
3389   /* Initialize some fields required for all kinds of coding systems.  */
3390   coding->symbol = coding_system;
3391   coding->heading_ascii = -1;
3392   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3393   coding->composing = COMPOSITION_DISABLED;
3394   coding->cmp_data = NULL;
3395
3396   if (NILP (coding_system))
3397     goto label_invalid_coding_system;
3398
3399   coding_spec = Fget (coding_system, Qcoding_system);
3400
3401   if (!VECTORP (coding_spec)
3402       || XVECTOR (coding_spec)->size != 5
3403       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3404     goto label_invalid_coding_system;
3405
3406   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3407   if (VECTORP (eol_type))
3408     {
3409       coding->eol_type = CODING_EOL_UNDECIDED;
3410       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3411     }
3412   else if (XFASTINT (eol_type) == 1)
3413     {
3414       coding->eol_type = CODING_EOL_CRLF;
3415       coding->common_flags
3416         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3417     }
3418   else if (XFASTINT (eol_type) == 2)
3419     {
3420       coding->eol_type = CODING_EOL_CR;
3421       coding->common_flags
3422         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3423     }
3424   else
3425     coding->eol_type = CODING_EOL_LF;
3426
3427   coding_type = XVECTOR (coding_spec)->contents[0];
3428   /* Try short cut.  */
3429   if (SYMBOLP (coding_type))
3430     {
3431       if (EQ (coding_type, Qt))
3432         {
3433           coding->type = coding_type_undecided;
3434           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3435         }
3436       else
3437         coding->type = coding_type_no_conversion;
3438       /* Initialize this member.  Any thing other than
3439          CODING_CATEGORY_IDX_UTF_16_BE and
3440          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3441          special treatment in detect_eol.  */
3442       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3443
3444       return 0;
3445     }
3446
3447   /* Get values of coding system properties:
3448      `post-read-conversion', `pre-write-conversion',
3449      `translation-table-for-decode', `translation-table-for-encode'.  */
3450   plist = XVECTOR (coding_spec)->contents[3];
  /* Pre & post conversion functions should be disabled if
     inhibit_pre_post_conversion is nonzero.  This is the case when a
     code conversion function is called while those functions are
     running.  */
3454   if (! inhibit_pre_post_conversion)
3455     {
3456       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3457       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3458     }
3459   val = Fplist_get (plist, Qtranslation_table_for_decode);
3460   if (SYMBOLP (val))
3461     val = Fget (val, Qtranslation_table_for_decode);
3462   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3463   val = Fplist_get (plist, Qtranslation_table_for_encode);
3464   if (SYMBOLP (val))
3465     val = Fget (val, Qtranslation_table_for_encode);
3466   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3467   val = Fplist_get (plist, Qcoding_category);
3468   if (!NILP (val))
3469     {
3470       val = Fget (val, Qcoding_category_index);
3471       if (INTEGERP (val))
3472         coding->category_idx = XINT (val);
3473       else
3474         goto label_invalid_coding_system;
3475     }
3476   else
3477     goto label_invalid_coding_system;
3478
3479   /* If the coding system has non-nil `composition' property, enable
3480      composition handling.  */
3481   val = Fplist_get (plist, Qcomposition);
3482   if (!NILP (val))
3483     coding->composing = COMPOSITION_NO;
3484
3485   switch (XFASTINT (coding_type))
3486     {
3487     case 0:
3488       coding->type = coding_type_emacs_mule;
3489       coding->common_flags
3490         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3491       if (!NILP (coding->post_read_conversion))
3492         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3493       if (!NILP (coding->pre_write_conversion))
3494         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3495       break;
3496
3497     case 1:
3498       coding->type = coding_type_sjis;
3499       coding->common_flags
3500         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3501       break;
3502
3503     case 2:
3504       coding->type = coding_type_iso2022;
3505       coding->common_flags
3506         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3507       {
3508         Lisp_Object val, temp;
3509         Lisp_Object *flags;
3510         int i, charset, reg_bits = 0;
3511
3512         val = XVECTOR (coding_spec)->contents[4];
3513
3514         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3515           goto label_invalid_coding_system;
3516
3517         flags = XVECTOR (val)->contents;
3518         coding->flags
3519           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3520              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3521              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3522              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3523              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3524              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3525              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3526              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3527              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3528              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3529              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3530              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3531              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3532              );
3533
3534         /* Invoke graphic register 0 to plane 0.  */
3535         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3536         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3537         CODING_SPEC_ISO_INVOCATION (coding, 1)
3538           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3539         /* Not single shifting at first.  */
3540         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3541         /* Beginning of buffer should also be regarded as bol. */
3542         CODING_SPEC_ISO_BOL (coding) = 1;
3543
3544         for (charset = 0; charset <= MAX_CHARSET; charset++)
3545           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3546         val = Vcharset_revision_alist;
3547         while (CONSP (val))
3548           {
3549             charset = get_charset_id (Fcar_safe (XCAR (val)));
3550             if (charset >= 0
3551                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3552                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3553               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3554             val = XCDR (val);
3555           }
3556
3557         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3558            FLAGS[REG] can be one of below:
3559                 integer CHARSET: CHARSET occupies register I,
3560                 t: designate nothing to REG initially, but can be used
3561                   by any charsets,
3562                 list of integer, nil, or t: designate the first
3563                   element (if integer) to REG initially, the remaining
3564                   elements (if integer) is designated to REG on request,
3565                   if an element is t, REG can be used by any charsets,
3566                 nil: REG is never used.  */
3567         for (charset = 0; charset <= MAX_CHARSET; charset++)
3568           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3569             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3570         for (i = 0; i < 4; i++)
3571           {
3572             if ((INTEGERP (flags[i])
3573                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3574                 || (charset = get_charset_id (flags[i])) >= 0)
3575               {
3576                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3577                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3578               }
3579             else if (EQ (flags[i], Qt))
3580               {
3581                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3582                 reg_bits |= 1 << i;
3583                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3584               }
3585             else if (CONSP (flags[i]))
3586               {
3587                 Lisp_Object tail;
3588                 tail = flags[i];
3589
3590                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3591                 if ((INTEGERP (XCAR (tail))
3592                      && (charset = XINT (XCAR (tail)),
3593                          CHARSET_VALID_P (charset)))
3594                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3595                   {
3596                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3597                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3598                   }
3599                 else
3600                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3601                 tail = XCDR (tail);
3602                 while (CONSP (tail))
3603                   {
3604                     if ((INTEGERP (XCAR (tail))
3605                          && (charset = XINT (XCAR (tail)),
3606                              CHARSET_VALID_P (charset)))
3607                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3608                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3609                         = i;
3610                     else if (EQ (XCAR (tail), Qt))
3611                       reg_bits |= 1 << i;
3612                     tail = XCDR (tail);
3613                   }
3614               }
3615             else
3616               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3617
3618             CODING_SPEC_ISO_DESIGNATION (coding, i)
3619               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3620           }
3621
3622         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3623           {
3624             /* REG 1 can be used only by locking shift in 7-bit env.  */
3625             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3626               reg_bits &= ~2;
3627             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3628               /* Without any shifting, only REG 0 and 1 can be used.  */
3629               reg_bits &= 3;
3630           }
3631
3632         if (reg_bits)
3633           for (charset = 0; charset <= MAX_CHARSET; charset++)
3634             {
3635               if (CHARSET_DEFINED_P (charset)
3636                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3637                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3638                 {
3639                   /* There exist some default graphic registers to be
3640                      used by CHARSET.  */
3641
3642                   /* We had better avoid designating a charset of
3643                      CHARS96 to REG 0 as far as possible.  */
3644                   if (CHARSET_CHARS (charset) == 96)
3645                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3646                       = (reg_bits & 2
3647                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3648                   else
3649                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3650                       = (reg_bits & 1
3651                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3652                 }
3653             }
3654       }
3655       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3656       coding->spec.iso2022.last_invalid_designation_register = -1;
3657       break;
3658
3659     case 3:
3660       coding->type = coding_type_big5;
3661       coding->common_flags
3662         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3663       coding->flags
3664         = (NILP (XVECTOR (coding_spec)->contents[4])
3665            ? CODING_FLAG_BIG5_HKU
3666            : CODING_FLAG_BIG5_ETEN);
3667       break;
3668
3669     case 4:
3670       coding->type = coding_type_ccl;
3671       coding->common_flags
3672         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3673       {
3674         val = XVECTOR (coding_spec)->contents[4];
3675         if (! CONSP (val)
3676             || setup_ccl_program (&(coding->spec.ccl.decoder),
3677                                   XCAR (val)) < 0
3678             || setup_ccl_program (&(coding->spec.ccl.encoder),
3679                                   XCDR (val)) < 0)
3680           goto label_invalid_coding_system;
3681
3682         bzero (coding->spec.ccl.valid_codes, 256);
3683         val = Fplist_get (plist, Qvalid_codes);
3684         if (CONSP (val))
3685           {
3686             Lisp_Object this;
3687
3688             for (; CONSP (val); val = XCDR (val))
3689               {
3690                 this = XCAR (val);
3691                 if (INTEGERP (this)
3692                     && XINT (this) >= 0 && XINT (this) < 256)
3693                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3694                 else if (CONSP (this)
3695                          && INTEGERP (XCAR (this))
3696                          && INTEGERP (XCDR (this)))
3697                   {
3698                     int start = XINT (XCAR (this));
3699                     int end = XINT (XCDR (this));
3700
3701                     if (start >= 0 && start <= end && end < 256)
3702                       while (start <= end)
3703                         coding->spec.ccl.valid_codes[start++] = 1;
3704                   }
3705               }
3706           }
3707       }
3708       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3709       coding->spec.ccl.cr_carryover = 0;
3710       coding->spec.ccl.eight_bit_carryover[0] = 0;
3711       break;
3712
3713     case 5:
3714       coding->type = coding_type_raw_text;
3715       break;
3716
3717     default:
3718       goto label_invalid_coding_system;
3719     }
3720   return 0;
3721
3722  label_invalid_coding_system:
3723   coding->type = coding_type_no_conversion;
3724   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3725   coding->common_flags = 0;
3726   coding->eol_type = CODING_EOL_LF;
3727   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3728   return -1;
3729 }
3730
3731 /* Release all composition data blocks chained from CODING->cmp_data.  */
3732
3733 void
3734 coding_free_composition_data (coding)
3735      struct coding_system *coding;
3736 {
3737   struct composition_data *block = coding->cmp_data, *following;
3738
3739   if (block == NULL)
3740     return;
3741   /* The blocks are linked both ways.  Step back through the `prev'
3742      links to the first block, then free forward from there.  */
3743   for (; block->prev; block = block->prev)
3744     ;
3745   for (; block; block = following)
3746     {
3747       /* Read the successor before BLOCK itself is released.  */
3748       following = block->next;
3749       xfree (block);
3750     }
3751   coding->cmp_data = NULL;
3752 }
3753
3754 /* Store POS in the `char_offset' member of CODING->cmp_data and of
3755    every block reachable from it through the `next' links.  */
3756
3757 void
3758 coding_adjust_composition_offset (coding, pos)
3759      struct coding_system *coding;
3760      int pos;
3761 {
3762   struct composition_data *block = coding->cmp_data;
3763
3764   while (block)
3765     block->char_offset = pos, block = block->next;
3766 }
3767
3768 /* Switch CODING over to raw-text.  If CODING->eol_type is already
3769    decided and raw-text's `eol-type' property is a vector of three
3770    elements, the element indexed by that eol_type is used instead of
3771    raw-text itself.  Nothing is done if CODING is raw-text already.  */
3772
3773 void
3774 setup_raw_text_coding_system (coding)
3775      struct coding_system *coding;
3776 {
3777   Lisp_Object eol_variants;
3778
3779   if (coding->type == coding_type_raw_text)
3780     return;
3781
3782   coding->symbol = Qraw_text;
3783   coding->type = coding_type_raw_text;
3784   if (coding->eol_type != CODING_EOL_UNDECIDED)
3785     {
3786       /* Prefer the subsidiary that matches the known eol format.  */
3787       eol_variants = Fget (Qraw_text, Qeol_type);
3788       if (VECTORP (eol_variants) && XVECTOR (eol_variants)->size == 3)
3789         coding->symbol = XVECTOR (eol_variants)->contents[coding->eol_type];
3790     }
3791
3792   /* Redo the whole setup of CODING from the symbol chosen above.  */
3793   setup_coding_system (coding->symbol, coding);
3794 }
3795
3796 /* Emacs has a mechanism to automatically detect a coding system if it
3797    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3798    it's impossible to distinguish some coding systems accurately
3799    because they use the same range of codes.  So, at first, coding
3800    systems are categorized into the following categories:
3801
3802    o coding-category-emacs-mule
3803
3804         The category for a coding system which has the same code range
3805         as Emacs' internal format.  Assigned the coding-system (Lisp
3806         symbol) `emacs-mule' by default.
3807
3808    o coding-category-sjis
3809
3810         The category for a coding system which has the same code range
3811         as SJIS.  Assigned the coding-system (Lisp
3812         symbol) `japanese-shift-jis' by default.
3813
3814    o coding-category-iso-7
3815
3816         The category for a coding system which has the same code range
3817         as ISO2022 of 7-bit environment.  This doesn't use any locking
3818         shift and single shift functions.  This can encode/decode all
3819         charsets.  Assigned the coding-system (Lisp symbol)
3820         `iso-2022-7bit' by default.
3821
3822    o coding-category-iso-7-tight
3823
3824         Same as coding-category-iso-7 except that this can
3825         encode/decode only the specified charsets.
3826
3827    o coding-category-iso-8-1
3828
3829         The category for a coding system which has the same code range
3830         as ISO2022 of 8-bit environment and graphic plane 1 used only
3831         for DIMENSION1 charset.  This doesn't use any locking shift
3832         and single shift functions.  Assigned the coding-system (Lisp
3833         symbol) `iso-latin-1' by default.
3834
3835    o coding-category-iso-8-2
3836
3837         The category for a coding system which has the same code range
3838         as ISO2022 of 8-bit environment and graphic plane 1 used only
3839         for DIMENSION2 charset.  This doesn't use any locking shift
3840         and single shift functions.  Assigned the coding-system (Lisp
3841         symbol) `japanese-iso-8bit' by default.
3842
3843    o coding-category-iso-7-else
3844
3845         The category for a coding system which has the same code range
3846         as ISO2022 of 7-bit environment but uses locking shift or
3847         single shift functions.  Assigned the coding-system (Lisp
3848         symbol) `iso-2022-7bit-lock' by default.
3849
3850    o coding-category-iso-8-else
3851
3852         The category for a coding system which has the same code range
3853         as ISO2022 of 8-bit environment but uses locking shift or
3854         single shift functions.  Assigned the coding-system (Lisp
3855         symbol) `iso-2022-8bit-ss2' by default.
3856
3857    o coding-category-big5
3858
3859         The category for a coding system which has the same code range
3860         as BIG5.  Assigned the coding-system (Lisp symbol)
3861         `cn-big5' by default.
3862
3863    o coding-category-utf-8
3864
3865         The category for a coding system which has the same code range
3866         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3867         symbol) `utf-8' by default.
3868
3869    o coding-category-utf-16-be
3870
3871         The category for a coding system in which a text has an
3872         Unicode signature (cf. Unicode Standard) in the order of BIG
3873         endian at the head.  Assigned the coding-system (Lisp symbol)
3874         `utf-16-be' by default.
3875
3876    o coding-category-utf-16-le
3877
3878         The category for a coding system in which a text has an
3879         Unicode signature (cf. Unicode Standard) in the order of
3880         LITTLE endian at the head.  Assigned the coding-system (Lisp
3881         symbol) `utf-16-le' by default.
3882
3883    o coding-category-ccl
3884
3885         The category for a coding system of which encoder/decoder is
3886         written in CCL programs.  The default value is nil, i.e., no
3887         coding system is assigned.
3888
3889    o coding-category-binary
3890
3891         The category for a coding system not categorized in any of the
3892         above.  Assigned the coding-system (Lisp symbol)
3893         `no-conversion' by default.
3894
3895    Each of them is a Lisp symbol and the value is an actual
3896    `coding-system' (this is also a Lisp symbol) assigned by a user.
3897    What Emacs does actually is to detect a category of coding system.
3898    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3899    decide a single possible category, it selects a category of the
3900    highest priority.  Priorities of categories are also specified by a
3901    user in a Lisp variable `coding-category-list'.
3902
3903 */
3904
3905 static
3906 int ascii_skip_code[256];       /* Nonzero at a byte value makes detect_coding_mask skip that byte.  */
3907
3908 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3909    If it detects possible coding systems, return an integer in which
3910    appropriate flag bits are set.  Flag bits are defined by macros
3911    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3912    it should point the table `coding_priorities'.  In that case, only
3913    the flag bit for a coding system of the highest priority is set in
3914    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
3915    range 0x80..0x9F are in multibyte form.  Return 0 if nothing but
3916    bytes marked in `ascii_skip_code' are found.
3917    How many ASCII characters are at the head is returned as *SKIP.  */
3918
3919 static int
3920 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
3921      unsigned char *source;
3922      int src_bytes, *priorities, *skip;
3923      int multibytep;
3924 {
3925   register unsigned char c;
3926   unsigned char *src = source, *src_end = source + src_bytes;
3927   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3928   int i;
3929
3930   /* At first, skip all ASCII characters and control characters except
3931      for three ISO2022 specific control characters.  */
3932   ascii_skip_code[ISO_CODE_SO] = 0;
3933   ascii_skip_code[ISO_CODE_SI] = 0;
3934   ascii_skip_code[ISO_CODE_ESC] = 0;
3935
3936  label_loop_detect_coding:
3937   while (src < src_end && ascii_skip_code[*src]) src++;
3938   *skip = src - source;         /* Updated again on every rescan.  */
3939
3940   if (src >= src_end)
3941     /* We found nothing other than ASCII.  There's nothing to do.  */
3942     return 0;
3943
3944   c = *src;
3945   /* The text seems to be encoded in some multilingual coding system.
3946      Now, try to find in which coding system the text is encoded.  */
3947   if (c < 0x80)
3948     {
3949       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3950       /* C is an ISO2022 specific control code of C0.  */
3951       mask = detect_coding_iso2022 (src, src_end, multibytep);
3952       if (mask == 0)
3953         {
3954           /* No valid ISO2022 code follows C.  Try again.  */
3955           src++;
3956           if (c == ISO_CODE_ESC)
3957             ascii_skip_code[ISO_CODE_ESC] = 1;  /* Skip ESC from now on.  */
3958           else
3959             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3960           goto label_loop_detect_coding;
3961         }
3962       if (priorities)
3963         {
3964           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3965             {
3966               if (mask & priorities[i])
3967                 return priorities[i];
3968             }
3969           return CODING_CATEGORY_MASK_RAW_TEXT;  /* No prioritized category matched.  */
3970         }
3971     }
3972   else
3973     {
3974       int try;
3975
3976       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
3977         c = src[1] - 0x20;      /* NOTE(review): src[1] is read without a src_end check -- confirm.  */
3978
3979       if (c < 0xA0)
3980         {
3981           /* C is the first byte of SJIS character code,
3982              or a leading-code of Emacs' internal format (emacs-mule),
3983              or the first byte of UTF-16.  */
3984           try = (CODING_CATEGORY_MASK_SJIS
3985                   | CODING_CATEGORY_MASK_EMACS_MULE
3986                   | CODING_CATEGORY_MASK_UTF_16_BE
3987                   | CODING_CATEGORY_MASK_UTF_16_LE);
3988
3989           /* If C is a latin extra code, SS2, SS3, or CSI, ISO2022 codings
3990              are candidates too.  NOTE(review): SRC still points at the byte
3991              C was read from (>= 0x80), so the `*src' tests in the CSI
3992              clause look unable to match -- was src + 1 intended?  Confirm.  */
3993           if ((VECTORP (Vlatin_extra_code_table)
3994                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3995               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3996               || (c == ISO_CODE_CSI
3997                   && (src < src_end
3998                       && (*src == ']'
3999                           || ((*src == '0' || *src == '1' || *src == '2')
4000                               && src + 1 < src_end
4001                               && src[1] == ']')))))
4002             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4003                      | CODING_CATEGORY_MASK_ISO_8BIT);
4004         }
4005       else
4006         /* C is a character of ISO2022 in graphic plane right,
4007            or a SJIS's 1-byte character code (i.e. JISX0201),
4008            or the first byte of BIG5's 2-byte code,
4009            or the first byte of UTF-8/16.  */
4010         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4011                 | CODING_CATEGORY_MASK_ISO_8BIT
4012                 | CODING_CATEGORY_MASK_SJIS
4013                 | CODING_CATEGORY_MASK_BIG5
4014                 | CODING_CATEGORY_MASK_UTF_8
4015                 | CODING_CATEGORY_MASK_UTF_16_BE
4016                 | CODING_CATEGORY_MASK_UTF_16_LE);
4017
4018       /* Or, we may have to consider the possibility of CCL.  */
4019       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4020           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4021               ->spec.ccl.valid_codes)[c])
4022         try |= CODING_CATEGORY_MASK_CCL;
4023
4024       mask = 0;
4025       utf16_examined_p = iso2022_examined_p = 0;
4026       if (priorities)
4027         {
4028           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4029             {
4030               if (!iso2022_examined_p
4031                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4032                 {
4033                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4034                   iso2022_examined_p = 1;       /* Run this detector only once.  */
4035                 }
4036               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4037                 mask |= detect_coding_sjis (src, src_end, multibytep);
4038               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4039                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4040               else if (!utf16_examined_p
4041                        && (priorities[i] & try &
4042                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4043                 {
4044                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4045                   utf16_examined_p = 1;         /* Run this detector only once.  */
4046                 }
4047               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4048                 mask |= detect_coding_big5 (src, src_end, multibytep);
4049               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4050                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4051               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4052                 mask |= detect_coding_ccl (src, src_end, multibytep);
4053               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4054                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4055               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4056                 mask |= CODING_CATEGORY_MASK_BINARY;
4057               if (mask & priorities[i])
4058                 return priorities[i];   /* First priority entry confirmed so far.  */
4059             }
4060           return CODING_CATEGORY_MASK_RAW_TEXT;  /* No prioritized category matched.  */
4061         }
4062       if (try & CODING_CATEGORY_MASK_ISO)
4063         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4064       if (try & CODING_CATEGORY_MASK_SJIS)
4065         mask |= detect_coding_sjis (src, src_end, multibytep);
4066       if (try & CODING_CATEGORY_MASK_BIG5)
4067         mask |= detect_coding_big5 (src, src_end, multibytep);
4068       if (try & CODING_CATEGORY_MASK_UTF_8)
4069         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4070       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4071         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4072       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4073         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4074       if (try & CODING_CATEGORY_MASK_CCL)
4075         mask |= detect_coding_ccl (src, src_end, multibytep);
4076     }
4077   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);  /* These two bits are always set here.  */
4078 }
4079
4080 /* Guess how the SRC_BYTES bytes at SRC are encoded and set up CODING
4081    for the coding system assigned to the detected category.  */
4082
4083 void
4084 detect_coding (coding, src, src_bytes)
4085      struct coding_system *coding;
4086      const unsigned char *src;
4087      int src_bytes;
4088 {
4089   unsigned int category;
4090   int ascii_head, found;
4091   int saved_src_multibyte, saved_dst_multibyte;
4092   Lisp_Object coding_system, eol_variants;
4093
4094   found = detect_coding_mask (src, src_bytes, coding_priorities,
4095                               &ascii_head, coding->src_multibyte);
4096   coding->heading_ascii = ascii_head;
4097   /* With nothing detected, only heading_ascii has been updated.  */
4098   if (found == 0)
4099     return;
4100
4101   /* FOUND is expected to carry the bit of one category; turn the
4102      position of its lowest set bit into a category index.  */
4103   for (category = 0; found && ! (found & 1); category++)
4104     found >>= 1;
4105   if (! found)
4106     category = CODING_CATEGORY_IDX_RAW_TEXT;
4107
4108   coding_system
4109     = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[category]);
4110
4111   /* With a decided eol format, take the corresponding subsidiary.  */
4112   if (coding->eol_type != CODING_EOL_UNDECIDED)
4113     {
4114       eol_variants = Fget (coding_system, Qeol_type);
4115       if (VECTORP (eol_variants))
4116         coding_system = XVECTOR (eol_variants)->contents[coding->eol_type];
4117     }
4118
4119   /* Set CODING up anew; the multibyte flags and the heading ASCII
4120      count are put back afterwards.  */
4121   saved_src_multibyte = coding->src_multibyte;
4122   saved_dst_multibyte = coding->dst_multibyte;
4123   setup_coding_system (coding_system, coding);
4124   coding->src_multibyte = saved_src_multibyte;
4125   coding->dst_multibyte = saved_dst_multibyte;
4126   coding->heading_ascii = ascii_head;
4127 }
4128
4129 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4130    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4131    CODING_EOL_CR, CODING_EOL_UNDECIDED (no end-of-line was seen), and
4132    CODING_EOL_INCONSISTENT (two different formats were seen).
4133    How many non-eol characters are at the head is returned as *SKIP.  */
4134
4135 #define MAX_EOL_CHECK_COUNT 3   /* Examine at most this many end-of-lines.  */
4136
4137 static int
4138 detect_eol_type (source, src_bytes, skip)
4139      unsigned char *source;
4140      int src_bytes, *skip;
4141 {
4142   unsigned char *src = source, *src_end = src + src_bytes;
4143   unsigned char c;
4144   int total = 0;                /* How many end-of-lines are found so far.  */
4145   int eol_type = CODING_EOL_UNDECIDED;
4146   int this_eol_type;
4147
4148   *skip = 0;
4149
4150   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4151     {
4152       c = *src++;
4153       if (c == '\n' || c == '\r')
4154         {
4155           if (total == 0)       /* Not `*skip == 0': offset 0 is a valid answer.  */
4156             *skip = src - 1 - source;   /* Offset of the first end-of-line.  */
4157           total++;
4158           if (c == '\n')
4159             this_eol_type = CODING_EOL_LF;
4160           else if (src >= src_end || *src != '\n')
4161             this_eol_type = CODING_EOL_CR;
4162           else
4163             this_eol_type = CODING_EOL_CRLF, src++;     /* Consume the LF too.  */
4164
4165           if (eol_type == CODING_EOL_UNDECIDED)
4166             /* This is the first end-of-line.  */
4167             eol_type = this_eol_type;
4168           else if (eol_type != this_eol_type)
4169             {
4170               /* The found type is different from what found before.  */
4171               eol_type = CODING_EOL_INCONSISTENT;
4172               break;
4173             }
4174         }
4175     }
4176
4177   if (total == 0)               /* No end-of-line seen: the whole text is the head.  */
4178     *skip = src_end - source;
4179   return eol_type;
4180 }
4181
4182 /* Like detect_eol_type, but SOURCE is examined in units of two
4183    octets (for utf-16-be and utf-16-le); the high octet comes first
4184    if BIG_ENDIAN_P is nonzero.  *SKIP is still counted in bytes.  */
4185
4186 static int
4187 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4188      unsigned char *source;
4189      int src_bytes, *skip, big_endian_p;
4190 {
4191   unsigned char *src = source, *src_end = src + src_bytes;
4192   unsigned int c1, c2;
4193   int total = 0;                /* How many end-of-lines are found so far.  */
4194   int eol_type = CODING_EOL_UNDECIDED;
4195   int this_eol_type;
4196   int msb, lsb;                 /* Indices of the high and low octet in a unit.  */
4197
4198   if (big_endian_p)
4199     msb = 0, lsb = 1;
4200   else
4201     msb = 1, lsb = 0;
4202
4203   *skip = 0;
4204
4205   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4206     {
4207       c1 = (src[msb] << 8) | (src[lsb]);
4208       src += 2;
4209
4210       if (c1 == '\n' || c1 == '\r')
4211         {
4212           if (total == 0)       /* Not `*skip == 0': offset 0 is a valid answer.  */
4213             *skip = src - 2 - source;   /* Offset of the first end-of-line.  */
4214           total++;
4215           if (c1 == '\n')
4216             {
4217               this_eol_type = CODING_EOL_LF;
4218             }
4219           else
4220             {
4221               if ((src + 1) >= src_end)
4222                 {
4223                   this_eol_type = CODING_EOL_CR;
4224                 }
4225               else
4226                 {
4227                   c2 = (src[msb] << 8) | (src[lsb]);
4228                   if (c2 == '\n')
4229                     this_eol_type = CODING_EOL_CRLF, src += 2;  /* Consume the LF unit too.  */
4230                   else
4231                     this_eol_type = CODING_EOL_CR;
4232                 }
4233             }
4234
4235           if (eol_type == CODING_EOL_UNDECIDED)
4236             /* This is the first end-of-line.  */
4237             eol_type = this_eol_type;
4238           else if (eol_type != this_eol_type)
4239             {
4240               /* The found type is different from what found before.  */
4241               eol_type = CODING_EOL_INCONSISTENT;
4242               break;
4243             }
4244         }
4245     }
4246
4247   if (total == 0)               /* No end-of-line seen: the whole text is the head.  */
4248     *skip = src_end - source;
4249   return eol_type;
4250 }
4251
4252 /* Find out which end-of-line format the SRC_BYTES bytes at SRC use
4253    and, if one is recognized, set up CODING anew for the matching
4254    element of the `eol-type' vector of CODING->symbol.  */
4255
4256 void
4257 detect_eol (coding, src, src_bytes)
4258      struct coding_system *coding;
4259      const unsigned char *src;
4260      int src_bytes;
4261 {
4262   Lisp_Object subsidiaries;
4263   int category = coding->category_idx;
4264   int head, eol_type;
4265   int saved_src_multibyte, saved_dst_multibyte;
4266   struct composition_data *saved_cmp_data;
4267
4268   /* The UTF-16 categories are scanned in units of two octets.  */
4269   if (category == CODING_CATEGORY_IDX_UTF_16_BE
4270       || category == CODING_CATEGORY_IDX_UTF_16_LE)
4271     eol_type = detect_eol_type_in_2_octet_form
4272       (src, src_bytes, &head, category == CODING_CATEGORY_IDX_UTF_16_BE);
4273   else
4274     eol_type = detect_eol_type (src, src_bytes, &head);
4275
4276   /* Let CODING->heading_ascii and HEAD both hold the smaller value.  */
4277   if (head < coding->heading_ascii)
4278     coding->heading_ascii = head;
4279   head = coding->heading_ascii;
4280
4281   if (eol_type == CODING_EOL_UNDECIDED)
4282     return;
4283
4284   if (eol_type == CODING_EOL_INCONSISTENT)
4285     {
4286 #if 0
4287       /* This code is suppressed until we find a better way to
4288          distinguish raw text file and binary file.  */
4289
4290       /* If we have already detected that the coding is raw-text, the
4291          coding should actually be no-conversion.  */
4292       if (coding->type == coding_type_raw_text)
4293         {
4294           setup_coding_system (Qno_conversion, coding);
4295           return;
4296         }
4297       /* Else, let's decode only text code anyway.  */
4298 #endif /* 0 */
4299       /* Mixed formats are handled as plain LF.  */
4300       eol_type = CODING_EOL_LF;
4301     }
4302
4303   subsidiaries = Fget (coding->symbol, Qeol_type);
4304   if (! VECTORP (subsidiaries) || XVECTOR (subsidiaries)->size != 3)
4305     return;
4306
4307   /* Re-setup CODING for the subsidiary, carrying these slots over.  */
4308   saved_src_multibyte = coding->src_multibyte;
4309   saved_dst_multibyte = coding->dst_multibyte;
4310   saved_cmp_data = coding->cmp_data;
4311
4312   setup_coding_system (XVECTOR (subsidiaries)->contents[eol_type], coding);
4313   coding->src_multibyte = saved_src_multibyte;
4314   coding->dst_multibyte = saved_dst_multibyte;
4315   coding->heading_ascii = head;
4316   coding->cmp_data = saved_cmp_data;
4317 }
4318
4319 #define CONVERSION_BUFFER_EXTRA_ROOM 256        /* Slack added to the size estimate below.  */
4320 /* Factor by which decoding_buffer_size multiplies the source size.  */
4321 #define DECODING_BUFFER_MAG(coding)                     \
4322   ((coding)->type == coding_type_iso2022                \
4323    ? 3                                                  \
4324    : ((coding)->type == coding_type_ccl                 \
4325       ? (coding)->spec.ccl.decoder.buf_magnification    \
4326       : 2))
4327
4328 /* Return maximum size (bytes) of a buffer enough for decoding
4329    SRC_BYTES of text encoded in CODING.  */
4330
4331 int
4332 decoding_buffer_size (coding, src_bytes)
4333      struct coding_system *coding;
4334      int src_bytes;
4335 {
4336   return (src_bytes * DECODING_BUFFER_MAG (coding)
4337           + CONVERSION_BUFFER_EXTRA_ROOM);
4338 }
4339
4340 /* Return the size (bytes) of a buffer regarded as large enough for
4341    encoding SRC_BYTES bytes of text into CODING.  */
4342
4343 int
4344 encoding_buffer_size (coding, src_bytes)
4345      struct coding_system *coding;
4346      int src_bytes;
4347 {
4348   /* A CCL encoder states its own growth factor; otherwise the factor
4349      is 3 when CODING_REQUIRE_ENCODING holds and 1 when it does not.  */
4350   int growth
4351     = (coding->type == coding_type_ccl
4352        ? coding->spec.ccl.encoder.buf_magnification
4353        : (CODING_REQUIRE_ENCODING (coding) ? 3 : 1));
4354   int room = src_bytes * growth;
4355
4356   room += CONVERSION_BUFFER_EXTRA_ROOM;
4357   return room;
4358 }
4359
4360 /* Working buffer for code conversion; filled in by allocate_conversion_buffer.  */
4361 struct conversion_buffer
4362 {
4363   int size;                     /* Number of bytes allocated for DATA.  */
4364   int on_stack;                 /* 1 if DATA came from alloca, 0 if from xmalloc.  */
4365   unsigned char *data;          /* The storage itself.  */
4366 };
4367
4368 /* Don't use alloca for allocating memory space larger than this, lest
4369    we overflow the stack.  */
4370 #define MAX_ALLOCA 16*1024
4371 /* Allocate LEN bytes for BUF (a struct conversion_buffer lvalue), by alloca
4372    if LEN < MAX_ALLOCA, else by xmalloc.  LEN is evaluated several times.  */
4373 #define allocate_conversion_buffer(buf, len)            \
4374   do {                                                  \
4375     if ((len) < (MAX_ALLOCA))                           \
4376       {                                                 \
4377         (buf).data = (unsigned char *) alloca (len);    \
4378         (buf).on_stack = 1;                             \
4379       }                                                 \
4380     else                                                \
4381       {                                                 \
4382         (buf).data = (unsigned char *) xmalloc (len);   \
4383         (buf).on_stack = 0;                             \
4384       }                                                 \
4385     (buf).size = (len);                                 \
4386   } while (0)
4387
4388 /* Double the allocated memory for *BUF.  */
4389 static void
4390 extend_conversion_buffer (buf)
4391      struct conversion_buffer *buf;
4392 {
4393   if (buf->on_stack)
4394     {
4395       unsigned char *save = buf->data;
4396       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4397       bcopy (save, buf->data, buf->size);
4398       buf->on_stack = 0;
4399     }
4400   else
4401     {
4402       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4403     }
4404   buf->size *= 2;
4405 }
4406
4407 /* Free the allocated memory for BUF if it is not on stack.  */
4408 static void
4409 free_conversion_buffer (buf)
4410      struct conversion_buffer *buf;
4411 {
4412   if (!buf->on_stack)
4413     xfree (buf->data);
4414 }
4415
4416 int
4417 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4418      struct coding_system *coding;
4419      unsigned char *source, *destination;
4420      int src_bytes, dst_bytes, encodep;
4421 {
4422   struct ccl_program *ccl
4423     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4424   unsigned char *dst = destination;
4425
4426   ccl->suppress_error = coding->suppress_error;
4427   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4428   if (encodep)
4429     {
4430       /* On encoding, EOL format is converted within ccl_driver.  For
4431          that, setup proper information in the structure CCL.  */
4432       ccl->eol_type = coding->eol_type;
4433       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4434         ccl->eol_type = CODING_EOL_LF;
4435       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4436     }
4437   ccl->multibyte = coding->src_multibyte;
4438   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4439     {
4440       /* Move carryover bytes to DESTINATION.  */
4441       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4442       while (*p)
4443         *dst++ = *p++;
4444       coding->spec.ccl.eight_bit_carryover[0] = 0;
4445       if (dst_bytes)
4446         dst_bytes -= dst - destination;
4447     }
4448
4449   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4450                                   &(coding->consumed))
4451                       + dst - destination);
4452
4453   if (encodep)
4454     {
4455       coding->produced_char = coding->produced;
4456       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4457     }
4458   else if (!ccl->eight_bit_control)
4459     {
4460       /* The produced bytes forms a valid multibyte sequence. */
4461       coding->produced_char
4462         = multibyte_chars_in_text (destination, coding->produced);
4463       coding->spec.ccl.eight_bit_carryover[0] = 0;
4464     }
4465   else
4466     {
4467       /* On decoding, the destination should always multibyte.  But,
4468          CCL program might have been generated an invalid multibyte
4469          sequence.  Here we make such a sequence valid as
4470          multibyte.  */
4471       int bytes
4472         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4473
4474       if ((coding->consumed < src_bytes
4475            || !ccl->last_block)
4476           && coding->produced >= 1
4477           && destination[coding->produced - 1] >= 0x80)
4478         {
4479           /* We should not convert the tailing 8-bit codes to
4480              multibyte form even if they doesn't form a valid
4481              multibyte sequence.  They may form a valid sequence in
4482              the next call.  */
4483           int carryover = 0;
4484
4485           if (destination[coding->produced - 1] < 0xA0)
4486             carryover = 1;
4487           else if (coding->produced >= 2)
4488             {
4489               if (destination[coding->produced - 2] >= 0x80)
4490                 {
4491                   if (destination[coding->produced - 2] < 0xA0)
4492                     carryover = 2;
4493                   else if (coding->produced >= 3
4494                            && destination[coding->produced - 3] >= 0x80
4495                            && destination[coding->produced - 3] < 0xA0)
4496                     carryover = 3;
4497                 }
4498             }
4499           if (carryover > 0)
4500             {
4501               BCOPY_SHORT (destination + coding->produced - carryover,
4502                            coding->spec.ccl.eight_bit_carryover,
4503                            carryover);
4504               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4505               coding->produced -= carryover;
4506             }
4507         }
4508       coding->produced = str_as_multibyte (destination, bytes,
4509                                            coding->produced,
4510                                            &(coding->produced_char));
4511     }
4512
4513   switch (ccl->status)
4514     {
4515     case CCL_STAT_SUSPEND_BY_SRC:
4516       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4517       break;
4518     case CCL_STAT_SUSPEND_BY_DST:
4519       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4520       break;
4521     case CCL_STAT_QUIT:
4522     case CCL_STAT_INVALID_CMD:
4523       coding->result = CODING_FINISH_INTERRUPT;
4524       break;
4525     default:
4526       coding->result = CODING_FINISH_NORMAL;
4527       break;
4528     }
4529   return coding->result;
4530 }
4531
4532 /* Decode EOL format of the text at PTR of BYTES length destructively
4533    according to CODING->eol_type.  This is called after the CCL
4534    program produced a decoded text at PTR.  If we do CRLF->LF
4535    conversion, update CODING->produced and CODING->produced_char.  */
4536
4537 static void
4538 decode_eol_post_ccl (coding, ptr, bytes)
4539      struct coding_system *coding;
4540      unsigned char *ptr;
4541      int bytes;
4542 {
4543   Lisp_Object val, saved_coding_symbol;
4544   unsigned char *pend = ptr + bytes;
4545   int dummy;
4546
4547   /* Remember the current coding system symbol.  We set it back when
4548      an inconsistent EOL is found so that `last-coding-system-used' is
4549      set to the coding system that doesn't specify EOL conversion.  */
4550   saved_coding_symbol = coding->symbol;
4551
4552   coding->spec.ccl.cr_carryover = 0;
4553   if (coding->eol_type == CODING_EOL_UNDECIDED)
4554     {
4555       /* Here, to avoid the call of setup_coding_system, we directly
4556          call detect_eol_type.  */
4557       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4558       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4559         coding->eol_type = CODING_EOL_LF;
4560       if (coding->eol_type != CODING_EOL_UNDECIDED)
4561         {
4562           val = Fget (coding->symbol, Qeol_type);
4563           if (VECTORP (val) && XVECTOR (val)->size == 3)
4564             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4565         }
4566       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4567     }
4568
4569   if (coding->eol_type == CODING_EOL_LF
4570       || coding->eol_type == CODING_EOL_UNDECIDED)
4571     {
4572       /* We have nothing to do.  */
4573       ptr = pend;
4574     }
4575   else if (coding->eol_type == CODING_EOL_CRLF)
4576     {
4577       unsigned char *pstart = ptr, *p = ptr;
4578
4579       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4580           && *(pend - 1) == '\r')
4581         {
4582           /* If the last character is CR, we can't handle it here
4583              because LF will be in the not-yet-decoded source text.
4584              Record that the CR is not yet processed.  */
4585           coding->spec.ccl.cr_carryover = 1;
4586           coding->produced--;
4587           coding->produced_char--;
4588           pend--;
4589         }
4590       while (ptr < pend)
4591         {
4592           if (*ptr == '\r')
4593             {
4594               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4595                 {
4596                   *p++ = '\n';
4597                   ptr += 2;
4598                 }
4599               else
4600                 {
4601                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4602                     goto undo_eol_conversion;
4603                   *p++ = *ptr++;
4604                 }
4605             }
4606           else if (*ptr == '\n'
4607                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4608             goto undo_eol_conversion;
4609           else
4610             *p++ = *ptr++;
4611           continue;
4612
4613         undo_eol_conversion:
4614           /* We have faced with inconsistent EOL format at PTR.
4615              Convert all LFs before PTR back to CRLFs.  */
4616           for (p--, ptr--; p >= pstart; p--)
4617             {
4618               if (*p == '\n')
4619                 *ptr-- = '\n', *ptr-- = '\r';
4620               else
4621                 *ptr-- = *p;
4622             }
4623           /*  If carryover is recorded, cancel it because we don't
4624               convert CRLF anymore.  */
4625           if (coding->spec.ccl.cr_carryover)
4626             {
4627               coding->spec.ccl.cr_carryover = 0;
4628               coding->produced++;
4629               coding->produced_char++;
4630               pend++;
4631             }
4632           p = ptr = pend;
4633           coding->eol_type = CODING_EOL_LF;
4634           coding->symbol = saved_coding_symbol;
4635         }
4636       if (p < pend)
4637         {
4638           /* As each two-byte sequence CRLF was converted to LF, (PEND
4639              - P) is the number of deleted characters.  */
4640           coding->produced -= pend - p;
4641           coding->produced_char -= pend - p;
4642         }
4643     }
4644   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4645     {
4646       unsigned char *p = ptr;
4647
4648       for (; ptr < pend; ptr++)
4649         {
4650           if (*ptr == '\r')
4651             *ptr = '\n';
4652           else if (*ptr == '\n'
4653                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4654             {
4655               for (; p < ptr; p++)
4656                 {
4657                   if (*p == '\n')
4658                     *p = '\r';
4659                 }
4660               ptr = pend;
4661               coding->eol_type = CODING_EOL_LF;
4662               coding->symbol = saved_coding_symbol;
4663             }
4664         }
4665     }
4666 }
4667
4668 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4669    decoding, it may detect coding system and format of end-of-line if
4670    those are not yet decided.  The source should be unibyte, the
4671    result is multibyte if CODING->dst_multibyte is nonzero, else
4672    unibyte.  */
4673
4674 int
4675 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4676      struct coding_system *coding;
4677      const unsigned char *source;
4678      unsigned char *destination;
4679      int src_bytes, dst_bytes;
4680 {
4681   int extra = 0;
4682
4683   if (coding->type == coding_type_undecided)
4684     detect_coding (coding, source, src_bytes);
4685
4686   if (coding->eol_type == CODING_EOL_UNDECIDED
4687       && coding->type != coding_type_ccl)
4688     {
4689       detect_eol (coding, source, src_bytes);
4690       /* We had better recover the original eol format if we
4691          encounter an inconsistent eol format while decoding.  */
4692       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4693     }
4694
4695   coding->produced = coding->produced_char = 0;
4696   coding->consumed = coding->consumed_char = 0;
4697   coding->errors = 0;
4698   coding->result = CODING_FINISH_NORMAL;
4699
4700   switch (coding->type)
4701     {
4702     case coding_type_sjis:
4703       decode_coding_sjis_big5 (coding, source, destination,
4704                                src_bytes, dst_bytes, 1);
4705       break;
4706
4707     case coding_type_iso2022:
4708       decode_coding_iso2022 (coding, source, destination,
4709                              src_bytes, dst_bytes);
4710       break;
4711
4712     case coding_type_big5:
4713       decode_coding_sjis_big5 (coding, source, destination,
4714                                src_bytes, dst_bytes, 0);
4715       break;
4716
4717     case coding_type_emacs_mule:
4718       decode_coding_emacs_mule (coding, source, destination,
4719                                 src_bytes, dst_bytes);
4720       break;
4721
4722     case coding_type_ccl:
4723       if (coding->spec.ccl.cr_carryover)
4724         {
4725           /* Put the CR which was not processed by the previous call
4726              of decode_eol_post_ccl in DESTINATION.  It will be
4727              decoded together with the following LF by the call to
4728              decode_eol_post_ccl below.  */
4729           *destination = '\r';
4730           coding->produced++;
4731           coding->produced_char++;
4732           dst_bytes--;
4733           extra = coding->spec.ccl.cr_carryover;
4734         }
4735       ccl_coding_driver (coding, source, destination + extra,
4736                          src_bytes, dst_bytes, 0);
4737       if (coding->eol_type != CODING_EOL_LF)
4738         {
4739           coding->produced += extra;
4740           coding->produced_char += extra;
4741           decode_eol_post_ccl (coding, destination, coding->produced);
4742         }
4743       break;
4744
4745     default:
4746       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4747     }
4748
4749   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4750       && coding->mode & CODING_MODE_LAST_BLOCK
4751       && coding->consumed == src_bytes)
4752     coding->result = CODING_FINISH_NORMAL;
4753
4754   if (coding->mode & CODING_MODE_LAST_BLOCK
4755       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4756     {
4757       const unsigned char *src = source + coding->consumed;
4758       unsigned char *dst = destination + coding->produced;
4759
4760       src_bytes -= coding->consumed;
4761       coding->errors++;
4762       if (COMPOSING_P (coding))
4763         DECODE_COMPOSITION_END ('1');
4764       while (src_bytes--)
4765         {
4766           int c = *src++;
4767           dst += CHAR_STRING (c, dst);
4768           coding->produced_char++;
4769         }
4770       coding->consumed = coding->consumed_char = src - source;
4771       coding->produced = dst - destination;
4772       coding->result = CODING_FINISH_NORMAL;
4773     }
4774
4775   if (!coding->dst_multibyte)
4776     {
4777       coding->produced = str_as_unibyte (destination, coding->produced);
4778       coding->produced_char = coding->produced;
4779     }
4780
4781   return coding->result;
4782 }
4783
4784 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4785    multibyteness of the source is CODING->src_multibyte, the
4786    multibyteness of the result is always unibyte.  */
4787
4788 int
4789 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4790      struct coding_system *coding;
4791      const unsigned char *source;
4792      unsigned char *destination;
4793      int src_bytes, dst_bytes;
4794 {
4795   coding->produced = coding->produced_char = 0;
4796   coding->consumed = coding->consumed_char = 0;
4797   coding->errors = 0;
4798   coding->result = CODING_FINISH_NORMAL;
4799
4800   switch (coding->type)
4801     {
4802     case coding_type_sjis:
4803       encode_coding_sjis_big5 (coding, source, destination,
4804                                src_bytes, dst_bytes, 1);
4805       break;
4806
4807     case coding_type_iso2022:
4808       encode_coding_iso2022 (coding, source, destination,
4809                              src_bytes, dst_bytes);
4810       break;
4811
4812     case coding_type_big5:
4813       encode_coding_sjis_big5 (coding, source, destination,
4814                                src_bytes, dst_bytes, 0);
4815       break;
4816
4817     case coding_type_emacs_mule:
4818       encode_coding_emacs_mule (coding, source, destination,
4819                                 src_bytes, dst_bytes);
4820       break;
4821
4822     case coding_type_ccl:
4823       ccl_coding_driver (coding, source, destination,
4824                          src_bytes, dst_bytes, 1);
4825       break;
4826
4827     default:
4828       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4829     }
4830
4831   if (coding->mode & CODING_MODE_LAST_BLOCK
4832       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4833     {
4834       const unsigned char *src = source + coding->consumed;
4835       unsigned char *dst = destination + coding->produced;
4836
4837       if (coding->type == coding_type_iso2022)
4838         ENCODE_RESET_PLANE_AND_REGISTER;
4839       if (COMPOSING_P (coding))
4840         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4841       if (coding->consumed < src_bytes)
4842         {
4843           int len = src_bytes - coding->consumed;
4844
4845           BCOPY_SHORT (src, dst, len);
4846           if (coding->src_multibyte)
4847             len = str_as_unibyte (dst, len);
4848           dst += len;
4849           coding->consumed = src_bytes;
4850         }
4851       coding->produced = coding->produced_char = dst - destination;
4852       coding->result = CODING_FINISH_NORMAL;
4853     }
4854
4855   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4856       && coding->consumed == src_bytes)
4857     coding->result = CODING_FINISH_NORMAL;
4858
4859   return coding->result;
4860 }
4861
4862 /* Scan text in the region between *BEG and *END (byte positions),
4863    skip characters which we don't have to decode by coding system
4864    CODING at the head and tail, then set *BEG and *END to the region
4865    of the text we actually have to convert.  The caller should move
4866    the gap out of the region in advance if the region is from a
4867    buffer.
4868
4869    If STR is not NULL, *BEG and *END are indices into STR.  */
4870
4871 static void
4872 shrink_decoding_region (beg, end, coding, str)
4873      int *beg, *end;
4874      struct coding_system *coding;
4875      unsigned char *str;
4876 {
4877   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4878   int eol_conversion;
4879   Lisp_Object translation_table;
4880
4881   if (coding->type == coding_type_ccl
4882       || coding->type == coding_type_undecided
4883       || coding->eol_type != CODING_EOL_LF
4884       || !NILP (coding->post_read_conversion)
4885       || coding->composing != COMPOSITION_DISABLED)
4886     {
4887       /* We can't skip any data.  */
4888       return;
4889     }
4890   if (coding->type == coding_type_no_conversion
4891       || coding->type == coding_type_raw_text
4892       || coding->type == coding_type_emacs_mule)
4893     {
4894       /* We need no conversion, but don't have to skip any data here.
4895          Decoding routine handles them effectively anyway.  */
4896       return;
4897     }
4898
4899   translation_table = coding->translation_table_for_decode;
4900   if (NILP (translation_table) && !NILP (Venable_character_translation))
4901     translation_table = Vstandard_translation_table_for_decode;
4902   if (CHAR_TABLE_P (translation_table))
4903     {
4904       int i;
4905       for (i = 0; i < 128; i++)
4906         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4907           break;
4908       if (i < 128)
4909         /* Some ASCII character should be translated.  We give up
4910            shrinking.  */
4911         return;
4912     }
4913
4914   if (coding->heading_ascii >= 0)
4915     /* Detection routine has already found how much we can skip at the
4916        head.  */
4917     *beg += coding->heading_ascii;
4918
4919   if (str)
4920     {
4921       begp_orig = begp = str + *beg;
4922       endp_orig = endp = str + *end;
4923     }
4924   else
4925     {
4926       begp_orig = begp = BYTE_POS_ADDR (*beg);
4927       endp_orig = endp = begp + *end - *beg;
4928     }
4929
4930   eol_conversion = (coding->eol_type == CODING_EOL_CR
4931                     || coding->eol_type == CODING_EOL_CRLF);
4932
4933   switch (coding->type)
4934     {
4935     case coding_type_sjis:
4936     case coding_type_big5:
4937       /* We can skip all ASCII characters at the head.  */
4938       if (coding->heading_ascii < 0)
4939         {
4940           if (eol_conversion)
4941             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4942           else
4943             while (begp < endp && *begp < 0x80) begp++;
4944         }
4945       /* We can skip all ASCII characters at the tail except for the
4946          second byte of SJIS or BIG5 code.  */
4947       if (eol_conversion)
4948         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4949       else
4950         while (begp < endp && endp[-1] < 0x80) endp--;
4951       /* Do not consider LF as ascii if preceded by CR, since that
4952          confuses eol decoding. */
4953       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4954         endp++;
4955       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4956         endp++;
4957       break;
4958
4959     case coding_type_iso2022:
4960       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4961         /* We can't skip any data.  */
4962         break;
4963       if (coding->heading_ascii < 0)
4964         {
4965           /* We can skip all ASCII characters at the head except for a
4966              few control codes.  */
4967           while (begp < endp && (c = *begp) < 0x80
4968                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4969                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4970                  && (!eol_conversion || c != ISO_CODE_LF))
4971             begp++;
4972         }
4973       switch (coding->category_idx)
4974         {
4975         case CODING_CATEGORY_IDX_ISO_8_1:
4976         case CODING_CATEGORY_IDX_ISO_8_2:
4977           /* We can skip all ASCII characters at the tail.  */
4978           if (eol_conversion)
4979             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4980           else
4981             while (begp < endp && endp[-1] < 0x80) endp--;
4982           /* Do not consider LF as ascii if preceded by CR, since that
4983              confuses eol decoding. */
4984           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4985             endp++;
4986           break;
4987
4988         case CODING_CATEGORY_IDX_ISO_7:
4989         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4990           {
4991             /* We can skip all characters at the tail except for 8-bit
4992                codes and ESC and the following 2-byte at the tail.  */
4993             unsigned char *eight_bit = NULL;
4994
4995             if (eol_conversion)
4996               while (begp < endp
4997                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4998                 {
4999                   if (!eight_bit && c & 0x80) eight_bit = endp;
5000                   endp--;
5001                 }
5002             else
5003               while (begp < endp
5004                      && (c = endp[-1]) != ISO_CODE_ESC)
5005                 {
5006                   if (!eight_bit && c & 0x80) eight_bit = endp;
5007                   endp--;
5008                 }
5009             /* Do not consider LF as ascii if preceded by CR, since that
5010                confuses eol decoding. */
5011             if (begp < endp && endp < endp_orig
5012                 && endp[-1] == '\r' && endp[0] == '\n')
5013               endp++;
5014             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5015               {
5016                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5017                   /* This is an ASCII designation sequence.  We can
5018                      surely skip the tail.  But, if we have
5019                      encountered an 8-bit code, skip only the codes
5020                      after that.  */
5021                   endp = eight_bit ? eight_bit : endp + 2;
5022                 else
5023                   /* Hmmm, we can't skip the tail.  */
5024                   endp = endp_orig;
5025               }
5026             else if (eight_bit)
5027               endp = eight_bit;
5028           }
5029         }
5030       break;
5031
5032     default:
5033       abort ();
5034     }
5035   *beg += begp - begp_orig;
5036   *end += endp - endp_orig;
5037   return;
5038 }
5039
5040 /* Like shrink_decoding_region but for encoding.  */
5041
5042 static void
5043 shrink_encoding_region (beg, end, coding, str)
5044      int *beg, *end;
5045      struct coding_system *coding;
5046      unsigned char *str;
5047 {
5048   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5049   int eol_conversion;
5050   Lisp_Object translation_table;
5051
5052   if (coding->type == coding_type_ccl
5053       || coding->eol_type == CODING_EOL_CRLF
5054       || coding->eol_type == CODING_EOL_CR
5055       || (coding->cmp_data && coding->cmp_data->used > 0))
5056     {
5057       /* We can't skip any data.  */
5058       return;
5059     }
5060   if (coding->type == coding_type_no_conversion
5061       || coding->type == coding_type_raw_text
5062       || coding->type == coding_type_emacs_mule
5063       || coding->type == coding_type_undecided)
5064     {
5065       /* We need no conversion, but don't have to skip any data here.
5066          Encoding routine handles them effectively anyway.  */
5067       return;
5068     }
5069
5070   translation_table = coding->translation_table_for_encode;
5071   if (NILP (translation_table) && !NILP (Venable_character_translation))
5072     translation_table = Vstandard_translation_table_for_encode;
5073   if (CHAR_TABLE_P (translation_table))
5074     {
5075       int i;
5076       for (i = 0; i < 128; i++)
5077         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5078           break;
5079       if (i < 128)
5080         /* Some ASCII character should be translated.  We give up
5081            shrinking.  */
5082         return;
5083     }
5084
5085   if (str)
5086     {
5087       begp_orig = begp = str + *beg;
5088       endp_orig = endp = str + *end;
5089     }
5090   else
5091     {
5092       begp_orig = begp = BYTE_POS_ADDR (*beg);
5093       endp_orig = endp = begp + *end - *beg;
5094     }
5095
5096   eol_conversion = (coding->eol_type == CODING_EOL_CR
5097                     || coding->eol_type == CODING_EOL_CRLF);
5098
5099   /* Here, we don't have to check coding->pre_write_conversion because
5100      the caller is expected to have handled it already.  */
5101   switch (coding->type)
5102     {
5103     case coding_type_iso2022:
5104       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5105         /* We can't skip any data.  */
5106         break;
5107       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5108         {
5109           unsigned char *bol = begp;
5110           while (begp < endp && *begp < 0x80)
5111             {
5112               begp++;
5113               if (begp[-1] == '\n')
5114                 bol = begp;
5115             }
5116           begp = bol;
5117           goto label_skip_tail;
5118         }
5119       /* fall down ... */
5120
5121     case coding_type_sjis:
5122     case coding_type_big5:
5123       /* We can skip all ASCII characters at the head and tail.  */
5124       if (eol_conversion)
5125         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5126       else
5127         while (begp < endp && *begp < 0x80) begp++;
5128     label_skip_tail:
5129       if (eol_conversion)
5130         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5131       else
5132         while (begp < endp && *(endp - 1) < 0x80) endp--;
5133       break;
5134
5135     default:
5136       abort ();
5137     }
5138
5139   *beg += begp - begp_orig;
5140   *end += endp - endp_orig;
5141   return;
5142 }
5143
/* Shrinking a conversion region has some overhead of its own, so it
   is attempted only for regions longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the region *BEG .. *END is longer than
   shrink_conversion_region_threshhold, shrink it by calling
   shrink_encoding_region (if ENCODEP is nonzero) or
   shrink_decoding_region (otherwise) with BEG, END, CODING and STR.
   Shorter regions are left alone.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
      {                                                                 \
        if (!(encodep))                                                 \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5157
5158 static Lisp_Object
5159 code_convert_region_unwind (dummy)
5160      Lisp_Object dummy;
5161 {
5162   inhibit_pre_post_conversion = 0;
5163   return Qnil;
5164 }
5165
5166 /* Store information about all compositions in the range FROM and TO
5167    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5168    buffer or a string, defaults to the current buffer.  */
5169
5170 void
5171 coding_save_composition (coding, from, to, obj)
5172      struct coding_system *coding;
5173      int from, to;
5174      Lisp_Object obj;
5175 {
5176   Lisp_Object prop;
5177   int start, end;
5178
5179   if (coding->composing == COMPOSITION_DISABLED)
5180     return;
5181   if (!coding->cmp_data)
5182     coding_allocate_composition_data (coding, from);
5183   if (!find_composition (from, to, &start, &end, &prop, obj)
5184       || end > to)
5185     return;
5186   if (start < from
5187       && (!find_composition (end, to, &start, &end, &prop, obj)
5188           || end > to))
5189     return;
5190   coding->composing = COMPOSITION_NO;
5191   do
5192     {
5193       if (COMPOSITION_VALID_P (start, end, prop))
5194         {
5195           enum composition_method method = COMPOSITION_METHOD (prop);
5196           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5197               >= COMPOSITION_DATA_SIZE)
5198             coding_allocate_composition_data (coding, from);
5199           /* For relative composition, we remember start and end
5200              positions, for the other compositions, we also remember
5201              components.  */
5202           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5203           if (method != COMPOSITION_RELATIVE)
5204             {
5205               /* We must store a*/
5206               Lisp_Object val, ch;
5207
5208               val = COMPOSITION_COMPONENTS (prop);
5209               if (CONSP (val))
5210                 while (CONSP (val))
5211                   {
5212                     ch = XCAR (val), val = XCDR (val);
5213                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5214                   }
5215               else if (VECTORP (val) || STRINGP (val))
5216                 {
5217                   int len = (VECTORP (val)
5218                              ? XVECTOR (val)->size : SCHARS (val));
5219                   int i;
5220                   for (i = 0; i < len; i++)
5221                     {
5222                       ch = (STRINGP (val)
5223                             ? Faref (val, make_number (i))
5224                             : XVECTOR (val)->contents[i]);
5225                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5226                     }
5227                 }
5228               else              /* INTEGERP (val) */
5229                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5230             }
5231           CODING_ADD_COMPOSITION_END (coding, end - from);
5232         }
5233       start = end;
5234     }
5235   while (start < to
5236          && find_composition (start, to, &start, &end, &prop, obj)
5237          && end <= to);
5238
5239   /* Make coding->cmp_data point to the first memory block.  */
5240   while (coding->cmp_data->prev)
5241     coding->cmp_data = coding->cmp_data->prev;
5242   coding->cmp_data_start = 0;
5243 }
5244
5245 /* Reflect the saved information about compositions to OBJ.
5246    CODING->cmp_data points to a memory block for the information.  OBJ
5247    is a buffer or a string, defaults to the current buffer.  */
5248
5249 void
5250 coding_restore_composition (coding, obj)
5251      struct coding_system *coding;
5252      Lisp_Object obj;
5253 {
5254   struct composition_data *cmp_data = coding->cmp_data;
5255
5256   if (!cmp_data)
5257     return;
5258
5259   while (cmp_data->prev)
5260     cmp_data = cmp_data->prev;
5261
5262   while (cmp_data)
5263     {
5264       int i;
5265
5266       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5267            i += cmp_data->data[i])
5268         {
5269           int *data = cmp_data->data + i;
5270           enum composition_method method = (enum composition_method) data[3];
5271           Lisp_Object components;
5272
5273           if (method == COMPOSITION_RELATIVE)
5274             components = Qnil;
5275           else
5276             {
5277               int len = data[0] - 4, j;
5278               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5279
5280               for (j = 0; j < len; j++)
5281                 args[j] = make_number (data[4 + j]);
5282               components = (method == COMPOSITION_WITH_ALTCHARS
5283                             ? Fstring (len, args) : Fvector (len, args));
5284             }
5285           compose_text (data[1], data[2], components, Qnil, obj);
5286         }
5287       cmp_data = cmp_data->next;
5288     }
5289 }
5290
5291 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5292    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5293    coding system CODING, and return the status code of code conversion
5294    (currently, this value has no meaning).
5295
5296    How many characters (and bytes) are converted to how many
5297    characters (and bytes) are recorded in members of the structure
5298    CODING.
5299
5300    If REPLACE is nonzero, we do various things as if the original text
5301    is deleted and a new text is inserted.  See the comments in
5302    replace_range (insdel.c) to know what we are doing.
5303
5304    If REPLACE is zero, it is assumed that the source text is unibyte.
5305    Otherwise, it is assumed that the source text is multibyte.  */
5306
5307 int
5308 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5309      int from, from_byte, to, to_byte, encodep, replace;
5310      struct coding_system *coding;
5311 {
5312   int len = to - from, len_byte = to_byte - from_byte;
5313   int nchars_del = 0, nbytes_del = 0;
5314   int require, inserted, inserted_byte;
5315   int head_skip, tail_skip, total_skip = 0;
5316   Lisp_Object saved_coding_symbol;
5317   int first = 1;
5318   unsigned char *src, *dst;
5319   Lisp_Object deletion;
5320   int orig_point = PT, orig_len = len;
5321   int prev_Z;
5322   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5323
5324   deletion = Qnil;
5325   saved_coding_symbol = coding->symbol;
5326
5327   if (from < PT && PT < to)
5328     {
5329       TEMP_SET_PT_BOTH (from, from_byte);
5330       orig_point = from;
5331     }
5332
5333   if (replace)
5334     {
5335       int saved_from = from;
5336       int saved_inhibit_modification_hooks;
5337
5338       prepare_to_modify_buffer (from, to, &from);
5339       if (saved_from != from)
5340         {
5341           to = from + len;
5342           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5343           len_byte = to_byte - from_byte;
5344         }
5345
5346       /* The code conversion routine can not preserve text properties
5347          for now.  So, we must remove all text properties in the
5348          region.  Here, we must suppress all modification hooks.  */
5349       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5350       inhibit_modification_hooks = 1;
5351       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5352       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5353     }
5354
5355   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5356     {
5357       /* We must detect encoding of text and eol format.  */
5358
5359       if (from < GPT && to > GPT)
5360         move_gap_both (from, from_byte);
5361       if (coding->type == coding_type_undecided)
5362         {
5363           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5364           if (coding->type == coding_type_undecided)
5365             {
5366               /* It seems that the text contains only ASCII, but we
5367                  should not leave it undecided because the deeper
5368                  decoding routine (decode_coding) tries to detect the
5369                  encodings again in vain.  */
5370               coding->type = coding_type_emacs_mule;
5371               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5372               /* As emacs-mule decoder will handle composition, we
5373                  need this setting to allocate coding->cmp_data
5374                  later.  */
5375               coding->composing = COMPOSITION_NO;
5376             }
5377         }
5378       if (coding->eol_type == CODING_EOL_UNDECIDED
5379           && coding->type != coding_type_ccl)
5380         {
5381           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5382           if (coding->eol_type == CODING_EOL_UNDECIDED)
5383             coding->eol_type = CODING_EOL_LF;
5384           /* We had better recover the original eol format if we
5385              encounter an inconsistent eol format while decoding.  */
5386           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5387         }
5388     }
5389
5390   /* Now we convert the text.  */
5391
5392   /* For encoding, we must process pre-write-conversion in advance.  */
5393   if (! inhibit_pre_post_conversion
5394       && encodep
5395       && SYMBOLP (coding->pre_write_conversion)
5396       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5397     {
5398       /* The function in pre-write-conversion may put a new text in a
5399          new buffer.  */
5400       struct buffer *prev = current_buffer;
5401       Lisp_Object new;
5402
5403       record_unwind_protect (code_convert_region_unwind, Qnil);
5404       /* We should not call any more pre-write/post-read-conversion
5405          functions while this pre-write-conversion is running.  */
5406       inhibit_pre_post_conversion = 1;
5407       call2 (coding->pre_write_conversion,
5408              make_number (from), make_number (to));
5409       inhibit_pre_post_conversion = 0;
5410       /* Discard the unwind protect.  */
5411       specpdl_ptr--;
5412
5413       if (current_buffer != prev)
5414         {
5415           len = ZV - BEGV;
5416           new = Fcurrent_buffer ();
5417           set_buffer_internal_1 (prev);
5418           del_range_2 (from, from_byte, to, to_byte, 0);
5419           TEMP_SET_PT_BOTH (from, from_byte);
5420           insert_from_buffer (XBUFFER (new), 1, len, 0);
5421           Fkill_buffer (new);
5422           if (orig_point >= to)
5423             orig_point += len - orig_len;
5424           else if (orig_point > from)
5425             orig_point = from;
5426           orig_len = len;
5427           to = from + len;
5428           from_byte = CHAR_TO_BYTE (from);
5429           to_byte = CHAR_TO_BYTE (to);
5430           len_byte = to_byte - from_byte;
5431           TEMP_SET_PT_BOTH (from, from_byte);
5432         }
5433     }
5434
5435   if (replace)
5436     {
5437       if (! EQ (current_buffer->undo_list, Qt))
5438         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5439       else
5440         {
5441           nchars_del = to - from;
5442           nbytes_del = to_byte - from_byte;
5443         }
5444     }
5445
5446   if (coding->composing != COMPOSITION_DISABLED)
5447     {
5448       if (encodep)
5449         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5450       else
5451         coding_allocate_composition_data (coding, from);
5452     }
5453
5454   /* Try to skip the heading and tailing ASCIIs.  */
5455   if (coding->type != coding_type_ccl)
5456     {
5457       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5458
5459       if (from < GPT && GPT < to)
5460         move_gap_both (from, from_byte);
5461       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5462       if (from_byte == to_byte
5463           && (encodep || NILP (coding->post_read_conversion))
5464           && ! CODING_REQUIRE_FLUSHING (coding))
5465         {
5466           coding->produced = len_byte;
5467           coding->produced_char = len;
5468           if (!replace)
5469             /* We must record and adjust for this new text now.  */
5470             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5471           return 0;
5472         }
5473
5474       head_skip = from_byte - from_byte_orig;
5475       tail_skip = to_byte_orig - to_byte;
5476       total_skip = head_skip + tail_skip;
5477       from += head_skip;
5478       to -= tail_skip;
5479       len -= total_skip; len_byte -= total_skip;
5480     }
5481
5482   /* For conversion, we must put the gap before the text in addition to
5483      making the gap larger for efficient decoding.  The required gap
5484      size starts from 2000 which is the magic number used in make_gap.
5485      But, after one batch of conversion, it will be incremented if we
5486      find that it is not enough .  */
5487   require = 2000;
5488
5489   if (GAP_SIZE  < require)
5490     make_gap (require - GAP_SIZE);
5491   move_gap_both (from, from_byte);
5492
5493   inserted = inserted_byte = 0;
5494
5495   GAP_SIZE += len_byte;
5496   ZV -= len;
5497   Z -= len;
5498   ZV_BYTE -= len_byte;
5499   Z_BYTE -= len_byte;
5500
5501   if (GPT - BEG < BEG_UNCHANGED)
5502     BEG_UNCHANGED = GPT - BEG;
5503   if (Z - GPT < END_UNCHANGED)
5504     END_UNCHANGED = Z - GPT;
5505
5506   if (!encodep && coding->src_multibyte)
5507     {
5508       /* Decoding routines expects that the source text is unibyte.
5509          We must convert 8-bit characters of multibyte form to
5510          unibyte.  */
5511       int len_byte_orig = len_byte;
5512       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5513       if (len_byte < len_byte_orig)
5514         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5515                     len_byte);
5516       coding->src_multibyte = 0;
5517     }
5518
5519   for (;;)
5520     {
5521       int result;
5522
5523       /* The buffer memory is now:
5524          +--------+converted-text+---------+-------original-text-------+---+
5525          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5526                   |<---------------------- GAP ----------------------->|  */
5527       src = GAP_END_ADDR - len_byte;
5528       dst = GPT_ADDR + inserted_byte;
5529
5530       if (encodep)
5531         result = encode_coding (coding, src, dst, len_byte, 0);
5532       else
5533         {
5534           if (coding->composing != COMPOSITION_DISABLED)
5535             coding->cmp_data->char_offset = from + inserted;
5536           result = decode_coding (coding, src, dst, len_byte, 0);
5537         }
5538
5539       /* The buffer memory is now:
5540          +--------+-------converted-text----+--+------original-text----+---+
5541          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5542                   |<---------------------- GAP ----------------------->|  */
5543
5544       inserted += coding->produced_char;
5545       inserted_byte += coding->produced;
5546       len_byte -= coding->consumed;
5547
5548       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5549         {
5550           coding_allocate_composition_data (coding, from + inserted);
5551           continue;
5552         }
5553
5554       src += coding->consumed;
5555       dst += coding->produced;
5556
5557       if (result == CODING_FINISH_NORMAL)
5558         {
5559           src += len_byte;
5560           break;
5561         }
5562       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5563         {
5564           unsigned char *pend = dst, *p = pend - inserted_byte;
5565           Lisp_Object eol_type;
5566
5567           /* Encode LFs back to the original eol format (CR or CRLF).  */
5568           if (coding->eol_type == CODING_EOL_CR)
5569             {
5570               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5571             }
5572           else
5573             {
5574               int count = 0;
5575
5576               while (p < pend) if (*p++ == '\n') count++;
5577               if (src - dst < count)
5578                 {
5579                   /* We don't have sufficient room for encoding LFs
5580                      back to CRLF.  We must record converted and
5581                      not-yet-converted text back to the buffer
5582                      content, enlarge the gap, then record them out of
5583                      the buffer contents again.  */
5584                   int add = len_byte + inserted_byte;
5585
5586                   GAP_SIZE -= add;
5587                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5588                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5589                   make_gap (count - GAP_SIZE);
5590                   GAP_SIZE += add;
5591                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5592                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5593                   /* Don't forget to update SRC, DST, and PEND.  */
5594                   src = GAP_END_ADDR - len_byte;
5595                   dst = GPT_ADDR + inserted_byte;
5596                   pend = dst;
5597                 }
5598               inserted += count;
5599               inserted_byte += count;
5600               coding->produced += count;
5601               p = dst = pend + count;
5602               while (count)
5603                 {
5604                   *--p = *--pend;
5605                   if (*p == '\n') count--, *--p = '\r';
5606                 }
5607             }
5608
5609           /* Suppress eol-format conversion in the further conversion.  */
5610           coding->eol_type = CODING_EOL_LF;
5611
5612           /* Set the coding system symbol to that for Unix-like EOL.  */
5613           eol_type = Fget (saved_coding_symbol, Qeol_type);
5614           if (VECTORP (eol_type)
5615               && XVECTOR (eol_type)->size == 3
5616               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5617             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5618           else
5619             coding->symbol = saved_coding_symbol;
5620
5621           continue;
5622         }
5623       if (len_byte <= 0)
5624         {
5625           if (coding->type != coding_type_ccl
5626               || coding->mode & CODING_MODE_LAST_BLOCK)
5627             break;
5628           coding->mode |= CODING_MODE_LAST_BLOCK;
5629           continue;
5630         }
5631       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5632         {
5633           /* The source text ends in invalid codes.  Let's just
5634              make them valid buffer contents, and finish conversion.  */
5635           if (multibyte_p)
5636             {
5637               unsigned char *start = dst;
5638
5639               inserted += len_byte;
5640               while (len_byte--)
5641                 {
5642                   int c = *src++;
5643                   dst += CHAR_STRING (c, dst);
5644                 }
5645
5646               inserted_byte += dst - start;
5647             }
5648           else
5649             {
5650               inserted += len_byte;
5651               inserted_byte += len_byte;
5652               while (len_byte--)
5653                 *dst++ = *src++;
5654             }
5655           break;
5656         }
5657       if (result == CODING_FINISH_INTERRUPT)
5658         {
5659           /* The conversion procedure was interrupted by a user.  */
5660           break;
5661         }
5662       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5663       if (coding->consumed < 1)
5664         {
5665           /* It's quite strange to require more memory without
5666              consuming any bytes.  Perhaps CCL program bug.  */
5667           break;
5668         }
5669       if (first)
5670         {
5671           /* We have just done the first batch of conversion which was
5672              stopped because of insufficient gap.  Let's reconsider the
5673              required gap size (i.e. SRT - DST) now.
5674
5675              We have converted ORIG bytes (== coding->consumed) into
5676              NEW bytes (coding->produced).  To convert the remaining
5677              LEN bytes, we may need REQUIRE bytes of gap, where:
5678                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5679                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5680              Here, we are sure that NEW >= ORIG.  */
5681           float ratio;
5682
5683           if (coding->produced <= coding->consumed)
5684             {
5685               /* This happens because of CCL-based coding system with
5686                  eol-type CRLF.  */
5687               require = 0;
5688             }
5689           else
5690             {
5691               ratio = (coding->produced - coding->consumed) / coding->consumed;
5692               require = len_byte * ratio;
5693             }
5694           first = 0;
5695         }
5696       if ((src - dst) < (require + 2000))
5697         {
5698           /* See the comment above the previous call of make_gap.  */
5699           int add = len_byte + inserted_byte;
5700
5701           GAP_SIZE -= add;
5702           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5703           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5704           make_gap (require + 2000);
5705           GAP_SIZE += add;
5706           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5707           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5708         }
5709     }
5710   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5711
5712   if (encodep && coding->dst_multibyte)
5713     {
5714       /* The output is unibyte.  We must convert 8-bit characters to
5715          multibyte form.  */
5716       if (inserted_byte * 2 > GAP_SIZE)
5717         {
5718           GAP_SIZE -= inserted_byte;
5719           ZV += inserted_byte; Z += inserted_byte;
5720           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5721           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5722           make_gap (inserted_byte - GAP_SIZE);
5723           GAP_SIZE += inserted_byte;
5724           ZV -= inserted_byte; Z -= inserted_byte;
5725           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5726           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5727         }
5728       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5729     }
5730
5731   /* If we shrank the conversion area, adjust it now.  */
5732   if (total_skip > 0)
5733     {
5734       if (tail_skip > 0)
5735         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5736       inserted += total_skip; inserted_byte += total_skip;
5737       GAP_SIZE += total_skip;
5738       GPT -= head_skip; GPT_BYTE -= head_skip;
5739       ZV -= total_skip; ZV_BYTE -= total_skip;
5740       Z -= total_skip; Z_BYTE -= total_skip;
5741       from -= head_skip; from_byte -= head_skip;
5742       to += tail_skip; to_byte += tail_skip;
5743     }
5744
5745   prev_Z = Z;
5746   if (! EQ (current_buffer->undo_list, Qt))
5747     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5748   else
5749     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5750                                  inserted, inserted_byte);
5751   inserted = Z - prev_Z;
5752
5753   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5754     coding_restore_composition (coding, Fcurrent_buffer ());
5755   coding_free_composition_data (coding);
5756
5757   if (! inhibit_pre_post_conversion
5758       && ! encodep && ! NILP (coding->post_read_conversion))
5759     {
5760       Lisp_Object val;
5761
5762       if (from != PT)
5763         TEMP_SET_PT_BOTH (from, from_byte);
5764       prev_Z = Z;
5765       record_unwind_protect (code_convert_region_unwind, Qnil);
5766       /* We should not call any more pre-write/post-read-conversion
5767          functions while this post-read-conversion is running.  */
5768       inhibit_pre_post_conversion = 1;
5769       val = call1 (coding->post_read_conversion, make_number (inserted));
5770       inhibit_pre_post_conversion = 0;
5771       /* Discard the unwind protect.  */
5772       specpdl_ptr--;
5773       CHECK_NUMBER (val);
5774       inserted += Z - prev_Z;
5775     }
5776
5777   if (orig_point >= from)
5778     {
5779       if (orig_point >= from + orig_len)
5780         orig_point += inserted - orig_len;
5781       else
5782         orig_point = from;
5783       TEMP_SET_PT (orig_point);
5784     }
5785
5786   if (replace)
5787     {
5788       signal_after_change (from, to - from, inserted);
5789       update_compositions (from, from + inserted, CHECK_BORDER);
5790     }
5791
5792   {
5793     coding->consumed = to_byte - from_byte;
5794     coding->consumed_char = to - from;
5795     coding->produced = inserted_byte;
5796     coding->produced_char = inserted;
5797   }
5798
5799   return 0;
5800 }
5801
5802 Lisp_Object
5803 run_pre_post_conversion_on_str (str, coding, encodep)
5804      Lisp_Object str;
5805      struct coding_system *coding;
5806      int encodep;
5807 {
5808   int count = SPECPDL_INDEX ();
5809   struct gcpro gcpro1, gcpro2;
5810   int multibyte = STRING_MULTIBYTE (str);
5811   Lisp_Object buffer;
5812   struct buffer *buf;
5813   Lisp_Object old_deactivate_mark;
5814
5815   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5816   record_unwind_protect (code_convert_region_unwind, Qnil);
5817   /* It is not crucial to specbind this.  */
5818   old_deactivate_mark = Vdeactivate_mark;
5819   GCPRO2 (str, old_deactivate_mark);
5820
5821   buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5822   buf = XBUFFER (buffer);
5823
5824   buf->directory = current_buffer->directory;
5825   buf->read_only = Qnil;
5826   buf->filename = Qnil;
5827   buf->undo_list = Qt;
5828   buf->overlays_before = Qnil;
5829   buf->overlays_after = Qnil;
5830
5831   set_buffer_internal (buf);
5832   /* We must insert the contents of STR as is without
5833      unibyte<->multibyte conversion.  For that, we adjust the
5834      multibyteness of the working buffer to that of STR.  */
5835   Ferase_buffer ();
5836   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
5837
5838   insert_from_string (str, 0, 0,
5839                       SCHARS (str), SBYTES (str), 0);
5840   UNGCPRO;
5841   inhibit_pre_post_conversion = 1;
5842   if (encodep)
5843     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5844   else
5845     {
5846       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5847       call1 (coding->post_read_conversion, make_number (Z - BEG));
5848     }
5849   inhibit_pre_post_conversion = 0;
5850   Vdeactivate_mark = old_deactivate_mark;
5851   str = make_buffer_string (BEG, Z, 1);
5852   return unbind_to (count, str);
5853 }
5854
5855 Lisp_Object
5856 decode_coding_string (str, coding, nocopy)
5857      Lisp_Object str;
5858      struct coding_system *coding;
5859      int nocopy;
5860 {
5861   int len;
5862   struct conversion_buffer buf;
5863   int from, to_byte;
5864   Lisp_Object saved_coding_symbol;
5865   int result;
5866   int require_decoding;
5867   int shrinked_bytes = 0;
5868   Lisp_Object newstr;
5869   int consumed, consumed_char, produced, produced_char;
5870
5871   from = 0;
5872   to_byte = SBYTES (str);
5873
5874   saved_coding_symbol = coding->symbol;
5875   coding->src_multibyte = STRING_MULTIBYTE (str);
5876   coding->dst_multibyte = 1;
5877   if (CODING_REQUIRE_DETECTION (coding))
5878     {
5879       /* See the comments in code_convert_region.  */
5880       if (coding->type == coding_type_undecided)
5881         {
5882           detect_coding (coding, SDATA (str), to_byte);
5883           if (coding->type == coding_type_undecided)
5884             {
5885               coding->type = coding_type_emacs_mule;
5886               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5887               /* As emacs-mule decoder will handle composition, we
5888                  need this setting to allocate coding->cmp_data
5889                  later.  */
5890               coding->composing = COMPOSITION_NO;
5891             }
5892         }
5893       if (coding->eol_type == CODING_EOL_UNDECIDED
5894           && coding->type != coding_type_ccl)
5895         {
5896           saved_coding_symbol = coding->symbol;
5897           detect_eol (coding, SDATA (str), to_byte);
5898           if (coding->eol_type == CODING_EOL_UNDECIDED)
5899             coding->eol_type = CODING_EOL_LF;
5900           /* We had better recover the original eol format if we
5901              encounter an inconsistent eol format while decoding.  */
5902           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5903         }
5904     }
5905
5906   if (coding->type == coding_type_no_conversion
5907       || coding->type == coding_type_raw_text)
5908     coding->dst_multibyte = 0;
5909
5910   require_decoding = CODING_REQUIRE_DECODING (coding);
5911
5912   if (STRING_MULTIBYTE (str))
5913     {
5914       /* Decoding routines expect the source text to be unibyte.  */
5915       str = Fstring_as_unibyte (str);
5916       to_byte = SBYTES (str);
5917       nocopy = 1;
5918       coding->src_multibyte = 0;
5919     }
5920
5921   /* Try to skip the heading and tailing ASCIIs.  */
5922   if (require_decoding && coding->type != coding_type_ccl)
5923     {
5924       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
5925                                 0);
5926       if (from == to_byte)
5927         require_decoding = 0;
5928       shrinked_bytes = from + (SBYTES (str) - to_byte);
5929     }
5930
5931   if (!require_decoding)
5932     {
5933       coding->consumed = SBYTES (str);
5934       coding->consumed_char = SCHARS (str);
5935       if (coding->dst_multibyte)
5936         {
5937           str = Fstring_as_multibyte (str);
5938           nocopy = 1;
5939         }
5940       coding->produced = SBYTES (str);
5941       coding->produced_char = SCHARS (str);
5942       return (nocopy ? str : Fcopy_sequence (str));
5943     }
5944
5945   if (coding->composing != COMPOSITION_DISABLED)
5946     coding_allocate_composition_data (coding, from);
5947   len = decoding_buffer_size (coding, to_byte - from);
5948   allocate_conversion_buffer (buf, len);
5949
5950   consumed = consumed_char = produced = produced_char = 0;
5951   while (1)
5952     {
5953       result = decode_coding (coding, SDATA (str) + from + consumed,
5954                               buf.data + produced, to_byte - from - consumed,
5955                               buf.size - produced);
5956       consumed += coding->consumed;
5957       consumed_char += coding->consumed_char;
5958       produced += coding->produced;
5959       produced_char += coding->produced_char;
5960       if (result == CODING_FINISH_NORMAL
5961           || (result == CODING_FINISH_INSUFFICIENT_SRC
5962               && coding->consumed == 0))
5963         break;
5964       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5965         coding_allocate_composition_data (coding, from + produced_char);
5966       else if (result == CODING_FINISH_INSUFFICIENT_DST)
5967         extend_conversion_buffer (&buf);
5968       else if (result == CODING_FINISH_INCONSISTENT_EOL)
5969         {
5970           Lisp_Object eol_type;
5971
5972           /* Recover the original EOL format.  */
5973           if (coding->eol_type == CODING_EOL_CR)
5974             {
5975               unsigned char *p;
5976               for (p = buf.data; p < buf.data + produced; p++)
5977                 if (*p == '\n') *p = '\r';
5978             }
5979           else if (coding->eol_type == CODING_EOL_CRLF)
5980             {
5981               int num_eol = 0;
5982               unsigned char *p0, *p1;
5983               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5984                 if (*p0 == '\n') num_eol++;
5985               if (produced + num_eol >= buf.size)
5986                 extend_conversion_buffer (&buf);
5987               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5988                 {
5989                   *--p1 = *--p0;
5990                   if (*p0 == '\n') *--p1 = '\r';
5991                 }
5992               produced += num_eol;
5993               produced_char += num_eol;
5994             }
5995           /* Suppress eol-format conversion in the further conversion.  */
5996           coding->eol_type = CODING_EOL_LF;
5997
5998           /* Set the coding system symbol to that for Unix-like EOL.  */
5999           eol_type = Fget (saved_coding_symbol, Qeol_type);
6000           if (VECTORP (eol_type)
6001               && XVECTOR (eol_type)->size == 3
6002               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6003             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6004           else
6005             coding->symbol = saved_coding_symbol;
6006
6007
6008         }
6009     }
6010
6011   coding->consumed = consumed;
6012   coding->consumed_char = consumed_char;
6013   coding->produced = produced;
6014   coding->produced_char = produced_char;
6015
6016   if (coding->dst_multibyte)
6017     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6018                                            produced + shrinked_bytes);
6019   else
6020     newstr = make_uninit_string (produced + shrinked_bytes);
6021   if (from > 0)
6022     STRING_COPYIN (newstr, 0, SDATA (str), from);
6023   STRING_COPYIN (newstr, from, buf.data, produced);
6024   if (shrinked_bytes > from)
6025     STRING_COPYIN (newstr, from + produced,
6026                    SDATA (str) + to_byte,
6027                    shrinked_bytes - from);
6028   free_conversion_buffer (&buf);
6029
6030   if (coding->cmp_data && coding->cmp_data->used)
6031     coding_restore_composition (coding, newstr);
6032   coding_free_composition_data (coding);
6033
6034   if (SYMBOLP (coding->post_read_conversion)
6035       && !NILP (Ffboundp (coding->post_read_conversion)))
6036     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6037
6038   return newstr;
6039 }
6040
6041 Lisp_Object
6042 encode_coding_string (str, coding, nocopy)
6043      Lisp_Object str;
6044      struct coding_system *coding;
6045      int nocopy;
6046 {
6047   int len;
6048   struct conversion_buffer buf;
6049   int from, to, to_byte;
6050   int result;
6051   int shrinked_bytes = 0;
6052   Lisp_Object newstr;
6053   int consumed, consumed_char, produced, produced_char;
6054
6055   if (SYMBOLP (coding->pre_write_conversion)
6056       && !NILP (Ffboundp (coding->pre_write_conversion)))
6057     str = run_pre_post_conversion_on_str (str, coding, 1);
6058
6059   from = 0;
6060   to = SCHARS (str);
6061   to_byte = SBYTES (str);
6062
6063   /* Encoding routines determine the multibyteness of the source text
6064      by coding->src_multibyte.  */
6065   coding->src_multibyte = STRING_MULTIBYTE (str);
6066   coding->dst_multibyte = 0;
6067   if (! CODING_REQUIRE_ENCODING (coding))
6068     {
6069       coding->consumed = SBYTES (str);
6070       coding->consumed_char = SCHARS (str);
6071       if (STRING_MULTIBYTE (str))
6072         {
6073           str = Fstring_as_unibyte (str);
6074           nocopy = 1;
6075         }
6076       coding->produced = SBYTES (str);
6077       coding->produced_char = SCHARS (str);
6078       return (nocopy ? str : Fcopy_sequence (str));
6079     }
6080
6081   if (coding->composing != COMPOSITION_DISABLED)
6082     coding_save_composition (coding, from, to, str);
6083
6084   /* Try to skip the heading and tailing ASCIIs.  */
6085   if (coding->type != coding_type_ccl)
6086     {
6087       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6088                                 1);
6089       if (from == to_byte)
6090         return (nocopy ? str : Fcopy_sequence (str));
6091       shrinked_bytes = from + (SBYTES (str) - to_byte);
6092     }
6093
6094   len = encoding_buffer_size (coding, to_byte - from);
6095   allocate_conversion_buffer (buf, len);
6096
6097   consumed = consumed_char = produced = produced_char = 0;
6098   while (1)
6099     {
6100       result = encode_coding (coding, SDATA (str) + from + consumed,
6101                               buf.data + produced, to_byte - from - consumed,
6102                               buf.size - produced);
6103       consumed += coding->consumed;
6104       consumed_char += coding->consumed_char;
6105       produced += coding->produced;
6106       produced_char += coding->produced_char;
6107       if (result == CODING_FINISH_NORMAL
6108           || (result == CODING_FINISH_INSUFFICIENT_SRC
6109               && coding->consumed == 0))
6110         break;
6111       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6112       extend_conversion_buffer (&buf);
6113     }
6114
6115   coding->consumed = consumed;
6116   coding->consumed_char = consumed_char;
6117   coding->produced = produced;
6118   coding->produced_char = produced_char;
6119
6120   newstr = make_uninit_string (produced + shrinked_bytes);
6121   if (from > 0)
6122     STRING_COPYIN (newstr, 0, SDATA (str), from);
6123   STRING_COPYIN (newstr, from, buf.data, produced);
6124   if (shrinked_bytes > from)
6125     STRING_COPYIN (newstr, from + produced,
6126                    SDATA (str) + to_byte,
6127                    shrinked_bytes - from);
6128
6129   free_conversion_buffer (&buf);
6130   coding_free_composition_data (coding);
6131
6132   return newstr;
6133 }
6134
6135 \f
6136 #ifdef emacs
6137 /*** 8. Emacs Lisp library functions ***/
6138
6139 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6140        doc: /* Return t if OBJECT is nil or a coding-system.
6141 See the documentation of `make-coding-system' for information
6142 about coding-system objects.  */)
6143      (obj)
6144      Lisp_Object obj;
6145 {
6146   if (NILP (obj))
6147     return Qt;
6148   if (!SYMBOLP (obj))
6149     return Qnil;
6150   /* Get coding-spec vector for OBJ.  */
6151   obj = Fget (obj, Qcoding_system);
6152   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6153           ? Qt : Qnil);
6154 }
6155
6156 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6157        Sread_non_nil_coding_system, 1, 1, 0,
6158        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6159      (prompt)
6160      Lisp_Object prompt;
6161 {
6162   Lisp_Object val;
6163   do
6164     {
6165       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6166                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6167     }
6168   while (SCHARS (val) == 0);
6169   return (Fintern (val, Qnil));
6170 }
6171
6172 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6173        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6174 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6175      (prompt, default_coding_system)
6176      Lisp_Object prompt, default_coding_system;
6177 {
6178   Lisp_Object val;
6179   if (SYMBOLP (default_coding_system))
6180     default_coding_system = SYMBOL_NAME (default_coding_system);
6181   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6182                           Qt, Qnil, Qcoding_system_history,
6183                           default_coding_system, Qnil);
6184   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6185 }
6186
6187 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6188        1, 1, 0,
6189        doc: /* Check validity of CODING-SYSTEM.
6190 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6191 It is valid if it is a symbol with a non-nil `coding-system' property.
6192 The value of property should be a vector of length 5.  */)
6193      (coding_system)
6194      Lisp_Object coding_system;
6195 {
6196   CHECK_SYMBOL (coding_system);
6197   if (!NILP (Fcoding_system_p (coding_system)))
6198     return coding_system;
6199   while (1)
6200     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6201 }
6202 \f
6203 Lisp_Object
6204 detect_coding_system (src, src_bytes, highest, multibytep)
6205      const unsigned char *src;
6206      int src_bytes, highest;
6207      int multibytep;
6208 {
6209   int coding_mask, eol_type;
6210   Lisp_Object val, tmp;
6211   int dummy;
6212
6213   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6214   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6215   if (eol_type == CODING_EOL_INCONSISTENT)
6216     eol_type = CODING_EOL_UNDECIDED;
6217
6218   if (!coding_mask)
6219     {
6220       val = Qundecided;
6221       if (eol_type != CODING_EOL_UNDECIDED)
6222         {
6223           Lisp_Object val2;
6224           val2 = Fget (Qundecided, Qeol_type);
6225           if (VECTORP (val2))
6226             val = XVECTOR (val2)->contents[eol_type];
6227         }
6228       return (highest ? val : Fcons (val, Qnil));
6229     }
6230
6231   /* At first, gather possible coding systems in VAL.  */
6232   val = Qnil;
6233   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6234     {
6235       Lisp_Object category_val, category_index;
6236
6237       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6238       category_val = Fsymbol_value (XCAR (tmp));
6239       if (!NILP (category_val)
6240           && NATNUMP (category_index)
6241           && (coding_mask & (1 << XFASTINT (category_index))))
6242         {
6243           val = Fcons (category_val, val);
6244           if (highest)
6245             break;
6246         }
6247     }
6248   if (!highest)
6249     val = Fnreverse (val);
6250
6251   /* Then, replace the elements with subsidiary coding systems.  */
6252   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6253     {
6254       if (eol_type != CODING_EOL_UNDECIDED
6255           && eol_type != CODING_EOL_INCONSISTENT)
6256         {
6257           Lisp_Object eol;
6258           eol = Fget (XCAR (tmp), Qeol_type);
6259           if (VECTORP (eol))
6260             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6261         }
6262     }
6263   return (highest ? XCAR (val) : val);
6264 }
6265
6266 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6267        2, 3, 0,
6268        doc: /* Detect coding system of the text in the region between START and END.
6269 Return a list of possible coding systems ordered by priority.
6270
6271 If only ASCII characters are found, it returns a list of single element
6272 `undecided' or its subsidiary coding system according to a detected
6273 end-of-line format.
6274
6275 If optional argument HIGHEST is non-nil, return the coding system of
6276 highest priority.  */)
6277      (start, end, highest)
6278      Lisp_Object start, end, highest;
6279 {
6280   int from, to;
6281   int from_byte, to_byte;
6282   int include_anchor_byte = 0;
6283
6284   CHECK_NUMBER_COERCE_MARKER (start);
6285   CHECK_NUMBER_COERCE_MARKER (end);
6286
6287   validate_region (&start, &end);
6288   from = XINT (start), to = XINT (end);
6289   from_byte = CHAR_TO_BYTE (from);
6290   to_byte = CHAR_TO_BYTE (to);
6291
6292   if (from < GPT && to >= GPT)
6293     move_gap_both (to, to_byte);
6294   /* If we an anchor byte `\0' follows the region, we include it in
6295      the detecting source.  Then code detectors can handle the tailing
6296      byte sequence more accurately.
6297
6298      Fix me: This is not a perfect solution.  It is better that we
6299      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6300   */
6301   if (to == Z || (to == GPT && GAP_SIZE > 0))
6302     include_anchor_byte = 1;
6303   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6304                                to_byte - from_byte + include_anchor_byte,
6305                                !NILP (highest),
6306                                !NILP (current_buffer
6307                                       ->enable_multibyte_characters));
6308 }
6309
6310 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6311        1, 2, 0,
6312        doc: /* Detect coding system of the text in STRING.
6313 Return a list of possible coding systems ordered by priority.
6314
6315 If only ASCII characters are found, it returns a list of single element
6316 `undecided' or its subsidiary coding system according to a detected
6317 end-of-line format.
6318
6319 If optional argument HIGHEST is non-nil, return the coding system of
6320 highest priority.  */)
6321      (string, highest)
6322      Lisp_Object string, highest;
6323 {
6324   CHECK_STRING (string);
6325
6326   return detect_coding_system (SDATA (string),
6327                                /* "+ 1" is to include the anchor byte
6328                                   `\0'.  With this, code detectors can
6329                                   handle the tailing bytes more
6330                                   accurately.  */
6331                                SBYTES (string) + 1,
6332                                !NILP (highest),
6333                                STRING_MULTIBYTE (string));
6334 }
6335
6336 /* Return an intersection of lists L1 and L2.  */
6337
6338 static Lisp_Object
6339 intersection (l1, l2)
6340      Lisp_Object l1, l2;
6341 {
6342   Lisp_Object val = Fcons (Qnil, Qnil), tail;
6343
6344   for (tail = val; CONSP (l1); l1 = XCDR (l1))
6345     {
6346       if (!NILP (Fmemq (XCAR (l1), l2)))
6347         {
6348           XSETCDR (tail, Fcons (XCAR (l1), Qnil));
6349           tail = XCDR (tail);
6350         }
6351     }
6352   return XCDR (val);
6353 }
6354
6355
6356 /*  Subroutine for Fsafe_coding_systems_region_internal.
6357
6358     Return a list of coding systems that safely encode the multibyte
6359     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
6360     possible coding systems.  If it is nil, it means that we have not
6361     yet found any coding systems.
6362
6363     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
6364     element of WORK_TABLE is set to t once the element is looked up.
6365
6366     If a non-ASCII single byte char is found, set
6367     *single_byte_char_found to 1.  */
6368
6369 static Lisp_Object
6370 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6371      unsigned char *p, *pend;
6372      Lisp_Object safe_codings, work_table;
6373      int *single_byte_char_found;
6374 {
6375   int c, len, idx;
6376   Lisp_Object val;
6377
6378   while (p < pend)
6379     {
6380       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6381       p += len;
6382       if (ASCII_BYTE_P (c))
6383         /* We can ignore ASCII characters here.  */
6384         continue;
6385       if (SINGLE_BYTE_CHAR_P (c))
6386         *single_byte_char_found = 1;
6387       if (NILP (safe_codings))
6388         continue;
6389       /* Check the safe coding systems for C.  */
6390       val = char_table_ref_and_index (work_table, c, &idx);
6391       if (EQ (val, Qt))
6392         /* This element was already checked.  Ignore it.  */
6393         continue;
6394       /* Remember that we checked this element.  */
6395       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
6396
6397       /* If there are some safe coding systems for C and we have
6398          already found the other set of coding systems for the
6399          different characters, get the intersection of them.  */
6400       if (!EQ (safe_codings, Qt) && !NILP (val))
6401         val = intersection (safe_codings, val);
6402       safe_codings = val;
6403     }
6404   return safe_codings;
6405 }
6406
6407
6408 /* Return a list of coding systems that safely encode the text between
6409    START and END.  If the text contains only ASCII or is unibyte,
6410    return t.  */
6411
6412 DEFUN ("find-coding-systems-region-internal",
6413        Ffind_coding_systems_region_internal,
6414        Sfind_coding_systems_region_internal, 2, 2, 0,
6415        doc: /* Internal use only.  */)
6416      (start, end)
6417      Lisp_Object start, end;
6418 {
6419   Lisp_Object work_table, safe_codings;
6420   int non_ascii_p = 0;
6421   int single_byte_char_found = 0;
6422   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6423
6424   if (STRINGP (start))
6425     {
6426       if (!STRING_MULTIBYTE (start))
6427         return Qt;
6428       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6429       p2 = p2end = p1end;
6430       if (SCHARS (start) != SBYTES (start))
6431         non_ascii_p = 1;
6432     }
6433   else
6434     {
6435       int from, to, stop;
6436
6437       CHECK_NUMBER_COERCE_MARKER (start);
6438       CHECK_NUMBER_COERCE_MARKER (end);
6439       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6440         args_out_of_range (start, end);
6441       if (NILP (current_buffer->enable_multibyte_characters))
6442         return Qt;
6443       from = CHAR_TO_BYTE (XINT (start));
6444       to = CHAR_TO_BYTE (XINT (end));
6445       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6446       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6447       if (stop == to)
6448         p2 = p2end = p1end;
6449       else
6450         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6451       if (XINT (end) - XINT (start) != to - from)
6452         non_ascii_p = 1;
6453     }
6454
6455   if (!non_ascii_p)
6456     {
6457       /* We are sure that the text contains no multibyte character.
6458          Check if it contains eight-bit-graphic.  */
6459       p = p1;
6460       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6461       if (p == p1end)
6462         {
6463           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6464           if (p == p2end)
6465             return Qt;
6466         }
6467     }
6468
6469   /* The text contains non-ASCII characters.  */
6470   work_table = Fcopy_sequence (Vchar_coding_system_table);
6471   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
6472                                     &single_byte_char_found);
6473   if (p2 < p2end)
6474     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6475                                       &single_byte_char_found);
6476
6477   if (EQ (safe_codings, Qt))
6478     ; /* Nothing to be done.  */
6479   else if (!single_byte_char_found)
6480     {
6481       /* Append generic coding systems.  */
6482       Lisp_Object args[2];
6483       args[0] = safe_codings;
6484       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
6485                                         make_number (0));
6486       safe_codings = Fappend (2, args);
6487     }
6488   else
6489     safe_codings = Fcons (Qraw_text,
6490                           Fcons (Qemacs_mule,
6491                                  Fcons (Qno_conversion, safe_codings)));
6492   return safe_codings;
6493 }
6494
6495
6496 /* Search from position POS for such characters that are unencodable
6497    accoding to SAFE_CHARS, and return a list of their positions.  P
6498    points where in the memory the character at POS exists.  Limit the
6499    search at PEND or when Nth unencodable characters are found.
6500
6501    If SAFE_CHARS is a char table, an element for an unencodable
6502    character is nil.
6503
6504    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6505
6506    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6507    eight-bit-graphic characters are unencodable.  */
6508
6509 static Lisp_Object
6510 unencodable_char_position (safe_chars, pos, p, pend, n)
6511      Lisp_Object safe_chars;
6512      int pos;
6513      unsigned char *p, *pend;
6514      int n;
6515 {
6516   Lisp_Object pos_list;
6517
6518   pos_list = Qnil;
6519   while (p < pend)
6520     {
6521       int len;
6522       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6523
6524       if (c >= 128
6525           && (CHAR_TABLE_P (safe_chars)
6526               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6527               : (NILP (safe_chars) || c < 256)))
6528         {
6529           pos_list = Fcons (make_number (pos), pos_list);
6530           if (--n <= 0)
6531             break;
6532         }
6533       pos++;
6534       p += len;
6535     }
6536   return Fnreverse (pos_list);
6537 }
6538
6539
6540 DEFUN ("unencodable-char-position", Funencodable_char_position,
6541        Sunencodable_char_position, 3, 5, 0,
6542        doc: /*
6543 Return position of first un-encodable character in a region.
6544 START and END specfiy the region and CODING-SYSTEM specifies the
6545 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6546
6547 If optional 4th argument COUNT is non-nil, it specifies at most how
6548 many un-encodable characters to search.  In this case, the value is a
6549 list of positions.
6550
6551 If optional 5th argument STRING is non-nil, it is a string to search
6552 for un-encodable characters.  In that case, START and END are indexes
6553 to the string.  */)
6554      (start, end, coding_system, count, string)
6555      Lisp_Object start, end, coding_system, count, string;
6556 {
6557   int n;
6558   Lisp_Object safe_chars;
6559   struct coding_system coding;
6560   Lisp_Object positions;
6561   int from, to;
6562   unsigned char *p, *pend;
6563
6564   if (NILP (string))
6565     {
6566       validate_region (&start, &end);
6567       from = XINT (start);
6568       to = XINT (end);
6569       if (NILP (current_buffer->enable_multibyte_characters))
6570         return Qnil;
6571       p = CHAR_POS_ADDR (from);
6572       pend = CHAR_POS_ADDR (to);
6573     }
6574   else
6575     {
6576       CHECK_STRING (string);
6577       CHECK_NATNUM (start);
6578       CHECK_NATNUM (end);
6579       from = XINT (start);
6580       to = XINT (end);
6581       if (from > to
6582           || to > SCHARS (string))
6583         args_out_of_range_3 (string, start, end);
6584       if (! STRING_MULTIBYTE (string))
6585         return Qnil;
6586       p = SDATA (string) + string_char_to_byte (string, from);
6587       pend = SDATA (string) + string_char_to_byte (string, to);
6588     }
6589
6590   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6591
6592   if (NILP (count))
6593     n = 1;
6594   else
6595     {
6596       CHECK_NATNUM (count);
6597       n = XINT (count);
6598     }
6599
6600   if (coding.type == coding_type_no_conversion
6601       || coding.type == coding_type_raw_text)
6602     return Qnil;
6603
6604   if (coding.type == coding_type_undecided)
6605     safe_chars = Qnil;
6606   else
6607     safe_chars = coding_safe_chars (&coding);
6608
6609   if (STRINGP (string)
6610       || from >= GPT || to <= GPT)
6611     positions = unencodable_char_position (safe_chars, from, p, pend, n);
6612   else
6613     {
6614       Lisp_Object args[2];
6615
6616       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6617       n -= XINT (Flength (args[0]));
6618       if (n <= 0)
6619         positions = args[0];
6620       else
6621         {
6622           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6623                                                pend, n);
6624           positions = Fappend (2, args);
6625         }
6626     }
6627
6628   return  (NILP (count) ? Fcar (positions) : positions);
6629 }
6630
6631
6632 Lisp_Object
6633 code_convert_region1 (start, end, coding_system, encodep)
6634      Lisp_Object start, end, coding_system;
6635      int encodep;
6636 {
6637   struct coding_system coding;
6638   int from, to;
6639
6640   CHECK_NUMBER_COERCE_MARKER (start);
6641   CHECK_NUMBER_COERCE_MARKER (end);
6642   CHECK_SYMBOL (coding_system);
6643
6644   validate_region (&start, &end);
6645   from = XFASTINT (start);
6646   to = XFASTINT (end);
6647
6648   if (NILP (coding_system))
6649     return make_number (to - from);
6650
6651   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6652     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6653
6654   coding.mode |= CODING_MODE_LAST_BLOCK;
6655   coding.src_multibyte = coding.dst_multibyte
6656     = !NILP (current_buffer->enable_multibyte_characters);
6657   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6658                        &coding, encodep, 1);
6659   Vlast_coding_system_used = coding.symbol;
6660   return make_number (coding.produced_char);
6661 }
6662
6663 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6664        3, 3, "r\nzCoding system: ",
6665        doc: /* Decode the current region from the specified coding system.
6666 When called from a program, takes three arguments:
6667 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6668 This function sets `last-coding-system-used' to the precise coding system
6669 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6670 not fully specified.)
6671 It returns the length of the decoded text.  */)
6672      (start, end, coding_system)
6673      Lisp_Object start, end, coding_system;
6674 {
6675   return code_convert_region1 (start, end, coding_system, 0);
6676 }
6677
6678 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6679        3, 3, "r\nzCoding system: ",
6680        doc: /* Encode the current region into the specified coding system.
6681 When called from a program, takes three arguments:
6682 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6683 This function sets `last-coding-system-used' to the precise coding system
6684 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6685 not fully specified.)
6686 It returns the length of the encoded text.  */)
6687      (start, end, coding_system)
6688      Lisp_Object start, end, coding_system;
6689 {
6690   return code_convert_region1 (start, end, coding_system, 1);
6691 }
6692
6693 Lisp_Object
6694 code_convert_string1 (string, coding_system, nocopy, encodep)
6695      Lisp_Object string, coding_system, nocopy;
6696      int encodep;
6697 {
6698   struct coding_system coding;
6699
6700   CHECK_STRING (string);
6701   CHECK_SYMBOL (coding_system);
6702
6703   if (NILP (coding_system))
6704     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6705
6706   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6707     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6708
6709   coding.mode |= CODING_MODE_LAST_BLOCK;
6710   string = (encodep
6711             ? encode_coding_string (string, &coding, !NILP (nocopy))
6712             : decode_coding_string (string, &coding, !NILP (nocopy)));
6713   Vlast_coding_system_used = coding.symbol;
6714
6715   return string;
6716 }
6717
6718 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6719        2, 3, 0,
6720        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6721 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6722 if the decoding operation is trivial.
6723 This function sets `last-coding-system-used' to the precise coding system
6724 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6725 not fully specified.)  */)
6726      (string, coding_system, nocopy)
6727      Lisp_Object string, coding_system, nocopy;
6728 {
6729   return code_convert_string1 (string, coding_system, nocopy, 0);
6730 }
6731
6732 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6733        2, 3, 0,
6734        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6735 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6736 if the encoding operation is trivial.
6737 This function sets `last-coding-system-used' to the precise coding system
6738 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6739 not fully specified.)  */)
6740      (string, coding_system, nocopy)
6741      Lisp_Object string, coding_system, nocopy;
6742 {
6743   return code_convert_string1 (string, coding_system, nocopy, 1);
6744 }
6745
6746 /* Encode or decode STRING according to CODING_SYSTEM.
6747    Do not set Vlast_coding_system_used.
6748
6749    This function is called only from macros DECODE_FILE and
6750    ENCODE_FILE, thus we ignore character composition.  */
6751
6752 Lisp_Object
6753 code_convert_string_norecord (string, coding_system, encodep)
6754      Lisp_Object string, coding_system;
6755      int encodep;
6756 {
6757   struct coding_system coding;
6758
6759   CHECK_STRING (string);
6760   CHECK_SYMBOL (coding_system);
6761
6762   if (NILP (coding_system))
6763     return string;
6764
6765   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6766     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6767
6768   coding.composing = COMPOSITION_DISABLED;
6769   coding.mode |= CODING_MODE_LAST_BLOCK;
6770   return (encodep
6771           ? encode_coding_string (string, &coding, 1)
6772           : decode_coding_string (string, &coding, 1));
6773 }
6774 \f
6775 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6776        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6777 Return the corresponding character.  */)
6778      (code)
6779      Lisp_Object code;
6780 {
6781   unsigned char c1, c2, s1, s2;
6782   Lisp_Object val;
6783
6784   CHECK_NUMBER (code);
6785   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6786   if (s1 == 0)
6787     {
6788       if (s2 < 0x80)
6789         XSETFASTINT (val, s2);
6790       else if (s2 >= 0xA0 || s2 <= 0xDF)
6791         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6792       else
6793         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6794     }
6795   else
6796     {
6797       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
6798           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6799         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6800       DECODE_SJIS (s1, s2, c1, c2);
6801       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6802     }
6803   return val;
6804 }
6805
6806 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6807        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6808 Return the corresponding code in SJIS.  */)
6809      (ch)
6810      Lisp_Object ch;
6811 {
6812   int charset, c1, c2, s1, s2;
6813   Lisp_Object val;
6814
6815   CHECK_NUMBER (ch);
6816   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6817   if (charset == CHARSET_ASCII)
6818     {
6819       val = ch;
6820     }
6821   else if (charset == charset_jisx0208
6822            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6823     {
6824       ENCODE_SJIS (c1, c2, s1, s2);
6825       XSETFASTINT (val, (s1 << 8) | s2);
6826     }
6827   else if (charset == charset_katakana_jisx0201
6828            && c1 > 0x20 && c2 < 0xE0)
6829     {
6830       XSETFASTINT (val, c1 | 0x80);
6831     }
6832   else
6833     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6834   return val;
6835 }
6836
6837 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6838        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6839 Return the corresponding character.  */)
6840      (code)
6841      Lisp_Object code;
6842 {
6843   int charset;
6844   unsigned char b1, b2, c1, c2;
6845   Lisp_Object val;
6846
6847   CHECK_NUMBER (code);
6848   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6849   if (b1 == 0)
6850     {
6851       if (b2 >= 0x80)
6852         error ("Invalid BIG5 code: %x", XFASTINT (code));
6853       val = code;
6854     }
6855   else
6856     {
6857       if ((b1 < 0xA1 || b1 > 0xFE)
6858           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6859         error ("Invalid BIG5 code: %x", XFASTINT (code));
6860       DECODE_BIG5 (b1, b2, charset, c1, c2);
6861       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6862     }
6863   return val;
6864 }
6865
6866 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6867        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6868 Return the corresponding character code in Big5.  */)
6869      (ch)
6870      Lisp_Object ch;
6871 {
6872   int charset, c1, c2, b1, b2;
6873   Lisp_Object val;
6874
6875   CHECK_NUMBER (ch);
6876   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6877   if (charset == CHARSET_ASCII)
6878     {
6879       val = ch;
6880     }
6881   else if ((charset == charset_big5_1
6882             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6883            || (charset == charset_big5_2
6884                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6885     {
6886       ENCODE_BIG5 (charset, c1, c2, b1, b2);
6887       XSETFASTINT (val, (b1 << 8) | b2);
6888     }
6889   else
6890     error ("Can't encode to Big5: %d", XFASTINT (ch));
6891   return val;
6892 }
6893 \f
6894 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
6895        Sset_terminal_coding_system_internal, 1, 1, 0,
6896        doc: /* Internal use only.  */)
6897      (coding_system)
6898      Lisp_Object coding_system;
6899 {
6900   CHECK_SYMBOL (coding_system);
6901   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6902   /* We had better not send unsafe characters to terminal.  */
6903   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6904   /* Character composition should be disabled.  */
6905   terminal_coding.composing = COMPOSITION_DISABLED;
6906   /* Error notification should be suppressed.  */
6907   terminal_coding.suppress_error = 1;
6908   terminal_coding.src_multibyte = 1;
6909   terminal_coding.dst_multibyte = 0;
6910   return Qnil;
6911 }
6912
6913 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
6914        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6915        doc: /* Internal use only.  */)
6916      (coding_system)
6917      Lisp_Object coding_system;
6918 {
6919   CHECK_SYMBOL (coding_system);
6920   setup_coding_system (Fcheck_coding_system (coding_system),
6921                        &safe_terminal_coding);
6922   /* Character composition should be disabled.  */
6923   safe_terminal_coding.composing = COMPOSITION_DISABLED;
6924   /* Error notification should be suppressed.  */
6925   terminal_coding.suppress_error = 1;
6926   safe_terminal_coding.src_multibyte = 1;
6927   safe_terminal_coding.dst_multibyte = 0;
6928   return Qnil;
6929 }
6930
6931 DEFUN ("terminal-coding-system", Fterminal_coding_system,
6932        Sterminal_coding_system, 0, 0, 0,
6933        doc: /* Return coding system specified for terminal output.  */)
6934      ()
6935 {
6936   return terminal_coding.symbol;
6937 }
6938
6939 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
6940        Sset_keyboard_coding_system_internal, 1, 1, 0,
6941        doc: /* Internal use only.  */)
6942      (coding_system)
6943      Lisp_Object coding_system;
6944 {
6945   CHECK_SYMBOL (coding_system);
6946   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6947   /* Character composition should be disabled.  */
6948   keyboard_coding.composing = COMPOSITION_DISABLED;
6949   return Qnil;
6950 }
6951
6952 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
6953        Skeyboard_coding_system, 0, 0, 0,
6954        doc: /* Return coding system specified for decoding keyboard input.  */)
6955      ()
6956 {
6957   return keyboard_coding.symbol;
6958 }
6959
6960 \f
6961 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6962        Sfind_operation_coding_system,  1, MANY, 0,
6963        doc: /* Choose a coding system for an operation based on the target name.
6964 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
6965 DECODING-SYSTEM is the coding system to use for decoding
6966 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
6967 for encoding (in case OPERATION does encoding).
6968
6969 The first argument OPERATION specifies an I/O primitive:
6970   For file I/O, `insert-file-contents' or `write-region'.
6971   For process I/O, `call-process', `call-process-region', or `start-process'.
6972   For network I/O, `open-network-stream'.
6973
6974 The remaining arguments should be the same arguments that were passed
6975 to the primitive.  Depending on which primitive, one of those arguments
6976 is selected as the TARGET.  For example, if OPERATION does file I/O,
6977 whichever argument specifies the file name is TARGET.
6978
6979 TARGET has a meaning which depends on OPERATION:
6980   For file I/O, TARGET is a file name.
6981   For process I/O, TARGET is a process name.
6982   For network I/O, TARGET is a service name or a port number
6983
6984 This function looks up what specified for TARGET in,
6985 `file-coding-system-alist', `process-coding-system-alist',
6986 or `network-coding-system-alist' depending on OPERATION.
6987 They may specify a coding system, a cons of coding systems,
6988 or a function symbol to call.
6989 In the last case, we call the function with one argument,
6990 which is a list of all the arguments given to this function.
6991
6992 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
6993      (nargs, args)
6994      int nargs;
6995      Lisp_Object *args;
6996 {
6997   Lisp_Object operation, target_idx, target, val;
6998   register Lisp_Object chain;
6999
7000   if (nargs < 2)
7001     error ("Too few arguments");
7002   operation = args[0];
7003   if (!SYMBOLP (operation)
7004       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7005     error ("Invalid first argument");
7006   if (nargs < 1 + XINT (target_idx))
7007     error ("Too few arguments for operation: %s",
7008            SDATA (SYMBOL_NAME (operation)));
7009   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7010      argument to write-region) is string, it must be treated as a
7011      target file name.  */
7012   if (EQ (operation, Qwrite_region)
7013       && nargs > 5
7014       && STRINGP (args[5]))
7015     target_idx = make_number (4);
7016   target = args[XINT (target_idx) + 1];
7017   if (!(STRINGP (target)
7018         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7019     error ("Invalid argument %d", XINT (target_idx) + 1);
7020
7021   chain = ((EQ (operation, Qinsert_file_contents)
7022             || EQ (operation, Qwrite_region))
7023            ? Vfile_coding_system_alist
7024            : (EQ (operation, Qopen_network_stream)
7025               ? Vnetwork_coding_system_alist
7026               : Vprocess_coding_system_alist));
7027   if (NILP (chain))
7028     return Qnil;
7029
7030   for (; CONSP (chain); chain = XCDR (chain))
7031     {
7032       Lisp_Object elt;
7033       elt = XCAR (chain);
7034
7035       if (CONSP (elt)
7036           && ((STRINGP (target)
7037                && STRINGP (XCAR (elt))
7038                && fast_string_match (XCAR (elt), target) >= 0)
7039               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7040         {
7041           val = XCDR (elt);
7042           /* Here, if VAL is both a valid coding system and a valid
7043              function symbol, we return VAL as a coding system.  */
7044           if (CONSP (val))
7045             return val;
7046           if (! SYMBOLP (val))
7047             return Qnil;
7048           if (! NILP (Fcoding_system_p (val)))
7049             return Fcons (val, val);
7050           if (! NILP (Ffboundp (val)))
7051             {
7052               val = call1 (val, Flist (nargs, args));
7053               if (CONSP (val))
7054                 return val;
7055               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7056                 return Fcons (val, val);
7057             }
7058           return Qnil;
7059         }
7060     }
7061   return Qnil;
7062 }
7063
7064 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7065        Supdate_coding_systems_internal, 0, 0, 0,
7066        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7067 When values of any coding categories are changed, you must
7068 call this function.  */)
7069      ()
7070 {
7071   int i;
7072
7073   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7074     {
7075       Lisp_Object val;
7076
7077       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7078       if (!NILP (val))
7079         {
7080           if (! coding_system_table[i])
7081             coding_system_table[i] = ((struct coding_system *)
7082                                       xmalloc (sizeof (struct coding_system)));
7083           setup_coding_system (val, coding_system_table[i]);
7084         }
7085       else if (coding_system_table[i])
7086         {
7087           xfree (coding_system_table[i]);
7088           coding_system_table[i] = NULL;
7089         }
7090     }
7091
7092   return Qnil;
7093 }
7094
7095 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7096        Sset_coding_priority_internal, 0, 0, 0,
7097        doc: /* Update internal database for the current value of `coding-category-list'.
7098 This function is internal use only.  */)
7099      ()
7100 {
7101   int i = 0, idx;
7102   Lisp_Object val;
7103
7104   val = Vcoding_category_list;
7105
7106   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7107     {
7108       if (! SYMBOLP (XCAR (val)))
7109         break;
7110       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7111       if (idx >= CODING_CATEGORY_IDX_MAX)
7112         break;
7113       coding_priorities[i++] = (1 << idx);
7114       val = XCDR (val);
7115     }
7116   /* If coding-category-list is valid and contains all coding
7117      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7118      the following code saves Emacs from crashing.  */
7119   while (i < CODING_CATEGORY_IDX_MAX)
7120     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7121
7122   return Qnil;
7123 }
7124
7125 #endif /* emacs */
7126
7127 \f
7128 /*** 9. Post-amble ***/
7129
7130 void
7131 init_coding_once ()
7132 {
7133   int i;
7134
7135   /* Emacs' internal format specific initialize routine.  */
7136   for (i = 0; i <= 0x20; i++)
7137     emacs_code_class[i] = EMACS_control_code;
7138   emacs_code_class[0x0A] = EMACS_linefeed_code;
7139   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7140   for (i = 0x21 ; i < 0x7F; i++)
7141     emacs_code_class[i] = EMACS_ascii_code;
7142   emacs_code_class[0x7F] = EMACS_control_code;
7143   for (i = 0x80; i < 0xFF; i++)
7144     emacs_code_class[i] = EMACS_invalid_code;
7145   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7146   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7147   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7148   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7149
7150   /* ISO2022 specific initialize routine.  */
7151   for (i = 0; i < 0x20; i++)
7152     iso_code_class[i] = ISO_control_0;
7153   for (i = 0x21; i < 0x7F; i++)
7154     iso_code_class[i] = ISO_graphic_plane_0;
7155   for (i = 0x80; i < 0xA0; i++)
7156     iso_code_class[i] = ISO_control_1;
7157   for (i = 0xA1; i < 0xFF; i++)
7158     iso_code_class[i] = ISO_graphic_plane_1;
7159   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7160   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7161   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7162   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7163   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7164   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7165   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7166   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7167   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7168   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7169
7170   setup_coding_system (Qnil, &keyboard_coding);
7171   setup_coding_system (Qnil, &terminal_coding);
7172   setup_coding_system (Qnil, &safe_terminal_coding);
7173   setup_coding_system (Qnil, &default_buffer_file_coding);
7174
7175   bzero (coding_system_table, sizeof coding_system_table);
7176
7177   bzero (ascii_skip_code, sizeof ascii_skip_code);
7178   for (i = 0; i < 128; i++)
7179     ascii_skip_code[i] = 1;
7180
7181 #if defined (MSDOS) || defined (WINDOWSNT)
7182   system_eol_type = CODING_EOL_CRLF;
7183 #else
7184   system_eol_type = CODING_EOL_LF;
7185 #endif
7186
7187   inhibit_pre_post_conversion = 0;
7188 }
7189
7190 #ifdef emacs
7191
7192 void
7193 syms_of_coding ()
7194 {
7195   Qtarget_idx = intern ("target-idx");
7196   staticpro (&Qtarget_idx);
7197
7198   Qcoding_system_history = intern ("coding-system-history");
7199   staticpro (&Qcoding_system_history);
7200   Fset (Qcoding_system_history, Qnil);
7201
7202   /* Target FILENAME is the first argument.  */
7203   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7204   /* Target FILENAME is the third argument.  */
7205   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7206
7207   Qcall_process = intern ("call-process");
7208   staticpro (&Qcall_process);
7209   /* Target PROGRAM is the first argument.  */
7210   Fput (Qcall_process, Qtarget_idx, make_number (0));
7211
7212   Qcall_process_region = intern ("call-process-region");
7213   staticpro (&Qcall_process_region);
7214   /* Target PROGRAM is the third argument.  */
7215   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7216
7217   Qstart_process = intern ("start-process");
7218   staticpro (&Qstart_process);
7219   /* Target PROGRAM is the third argument.  */
7220   Fput (Qstart_process, Qtarget_idx, make_number (2));
7221
7222   Qopen_network_stream = intern ("open-network-stream");
7223   staticpro (&Qopen_network_stream);
7224   /* Target SERVICE is the fourth argument.  */
7225   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7226
7227   Qcoding_system = intern ("coding-system");
7228   staticpro (&Qcoding_system);
7229
7230   Qeol_type = intern ("eol-type");
7231   staticpro (&Qeol_type);
7232
7233   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7234   staticpro (&Qbuffer_file_coding_system);
7235
7236   Qpost_read_conversion = intern ("post-read-conversion");
7237   staticpro (&Qpost_read_conversion);
7238
7239   Qpre_write_conversion = intern ("pre-write-conversion");
7240   staticpro (&Qpre_write_conversion);
7241
7242   Qno_conversion = intern ("no-conversion");
7243   staticpro (&Qno_conversion);
7244
7245   Qundecided = intern ("undecided");
7246   staticpro (&Qundecided);
7247
7248   Qcoding_system_p = intern ("coding-system-p");
7249   staticpro (&Qcoding_system_p);
7250
7251   Qcoding_system_error = intern ("coding-system-error");
7252   staticpro (&Qcoding_system_error);
7253
7254   Fput (Qcoding_system_error, Qerror_conditions,
7255         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7256   Fput (Qcoding_system_error, Qerror_message,
7257         build_string ("Invalid coding system"));
7258
7259   Qcoding_category = intern ("coding-category");
7260   staticpro (&Qcoding_category);
7261   Qcoding_category_index = intern ("coding-category-index");
7262   staticpro (&Qcoding_category_index);
7263
7264   Vcoding_category_table
7265     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7266   staticpro (&Vcoding_category_table);
7267   {
7268     int i;
7269     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7270       {
7271         XVECTOR (Vcoding_category_table)->contents[i]
7272           = intern (coding_category_name[i]);
7273         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7274               Qcoding_category_index, make_number (i));
7275       }
7276   }
7277
7278   Qtranslation_table = intern ("translation-table");
7279   staticpro (&Qtranslation_table);
7280   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7281
7282   Qtranslation_table_id = intern ("translation-table-id");
7283   staticpro (&Qtranslation_table_id);
7284
7285   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7286   staticpro (&Qtranslation_table_for_decode);
7287
7288   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7289   staticpro (&Qtranslation_table_for_encode);
7290
7291   Qsafe_chars = intern ("safe-chars");
7292   staticpro (&Qsafe_chars);
7293
7294   Qchar_coding_system = intern ("char-coding-system");
7295   staticpro (&Qchar_coding_system);
7296
7297   /* Intern this now in case it isn't already done.
7298      Setting this variable twice is harmless.
7299      But don't staticpro it here--that is done in alloc.c.  */
7300   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7301   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7302   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7303
7304   Qvalid_codes = intern ("valid-codes");
7305   staticpro (&Qvalid_codes);
7306
7307   Qemacs_mule = intern ("emacs-mule");
7308   staticpro (&Qemacs_mule);
7309
7310   Qraw_text = intern ("raw-text");
7311   staticpro (&Qraw_text);
7312
7313   defsubr (&Scoding_system_p);
7314   defsubr (&Sread_coding_system);
7315   defsubr (&Sread_non_nil_coding_system);
7316   defsubr (&Scheck_coding_system);
7317   defsubr (&Sdetect_coding_region);
7318   defsubr (&Sdetect_coding_string);
7319   defsubr (&Sfind_coding_systems_region_internal);
7320   defsubr (&Sunencodable_char_position);
7321   defsubr (&Sdecode_coding_region);
7322   defsubr (&Sencode_coding_region);
7323   defsubr (&Sdecode_coding_string);
7324   defsubr (&Sencode_coding_string);
7325   defsubr (&Sdecode_sjis_char);
7326   defsubr (&Sencode_sjis_char);
7327   defsubr (&Sdecode_big5_char);
7328   defsubr (&Sencode_big5_char);
7329   defsubr (&Sset_terminal_coding_system_internal);
7330   defsubr (&Sset_safe_terminal_coding_system_internal);
7331   defsubr (&Sterminal_coding_system);
7332   defsubr (&Sset_keyboard_coding_system_internal);
7333   defsubr (&Skeyboard_coding_system);
7334   defsubr (&Sfind_operation_coding_system);
7335   defsubr (&Supdate_coding_systems_internal);
7336   defsubr (&Sset_coding_priority_internal);
7337
7338   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7339                doc: /* List of coding systems.
7340
7341 Do not alter the value of this variable manually.  This variable should be
7342 updated by the functions `make-coding-system' and
7343 `define-coding-system-alias'.  */);
7344   Vcoding_system_list = Qnil;
7345
7346   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7347                doc: /* Alist of coding system names.
7348 Each element is one element list of coding system name.
7349 This variable is given to `completing-read' as TABLE argument.
7350
7351 Do not alter the value of this variable manually.  This variable should be
7352 updated by the functions `make-coding-system' and
7353 `define-coding-system-alias'.  */);
7354   Vcoding_system_alist = Qnil;
7355
7356   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7357                doc: /* List of coding-categories (symbols) ordered by priority.
7358
7359 On detecting a coding system, Emacs tries code detection algorithms
7360 associated with each coding-category one by one in this order.  When
7361 one algorithm agrees with a byte sequence of source text, the coding
7362 system bound to the corresponding coding-category is selected.  */);
7363   {
7364     int i;
7365
7366     Vcoding_category_list = Qnil;
7367     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7368       Vcoding_category_list
7369         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7370                  Vcoding_category_list);
7371   }
7372
7373   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7374                doc: /* Specify the coding system for read operations.
7375 It is useful to bind this variable with `let', but do not set it globally.
7376 If the value is a coding system, it is used for decoding on read operation.
7377 If not, an appropriate element is used from one of the coding system alists:
7378 There are three such tables, `file-coding-system-alist',
7379 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7380   Vcoding_system_for_read = Qnil;
7381
7382   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7383                doc: /* Specify the coding system for write operations.
7384 Programs bind this variable with `let', but you should not set it globally.
7385 If the value is a coding system, it is used for encoding of output,
7386 when writing it to a file and when sending it to a file or subprocess.
7387
7388 If this does not specify a coding system, an appropriate element
7389 is used from one of the coding system alists:
7390 There are three such tables, `file-coding-system-alist',
7391 `process-coding-system-alist', and `network-coding-system-alist'.
7392 For output to files, if the above procedure does not specify a coding system,
7393 the value of `buffer-file-coding-system' is used.  */);
7394   Vcoding_system_for_write = Qnil;
7395
7396   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7397                doc: /* Coding system used in the latest file or process I/O.  */);
7398   Vlast_coding_system_used = Qnil;
7399
7400   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7401                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7402 See info node `Coding Systems' and info node `Text and Binary' concerning
7403 such conversion.  */);
7404   inhibit_eol_conversion = 0;
7405
7406   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7407                doc: /* Non-nil means process buffer inherits coding system of process output.
7408 Bind it to t if the process output is to be treated as if it were a file
7409 read from some filesystem.  */);
7410   inherit_process_coding_system = 0;
7411
7412   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7413                doc: /* Alist to decide a coding system to use for a file I/O operation.
7414 The format is ((PATTERN . VAL) ...),
7415 where PATTERN is a regular expression matching a file name,
7416 VAL is a coding system, a cons of coding systems, or a function symbol.
7417 If VAL is a coding system, it is used for both decoding and encoding
7418 the file contents.
7419 If VAL is a cons of coding systems, the car part is used for decoding,
7420 and the cdr part is used for encoding.
7421 If VAL is a function symbol, the function must return a coding system
7422 or a cons of coding systems which are used as above.  The function gets
7423 the arguments with which `find-operation-coding-system' was called.
7424
7425 See also the function `find-operation-coding-system'
7426 and the variable `auto-coding-alist'.  */);
7427   Vfile_coding_system_alist = Qnil;
7428
7429   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7430     doc: /* Alist to decide a coding system to use for a process I/O operation.
7431 The format is ((PATTERN . VAL) ...),
7432 where PATTERN is a regular expression matching a program name,
7433 VAL is a coding system, a cons of coding systems, or a function symbol.
7434 If VAL is a coding system, it is used for both decoding what received
7435 from the program and encoding what sent to the program.
7436 If VAL is a cons of coding systems, the car part is used for decoding,
7437 and the cdr part is used for encoding.
7438 If VAL is a function symbol, the function must return a coding system
7439 or a cons of coding systems which are used as above.
7440
7441 See also the function `find-operation-coding-system'.  */);
7442   Vprocess_coding_system_alist = Qnil;
7443
7444   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7445     doc: /* Alist to decide a coding system to use for a network I/O operation.
7446 The format is ((PATTERN . VAL) ...),
7447 where PATTERN is a regular expression matching a network service name
7448 or is a port number to connect to,
7449 VAL is a coding system, a cons of coding systems, or a function symbol.
7450 If VAL is a coding system, it is used for both decoding what received
7451 from the network stream and encoding what sent to the network stream.
7452 If VAL is a cons of coding systems, the car part is used for decoding,
7453 and the cdr part is used for encoding.
7454 If VAL is a function symbol, the function must return a coding system
7455 or a cons of coding systems which are used as above.
7456
7457 See also the function `find-operation-coding-system'.  */);
7458   Vnetwork_coding_system_alist = Qnil;
7459
7460   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7461                doc: /* Coding system to use with system messages.
7462 Also used for decoding keyboard input on X Window system.  */);
7463   Vlocale_coding_system = Qnil;
7464
7465   /* The eol mnemonics are reset in startup.el system-dependently.  */
7466   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7467                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7468   eol_mnemonic_unix = build_string (":");
7469
7470   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7471                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7472   eol_mnemonic_dos = build_string ("\\");
7473
7474   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7475                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7476   eol_mnemonic_mac = build_string ("/");
7477
7478   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7479                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7480   eol_mnemonic_undecided = build_string (":");
7481
7482   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7483                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7484   Venable_character_translation = Qt;
7485
7486   DEFVAR_LISP ("standard-translation-table-for-decode",
7487                &Vstandard_translation_table_for_decode,
7488                doc: /* Table for translating characters while decoding.  */);
7489   Vstandard_translation_table_for_decode = Qnil;
7490
7491   DEFVAR_LISP ("standard-translation-table-for-encode",
7492                &Vstandard_translation_table_for_encode,
7493                doc: /* Table for translating characters while encoding.  */);
7494   Vstandard_translation_table_for_encode = Qnil;
7495
7496   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7497                doc: /* Alist of charsets vs revision numbers.
7498 While encoding, if a charset (car part of an element) is found,
7499 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7500   Vcharset_revision_alist = Qnil;
7501
7502   DEFVAR_LISP ("default-process-coding-system",
7503                &Vdefault_process_coding_system,
7504                doc: /* Cons of coding systems used for process I/O by default.
7505 The car part is used for decoding a process output,
7506 the cdr part is used for encoding a text to be sent to a process.  */);
7507   Vdefault_process_coding_system = Qnil;
7508
7509   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7510                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7511 This is a vector of length 256.
7512 If Nth element is non-nil, the existence of code N in a file
7513 \(or output of subprocess) doesn't prevent it to be detected as
7514 a coding system of ISO 2022 variant which has a flag
7515 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7516 or reading output of a subprocess.
7517 Only 128th through 159th elements has a meaning.  */);
7518   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7519
7520   DEFVAR_LISP ("select-safe-coding-system-function",
7521                &Vselect_safe_coding_system_function,
7522                doc: /* Function to call to select safe coding system for encoding a text.
7523
7524 If set, this function is called to force a user to select a proper
7525 coding system which can encode the text in the case that a default
7526 coding system used in each operation can't encode the text.
7527
7528 The default value is `select-safe-coding-system' (which see).  */);
7529   Vselect_safe_coding_system_function = Qnil;
7530
7531   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
7532                doc: /* Char-table containing safe coding systems of each characters.
7533 Each element doesn't include such generic coding systems that can
7534 encode any characters.  They are in the first extra slot.  */);
7535   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
7536
7537   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7538                &inhibit_iso_escape_detection,
7539                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7540
7541 By default, on reading a file, Emacs tries to detect how the text is
7542 encoded.  This code detection is sensitive to escape sequences.  If
7543 the sequence is valid as ISO2022, the code is determined as one of
7544 the ISO2022 encodings, and the file is decoded by the corresponding
7545 coding system (e.g. `iso-2022-7bit').
7546
7547 However, there may be a case that you want to read escape sequences in
7548 a file as is.  In such a case, you can set this variable to non-nil.
7549 Then, as the code detection ignores any escape sequences, no file is
7550 detected as encoded in some ISO2022 encoding.  The result is that all
7551 escape sequences become visible in a buffer.
7552
7553 The default value is nil, and it is strongly recommended not to change
7554 it.  That is because many Emacs Lisp source files that contain
7555 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7556 in Emacs's distribution, and they won't be decoded correctly on
7557 reading if you suppress escape sequence detection.
7558
7559 The other way to read escape sequences in a file without decoding is
7560 to explicitly specify some coding system that doesn't use ISO2022's
7561 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
7562   inhibit_iso_escape_detection = 0;
7563
7564   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7565                doc: /* Char table for translating self-inserting characters.
7566 This is applied to the result of input methods, not their input.  See also
7567 `keyboard-translate-table'.  */);
7568     Vtranslation_table_for_input = Qnil;
7569 }
7570
7571 char *
7572 emacs_strerror (error_number)
7573      int error_number;
7574 {
7575   char *str;
7576
7577   synchronize_system_messages_locale ();
7578   str = strerror (error_number);
7579
7580   if (! NILP (Vlocale_coding_system))
7581     {
7582       Lisp_Object dec = code_convert_string_norecord (build_string (str),
7583                                                       Vlocale_coding_system,
7584                                                       0);
7585       str = (char *) SDATA (dec);
7586     }
7587
7588   return str;
7589 }
7590
7591 #endif /* emacs */
7592