src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995,97,1998,2002,2003  Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002,2003  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting text in some other coding
  46   system to Emacs' internal format (emacs-mule), and when we say
  47   "encode", it means converting text in emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
 170   DST_BYTES zero means that the source area and destination area are
 171   overlapped, which means that we can produce encoded text until it
 172   reaches the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
 188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 189    get one and two bytes from the source text respectively.  If there
 190    are not enough bytes in the source, they set coding->result to
 191    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
 192    without advancing `src'.  The caller should set variables `coding',
 193    `src' and `src_end' to appropriate pointers in advance.  These macros
 194    are called from `decode_coding_XXX', so the source is assumed unibyte.  */
 195
 196 #define ONE_MORE_BYTE(c1)                                       \
 197   do {                                                          \
 198     if (src >= src_end)                                         \
 199       {                                                         \
 200         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 201         goto label_end_of_loop;                                 \
 202       }                                                         \
 203     c1 = *src++;                                                \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src + 1 >= src_end) /* fewer than two bytes left */     \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = *src++;                                                \
 214     c2 = *src++;                                                \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but if MULTIBYTEP is nonzero and the byte read is
 219    LEADING_CODE_8_BIT_CONTROL, read one more byte and set C1 to it minus 0x20.  */
 220 /* NOTE(review): that second read is not checked against `src_end' -- confirm callers never end on a leading code.  */
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 230       c1 = *src++ - 0x20;                                       \
 231   } while (0)
 232
 233 /* Set C to the next character at the source text pointed by `src' and
 234    advance `src' past it.  If no bytes are left in the source, jump to
 235    `label_end_of_loop'.  The caller should set variables `coding',
 236    `src', `src_end', and `translation_table' in advance; C is passed
 237    through translate_char unless `translation_table' is nil.  This macro
 238    is used in encoding routines `encode_coding_XXX', thus it assumes that
 239    the source text is in multibyte form except for 8-bit characters.
 240    8-bit characters are in multibyte form if coding->src_multibyte is
 241    nonzero, else they are represented by a single byte.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;                                      \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  Jump to
 264    `label_end_of_loop' if there's not enough space at `dst'; the limit
 265    is `dst_end', or `src' when `dst_bytes' is zero.
 266
 267    C is written to `dst' unless COMPOSING_P (coding) holds with a method
 268    other than COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE.  While
 269    COMPOSING_P holds with any method but COMPOSITION_RELATIVE, C is also
 270    recorded in coding->cmp_data->data as a component of the composition.
 271    This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298 /* Store byte C at `dst' and advance `dst'; jump to `label_end_of_loop' if no room is left.  */
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308 /* Store bytes C1 and C2 at `dst' if both fit; otherwise store nothing and jump to `label_end_of_loop'.  */
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {                                                          \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318 /* Copy the bytes from FROM up to TO into `dst' if they all fit; FROM itself is advanced to TO as a side effect.  */
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {                                                          \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)                                           \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348 #include "intervals.h"
 349
 350 #else  /* not emacs */
 351
 352 #include "mulelib.h"
 353
 354 #endif /* not emacs */
 355 /* Global Lisp objects of this file; their initialization is not visible in this part of the file.  */
 356 Lisp_Object Qcoding_system, Qeol_type;
 357 Lisp_Object Qbuffer_file_coding_system;
 358 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 359 Lisp_Object Qno_conversion, Qundecided;
 360 Lisp_Object Qcoding_system_history;
 361 Lisp_Object Qsafe_chars;
 362 Lisp_Object Qvalid_codes;
 363 /* Qinsert_file_contents and Qwrite_region are only declared here (extern); they are defined in another file.  */
 364 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 365 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 366 Lisp_Object Qstart_process, Qopen_network_stream;
 367 Lisp_Object Qtarget_idx;
 368
 369 /* If a symbol has this property, evaluate the value to define the
 370    symbol as a coding system.  */
 371 Lisp_Object Qcoding_system_define_form;
 372 /* NOTE(review): presumably a Lisp function used to pick a safe coding system -- confirm against its DEFVAR.  */
 373 Lisp_Object Vselect_safe_coding_system_function;
 374
 375 int coding_system_require_warning;
 376
 377 /* Mnemonic string for each format of end-of-line.  */
 378 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 379 /* Mnemonic string indicating that the end-of-line format is not yet
 380    decided.  */
 381 Lisp_Object eol_mnemonic_undecided;
 382
 383 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 384    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 385 int system_eol_type;
 386
 387 #ifdef emacs
 388
 389 /* Information about which coding system is safe for which chars.
 390    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 391
 392    GENERIC-LIST is a list of generic coding systems which can encode
 393    any characters.
 394
 395    NON-GENERIC-ALIST is an alist of non-generic coding systems vs. the
 396    corresponding char tables that contain their safe chars.  */
 397 Lisp_Object Vcoding_system_safe_chars;
 398
 399 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 400
 401 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 402
 403 /* Coding systems emacs-mule and raw-text are for converting only
 404    end-of-line format.  */
 405 Lisp_Object Qemacs_mule, Qraw_text;
 406
 407 Lisp_Object Qutf_8;
 408
 409 /* Coding systems are passed between Emacs Lisp programs and C internal
 410    routines by the following three variables.  */
 411 /* Coding-system for reading files and receiving data from process.  */
 412 Lisp_Object Vcoding_system_for_read;
 413 /* Coding-system for writing files and sending data to process.  */
 414 Lisp_Object Vcoding_system_for_write;
 415 /* Coding-system actually used in the latest I/O.  */
 416 Lisp_Object Vlast_coding_system_used;
 417
 418 /* A vector of length 256 which contains information about special
 419    Latin codes (especially for dealing with Microsoft codes).  */
 420 Lisp_Object Vlatin_extra_code_table;
 421
 422 /* Flag to inhibit code conversion of end-of-line format.  */
 423 int inhibit_eol_conversion;
 424
 425 /* Flag to inhibit ISO2022 escape sequence detection.  */
 426 int inhibit_iso_escape_detection;
 427
 428 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 429 int inherit_process_coding_system;
 430
 431 /* Coding system to be used to encode text for terminal display.  */
 432 struct coding_system terminal_coding;
 433
 434 /* Coding system to be used to encode text for terminal display when
 435    terminal coding system is nil.  */
 436 struct coding_system safe_terminal_coding;
 437
 438 /* Coding system of what is sent from terminal keyboard.  */
 439 struct coding_system keyboard_coding;
 440
 441 /* Default coding system to be used to write a file.  */
 442 struct coding_system default_buffer_file_coding;
 443 /* NOTE(review): the three alists below are presumably keyed by file name, program name and network service -- confirm.  */
 444 Lisp_Object Vfile_coding_system_alist;
 445 Lisp_Object Vprocess_coding_system_alist;
 446 Lisp_Object Vnetwork_coding_system_alist;
 447
 448 Lisp_Object Vlocale_coding_system;
 449
 450 #endif /* emacs */
 451
 452 Lisp_Object Qcoding_category, Qcoding_category_index;
 453
 454 /* List of symbols `coding-category-xxx' ordered by priority.  */
 455 Lisp_Object Vcoding_category_list;
 456
 457 /* Table of coding categories (Lisp symbols).  */
 458 Lisp_Object Vcoding_category_table;
 459
 460 /* Table of symbol names, one per coding category (15 names are listed).  */
 461 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 462   "coding-category-emacs-mule",
 463   "coding-category-sjis",
 464   "coding-category-iso-7",
 465   "coding-category-iso-7-tight",
 466   "coding-category-iso-8-1",
 467   "coding-category-iso-8-2",
 468   "coding-category-iso-7-else",
 469   "coding-category-iso-8-else",
 470   "coding-category-ccl",
 471   "coding-category-big5",
 472   "coding-category-utf-8",
 473   "coding-category-utf-16-be",
 474   "coding-category-utf-16-le",
 475   "coding-category-raw-text",
 476   "coding-category-binary"
 477 };
 478
 479 /* Table of pointers to the coding systems corresponding to each
 480    coding category.  */
 481 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 482
 483 /* Table of coding category masks.  The Nth element is the mask for the
 484    coding category whose priority is N.  */
 485 static
 486 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 487
 488 /* Flag to tell whether we look up a translation table on character
 489    code conversion.  */
 490 Lisp_Object Venable_character_translation;
 491 /* Standard translation table to look up on decoding (reading).  */
 492 Lisp_Object Vstandard_translation_table_for_decode;
 493 /* Standard translation table to look up on encoding (writing).  */
 494 Lisp_Object Vstandard_translation_table_for_encode;
 495
 496 Lisp_Object Qtranslation_table;
 497 Lisp_Object Qtranslation_table_id;
 498 Lisp_Object Qtranslation_table_for_decode;
 499 Lisp_Object Qtranslation_table_for_encode;
 500
 501 /* Alist of charsets vs revision number.  */
 502 Lisp_Object Vcharset_revision_alist;
 503
 504 /* Default coding systems used for process I/O.  */
 505 Lisp_Object Vdefault_process_coding_system;
 506
 507 /* Char table for translating Quail and self-inserting input.  */
 508 Lisp_Object Vtranslation_table_for_input;
 509
 510 /* Global flag to tell that we can't call post-read-conversion and
 511    pre-write-conversion functions.  Usually the value is zero, but it
 512    is set to 1 temporarily while such functions are running.  This is
 513    to avoid infinite recursion.  */
 514 static int inhibit_pre_post_conversion;
 515
 516 Lisp_Object Qchar_coding_system;
 517
 518 /* Return the `safe-chars' property of CODING_SYSTEM (a symbol) if it is
 519    a char table, else return Qt.  Don't check CODING_SYSTEM's validity.  */
 520 /* The property list is taken from element 3 of the symbol's `coding-system' vector.  */
 521 Lisp_Object
 522 coding_safe_chars (coding_system)
 523      Lisp_Object coding_system;
 524 {
 525   Lisp_Object coding_spec, plist, safe_chars;
 526
 527   coding_spec = Fget (coding_system, Qcoding_system);
 528   plist = XVECTOR (coding_spec)->contents[3];
 529   safe_chars = Fplist_get (plist, Qsafe_chars);
 530   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 531 }
 532 /* Nonzero if SAFE_CHARS is Qt, or if the entry for C in the char table SAFE_CHARS is non-nil.  */
 533 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 534   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 535
 536 \f
 537 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 538
 539 /* Emacs' internal format for representation of multiple character
 540    sets is a kind of multi-byte encoding, i.e. characters are
 541    represented by variable-length sequences of one-byte codes.
 542
 543    ASCII characters and control characters (e.g. `tab', `newline') are
 544    represented by one-byte sequences which are their ASCII codes, in
 545    the range 0x00 through 0x7F.
 546
 547    8-bit characters of the range 0x80..0x9F are represented by
 548    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 549    code + 0x20).
 550
 551    8-bit characters of the range 0xA0..0xFF are represented by
 552    one-byte sequences which are their 8-bit code.
 553
 554    The other characters are represented by a sequence of `base
 555    leading-code', optional `extended leading-code', and one or two
 556    `position-code's.  The length of the sequence is determined by the
 557    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 558    whereas extended leading-code and position-code take the range 0xA0
 559    through 0xFF.  See `charset.h' for more details about leading-code
 560    and position-code.
 561
 562    --- CODE RANGE of Emacs' internal format ---
 563    character set        range
 564    -------------        -----
 565    ascii                0x00..0x7F
 566    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 567    eight-bit-graphic    0xA0..0xFF
 568    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 569    ---------------------------------------------
 570
 571    As this is the internal character representation, the format is
 572    usually not used externally (i.e. in a file or in a data sent to a
 573    process).  But, it is possible to have a text externally in this
 574    format (i.e. by encoding by the coding system `emacs-mule').
 575
 576    In that case, a sequence of one-byte codes has a slightly different
 577    form.
 578
 579    Firstly, all characters in eight-bit-control are represented by
 580    one-byte sequences which are their 8-bit code.
 581
 582    Next, character composition data are represented by the byte
 583    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 584    where,
 585         METHOD is 0xF0 plus one of composition method (enum
 586         composition_method),
 587
 588         BYTES is 0xA0 plus the byte length of these composition data,
 589
 590         CHARS is 0xA0 plus the number of characters composed by these
 591         data,
 592
 593         COMPONENTs are characters in multibyte form or composition
 594         rules encoded as two bytes of ASCII codes.
 595
 596    In addition, for backward compatibility, the following formats are
 597    also recognized as composition data on decoding.
 598
 599    0x80 MSEQ ...
 600    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 601
 602    Here,
 603         MSEQ is a multibyte form but in this special format:
 604           ASCII: 0xA0 ASCII_CODE+0x80,
 605           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 606         RULE is a one byte code of the range 0xA0..0xF0 that
 607         represents a composition rule.
 608   */
 609 /* 256-entry table with no initializer here.  NOTE(review): presumably indexed by byte value and filled in elsewhere -- confirm.  */
 610 enum emacs_code_class_type emacs_code_class[256];
 611
 612 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 613    Check if the text between SRC and SRC_END is encoded in Emacs' internal
 614    format.  If it is, return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 615
 616 static int
 617 detect_coding_emacs_mule (src, src_end, multibytep)
 618       unsigned char *src, *src_end;
 619       int multibytep;
 620 {
 621   unsigned char c;
 622   int composing = 0;            /* Nonzero after an 0x80 byte, until a byte below 0xA0.  */
 623   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE, which stores into coding->result.  */
 624   struct coding_system dummy_coding;
 625   struct coding_system *coding = &dummy_coding;
 626
 627   while (1)
 628     {
 629       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);   /* Leaves the loop when the source is exhausted.  */
 630
 631       if (composing)
 632         {
 633           if (c < 0xA0)         /* Not a component byte: the composition sequence is over.  */
 634             composing = 0;
 635           else if (c == 0xA0)   /* 0xA0 is followed by an ASCII code with the high bit set.  */
 636             {
 637               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 638               c &= 0x7F;
 639             }
 640           else                  /* Otherwise undo the 0x20 offset before the checks below.  */
 641             c -= 0x20;
 642         }
 643
 644       if (c < 0x20)
 645         {
 646           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)   /* These reject the text.  */
 647             return 0;
 648         }
 649       else if (c >= 0x80 && c < 0xA0)
 650         {
 651           if (c == 0x80)
 652             /* Old leading code for a composite character.  */
 653             composing = 1;
 654           else
 655             {
 656               unsigned char *src_base = src - 1;   /* Address of the byte most recently read.  */
 657               int bytes;
 658
 659               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 660                                                bytes))
 661                 return 0;
 662               src = src_base + bytes;   /* Skip the whole BYTES-long sequence.  */
 663             }
 664         }
 665     }
 666  label_end_of_loop:   /* Reached from ONE_MORE_BYTE_CHECK_MULTIBYTE when src >= src_end.  */
 667   return CODING_CATEGORY_MASK_EMACS_MULE;
 668 }
 669
 670
 671 /* Record the starting position START and METHOD of one composition.  */
 672 /* Reserves four ints in cmp_data->data: [0] = -1, [1] = char_offset + START, [3] = METHOD; [2] is not set here.  */
 673 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 674   do {                                                          \
 675     struct composition_data *cmp_data = coding->cmp_data;       \
 676     int *data = cmp_data->data + cmp_data->used;                \
 677     coding->cmp_data_start = cmp_data->used;                    \
 678     data[0] = -1;                                               \
 679     data[1] = cmp_data->char_offset + start;                    \
 680     data[3] = (int) method;                                     \
 681     cmp_data->used += 4;                                        \
 682   } while (0)
 683
 684 /* Record the ending position END of the current composition.  */
 685 /* Sets data[0] to the number of ints used since cmp_data_start and data[2] to char_offset + END.  */
 686 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 687   do {                                                          \
 688     struct composition_data *cmp_data = coding->cmp_data;       \
 689     int *data = cmp_data->data + coding->cmp_data_start;        \
 690     data[0] = cmp_data->used - coding->cmp_data_start;          \
 691     data[2] = cmp_data->char_offset + end;                      \
 692   } while (0)
 693
 694 /* Record one COMPONENT (alternate character or composition rule).  */
 695 /* Once COMPOSITION_DATA_MAX_BUNCH_LENGTH ints are used, the composition is ended and composing is reset.  */
 696 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 697   do {                                                                  \
 698     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 699     if (coding->cmp_data->used - coding->cmp_data_start                 \
 700         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 701       {                                                                 \
 702         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 703         coding->composing = COMPOSITION_NO;                             \
 704       }                                                                 \
 705   } while (0)
 706
 707
 708 /* Get one byte from the data pointed to by `src' and increment `src'.
 709    If `src' is not less than `src_end', yield -1 without incrementing `src'.  */
 710
 711 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 712
 713
 714 /* Decode a character represented as a component of composition
 715    sequence of Emacs 20 style at SRC.  Set C to that character, store
 716    its multibyte form sequence at P, and set P to the end of that
 717    sequence.  If no valid character is found, set C to -1.  */
 718
 719 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 720   do {                                                          \
 721     int bytes;                                                  \
 722                                                                 \
 723     c = SAFE_ONE_MORE_BYTE ();                                  \
 724     if (c < 0)                                                  \
 725       break;                                                    \
 726     if (CHAR_HEAD_P (c))                                        \
 727       c = -1;                                                   \
 728     else if (c == 0xA0)                                         \
 729       {                                                         \
 730         c = SAFE_ONE_MORE_BYTE ();                              \
 731         if (c < 0xA0)                                           \
 732           c = -1;                                               \
 733         else                                                    \
 734           {                                                     \
 735             c -= 0xA0;                                          \
 736             *p++ = c;                                           \
 737           }                                                     \
 738       }                                                         \
 739     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 740       {                                                         \
 741         unsigned char *p0 = p;                                  \
 742                                                                 \
 743         c -= 0x20;                                              \
 744         *p++ = c;                                               \
 745         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 746         while (--bytes)                                         \
 747           {                                                     \
 748             c = SAFE_ONE_MORE_BYTE ();                          \
 749             if (c < 0)                                          \
 750               break;                                            \
 751             *p++ = c;                                           \
 752           }                                                     \
 753         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 754             || (coding->flags /* We are recovering a file.  */  \
 755                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 756                 && ! CHAR_HEAD_P (p0[1])))                      \
 757           c = STRING_CHAR (p0, bytes);                          \
 758         else                                                    \
 759           c = -1;                                               \
 760       }                                                         \
 761     else                                                        \
 762       c = -1;                                                   \
 763   } while (0)
 764
 765
 766 /* Decode a composition rule represented as a component of composition
 767    sequence of Emacs 20 style at SRC.  Set C to the rule.  If not
 768    valid rule is found, set C to -1.  */
 769
 770 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 771   do {                                                  \
 772     c = SAFE_ONE_MORE_BYTE ();                          \
 773     c -= 0xA0;                                          \
 774     if (c < 0 || c >= 81)                               \
 775       c = -1;                                           \
 776     else                                                \
 777       {                                                 \
 778         gref = c / 9, nref = c % 9;                     \
 779         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 780       }                                                 \
 781   } while (0)
 782
 783
 784 /* Decode composition sequence encoded by `emacs-mule' at the source
 785    pointed by SRC.  SRC_END is the end of source.  Store information
 786    of the composition in CODING->cmp_data.
 787
 788    For backward compatibility, decode also a composition sequence of
 789    Emacs 20 style.  In that case, the composition sequence contains
 790    characters that should be extracted into a buffer or string.  Store
 791    those characters at *DESTINATION in multibyte form.
 792
 793    If we encounter an invalid byte sequence, return 0.
 794    If we encounter an insufficient source or destination, or
 795    insufficient space in CODING->cmp_data, return 1.
 796    Otherwise, return consumed bytes in the source.
 797
 798 */
 799 static INLINE int
 800 decode_composition_emacs_mule (coding, src, src_end,
 801                                destination, dst_end, dst_bytes)
 802      struct coding_system *coding;
 803      unsigned char *src, *src_end, **destination, *dst_end;
 804      int dst_bytes;
 805 {
 806   unsigned char *dst = *destination;
 807   int method, data_len, nchars;
 808   unsigned char *src_base = src++;
 809   /* Store components of composition.  */
 810   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 811   int ncomponent;
 812   /* Store multibyte form of characters to be composed.  This is for
 813      Emacs 20 style composition sequence.  */
 814   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 815   unsigned char *bufp = buf;
 816   int c, i, gref, nref;
 817
 818   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 819       >= COMPOSITION_DATA_SIZE)
 820     {
 821       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 822       return -1;
 823     }
 824
 825   ONE_MORE_BYTE (c);
 826   if (c - 0xF0 >= COMPOSITION_RELATIVE
 827            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 828     {
 829       int with_rule;
 830
 831       method = c - 0xF0;
 832       with_rule = (method == COMPOSITION_WITH_RULE
 833                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 834       ONE_MORE_BYTE (c);
 835       data_len = c - 0xA0;
 836       if (data_len < 4
 837           || src_base + data_len > src_end)
 838         return 0;
 839       ONE_MORE_BYTE (c);
 840       nchars = c - 0xA0;
 841       if (c < 1)
 842         return 0;
 843       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 844         {
 845           /* If it is longer than this, it can't be valid.  */
 846           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 847             return 0;
 848
 849           if (ncomponent % 2 && with_rule)
 850             {
 851               ONE_MORE_BYTE (gref);
 852               gref -= 32;
 853               ONE_MORE_BYTE (nref);
 854               nref -= 32;
 855               c = COMPOSITION_ENCODE_RULE (gref, nref);
 856             }
 857           else
 858             {
 859               int bytes;
 860               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 861                   || (coding->flags /* We are recovering a file.  */
 862                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 863                       && ! CHAR_HEAD_P (src[1])))
 864                 c = STRING_CHAR (src, bytes);
 865               else
 866                 c = *src, bytes = 1;
 867               src += bytes;
 868             }
 869           component[ncomponent] = c;
 870         }
 871     }
 872   else
 873     {
 874       /* This may be an old Emacs 20 style format.  See the comment at
 875          the section 2 of this file.  */
 876       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 877       if (src == src_end
 878           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 879         goto label_end_of_loop;
 880
 881       src_end = src;
 882       src = src_base + 1;
 883       if (c < 0xC0)
 884         {
 885           method = COMPOSITION_RELATIVE;
 886           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 887             {
 888               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 889               if (c < 0)
 890                 break;
 891               component[ncomponent++] = c;
 892             }
 893           if (ncomponent < 2)
 894             return 0;
 895           nchars = ncomponent;
 896         }
 897       else if (c == 0xFF)
 898         {
 899           method = COMPOSITION_WITH_RULE;
 900           src++;
 901           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 902           if (c < 0)
 903             return 0;
 904           component[0] = c;
 905           for (ncomponent = 1;
 906                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 907             {
 908               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 909               if (c < 0)
 910                 break;
 911               component[ncomponent++] = c;
 912               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 913               if (c < 0)
 914                 break;
 915               component[ncomponent++] = c;
 916             }
 917           if (ncomponent < 3)
 918             return 0;
 919           nchars = (ncomponent + 1) / 2;
 920         }
 921       else
 922         return 0;
 923     }
 924
 925   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 926     {
 927       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 928       for (i = 0; i < ncomponent; i++)
 929         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 930       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 931       if (buf < bufp)
 932         {
 933           unsigned char *p = buf;
 934           EMIT_BYTES (p, bufp);
 935           *destination += bufp - buf;
 936           coding->produced_char += nchars;
 937         }
 938       return (src - src_base);
 939     }
 940  label_end_of_loop:
 941   return -1;
 942 }
 943
 944 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 945
 946 static void
 947 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 948      struct coding_system *coding;
 949      unsigned char *source, *destination;
 950      int src_bytes, dst_bytes;
 951 {
 952   unsigned char *src = source;
 953   unsigned char *src_end = source + src_bytes;
 954   unsigned char *dst = destination;
 955   unsigned char *dst_end = destination + dst_bytes;
 956   /* SRC_BASE remembers the start position in source in each loop.
 957      The loop will be exited when there's not enough source code, or
 958      when there's not enough destination area to produce a
 959      character.  */
 960   unsigned char *src_base;
 961
 962   coding->produced_char = 0;
 963   while ((src_base = src) < src_end)
 964     {
 965       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 966       int bytes;
 967
 968       if (*src == '\r')
 969         {
 970           int c = *src++;
 971
 972           if (coding->eol_type == CODING_EOL_CR)
 973             c = '\n';
 974           else if (coding->eol_type == CODING_EOL_CRLF)
 975             {
 976               ONE_MORE_BYTE (c);
 977               if (c != '\n')
 978                 {
 979                   src--;
 980                   c = '\r';
 981                 }
 982             }
 983           *dst++ = c;
 984           coding->produced_char++;
 985           continue;
 986         }
 987       else if (*src == '\n')
 988         {
 989           if ((coding->eol_type == CODING_EOL_CR
 990                || coding->eol_type == CODING_EOL_CRLF)
 991               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 992             {
 993               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 994               goto label_end_of_loop;
 995             }
 996           *dst++ = *src++;
 997           coding->produced_char++;
 998           continue;
 999         }
1000       else if (*src == 0x80 && coding->cmp_data)
1001         {
1002           /* Start of composition data.  */
1003           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1004                                                          &dst, dst_end,
1005                                                          dst_bytes);
1006           if (consumed < 0)
1007             goto label_end_of_loop;
1008           else if (consumed > 0)
1009             {
1010               src += consumed;
1011               continue;
1012             }
1013           bytes = CHAR_STRING (*src, tmp);
1014           p = tmp;
1015           src++;
1016         }
1017       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1018                || (coding->flags /* We are recovering a file.  */
1019                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1020                    && ! CHAR_HEAD_P (src[1])))
1021         {
1022           p = src;
1023           src += bytes;
1024         }
1025       else
1026         {
1027           int i, c;
1028
1029           bytes = BYTES_BY_CHAR_HEAD (*src);
1030           src++;
1031           for (i = 1; i < bytes; i++)
1032             {
1033               ONE_MORE_BYTE (c);
1034               if (CHAR_HEAD_P (c))
1035                 break;
1036             }
1037           if (i < bytes)
1038             {
1039               bytes = CHAR_STRING (*src_base, tmp);
1040               p = tmp;
1041               src = src_base + 1;
1042             }
1043           else
1044             {
1045               p = src_base;
1046             }
1047         }
1048       if (dst + bytes >= (dst_bytes ? dst_end : src))
1049         {
1050           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1051           break;
1052         }
1053       while (bytes--) *dst++ = *p++;
1054       coding->produced_char++;
1055     }
1056  label_end_of_loop:
1057   coding->consumed = coding->consumed_char = src_base - source;
1058   coding->produced = dst - destination;
1059 }
1060
1061
/* Emit the composition described by DATA as a special byte sequence
   that starts with 0x80.  Afterwards advance CODING->cmp_data_start
   past DATA and, when the current CODING->cmp_data is used up and has
   a successor, switch CODING->cmp_data to that successor for the next
   call.  If the destination is too small, set CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to label_end_of_loop.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char seq[1024];                                            \
    unsigned char *tail = seq + 4;                                      \
    unsigned char *scan;                                                \
    int n_data = data[0];                                               \
    int k;                                                              \
                                                                        \
    /* Four byte header; seq[2] is filled in after the components,      \
       when the length of the sequence is known.  */                    \
    seq[0] = 0x80;                                                      \
    seq[1] = 0xF0 + data[3];    /* METHOD */                            \
    seq[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
    if (data[3] != COMPOSITION_WITH_RULE                                \
        && data[3] != COMPOSITION_WITH_RULE_ALTCHARS)                   \
      {                                                                 \
        /* Characters only.  */                                         \
        for (k = 4; k < n_data; k++)                                    \
          tail += CHAR_STRING (data[k], tail);                          \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* A character, then (rule, character) pairs; each rule is      \
           written as two bytes biased by 0x20.  */                     \
        tail += CHAR_STRING (data[4], tail);                            \
        for (k = 5; k < n_data; k += 2)                                 \
          {                                                             \
            int gref, nref;                                             \
                                                                        \
            COMPOSITION_DECODE_RULE (data[k], gref, nref);              \
            *tail++ = 0x20 + gref;                                      \
            *tail++ = 0x20 + nref;                                      \
            tail += CHAR_STRING (data[k + 1], tail);                    \
          }                                                             \
      }                                                                 \
    seq[2] = 0xA0 + (tail - seq);       /* COMPONENTS-BYTES */          \
                                                                        \
    if (dst + (tail - seq) + 4 > (dst_bytes ? dst_end : src))           \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (scan = seq; scan < tail; scan++)                               \
      *dst++ = *scan;                                                   \
    coding->cmp_data_start += n_data;                                   \
    if (coding->cmp_data->next                                          \
        && coding->cmp_data_start == coding->cmp_data->used)            \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
1111
1112
1113 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1114                             unsigned char *, int, int));
1115
1116 static void
1117 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1118      struct coding_system *coding;
1119      unsigned char *source, *destination;
1120      int src_bytes, dst_bytes;
1121 {
1122   unsigned char *src = source;
1123   unsigned char *src_end = source + src_bytes;
1124   unsigned char *dst = destination;
1125   unsigned char *dst_end = destination + dst_bytes;
1126   unsigned char *src_base;
1127   int c;
1128   int char_offset;
1129   int *data;
1130
1131   Lisp_Object translation_table;
1132
1133   translation_table = Qnil;
1134
1135   /* Optimization for the case that there's no composition.  */
1136   if (!coding->cmp_data || coding->cmp_data->used == 0)
1137     {
1138       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1139       return;
1140     }
1141
1142   char_offset = coding->cmp_data->char_offset;
1143   data = coding->cmp_data->data + coding->cmp_data_start;
1144   while (1)
1145     {
1146       src_base = src;
1147
1148       /* If SRC starts a composition, encode the information about the
1149          composition in advance.  */
1150       if (coding->cmp_data_start < coding->cmp_data->used
1151           && char_offset + coding->consumed_char == data[1])
1152         {
1153           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1154           char_offset = coding->cmp_data->char_offset;
1155           data = coding->cmp_data->data + coding->cmp_data_start;
1156         }
1157
1158       ONE_MORE_CHAR (c);
1159       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1160                         || coding->eol_type == CODING_EOL_CR))
1161         {
1162           if (coding->eol_type == CODING_EOL_CRLF)
1163             EMIT_TWO_BYTES ('\r', c);
1164           else
1165             EMIT_ONE_BYTE ('\r');
1166         }
1167       else if (SINGLE_BYTE_CHAR_P (c))
1168         {
1169           if (coding->flags && ! ASCII_BYTE_P (c))
1170             {
1171               /* As we are auto saving, retain the multibyte form for
1172                  8-bit chars.  */
1173               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1174               int bytes = CHAR_STRING (c, buf);
1175
1176               if (bytes == 1)
1177                 EMIT_ONE_BYTE (buf[0]);
1178               else
1179                 EMIT_TWO_BYTES (buf[0], buf[1]);
1180             }
1181           else
1182             EMIT_ONE_BYTE (c);
1183         }
1184       else
1185         EMIT_BYTES (src_base, src);
1186       coding->consumed_char++;
1187     }
1188  label_end_of_loop:
1189   coding->consumed = src_base - source;
1190   coding->produced = coding->produced_char = dst - destination;
1191   return;
1192 }
1193
1194 \f
1195 /*** 3. ISO2022 handlers ***/
1196
1197 /* The following note describes the coding system ISO2022 briefly.
1198    Since the intention of this note is to help understand the
1199    functions in this file, some parts are NOT ACCURATE or are OVERLY
1200    SIMPLIFIED.  For thorough understanding, please refer to the
1201    original document of ISO2022.  This is equivalent to the standard
1202    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1203
1204    ISO2022 provides many mechanisms to encode several character sets
1205    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1206    is encoded using bytes less than 128.  This may make the encoded
1207    text a little bit longer, but the text passes more easily through
1208    several types of gateway, some of which strip off the MSB (Most
1209    Significant Bit).
1210
1211    There are two kinds of character sets: control character sets and
1212    graphic character sets.  The former contain control characters such
1213    as `newline' and `escape' to provide control functions (control
1214    functions are also provided by escape sequences).  The latter
1215    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1216    two control character sets and many graphic character sets.
1217
1218    Graphic character sets are classified into one of the following
1219    four classes, according to the number of bytes (DIMENSION) and
1220    number of characters in one dimension (CHARS) of the set:
1221    - DIMENSION1_CHARS94
1222    - DIMENSION1_CHARS96
1223    - DIMENSION2_CHARS94
1224    - DIMENSION2_CHARS96
1225
1226    In addition, each character set is assigned an identification tag,
1227    unique for each set, called the "final character" (denoted as <F>
1228    hereafter).  The <F> of each character set is decided by ECMA(*)
1229    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1230    (0x30..0x3F are for private use only).
1231
1232    Note (*): ECMA = European Computer Manufacturers Association
1233
1234    Here are examples of graphic character sets [NAME(<F>)]:
1235         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1236         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1237         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1238         o DIMENSION2_CHARS96 -- none for the moment
1239
1240    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1241         C0 [0x00..0x1F] -- control character plane 0
1242         GL [0x20..0x7F] -- graphic character plane 0
1243         C1 [0x80..0x9F] -- control character plane 1
1244         GR [0xA0..0xFF] -- graphic character plane 1
1245
1246    A control character set is directly designated and invoked to C0 or
1247    C1 by an escape sequence.  The most common case is that:
1248    - ISO646's  control character set is designated/invoked to C0, and
1249    - ISO6429's control character set is designated/invoked to C1,
1250    and usually these designations/invocations are omitted in encoded
1251    text.  In a 7-bit environment, only C0 can be used, and a control
1252    character for C1 is encoded by an appropriate escape sequence to
1253    fit into the environment.  All control characters for C1 are
1254    defined to have corresponding escape sequences.
1255
1256    A graphic character set is at first designated to one of four
1257    graphic registers (G0 through G3), then these graphic registers are
1258    invoked to GL or GR.  These designations and invocations can be
1259    done independently.  The most common case is that G0 is invoked to
1260    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1261    these invocations and designations are omitted in encoded text.
1262    In a 7-bit environment, only GL can be used.
1263
1264    When a graphic character set of CHARS94 is invoked to GL, codes
1265    0x20 and 0x7F of the GL area work as control characters SPACE and
1266    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1267    be used.
1268
1269    There are two ways of invocation: locking-shift and single-shift.
1270    With locking-shift, the invocation lasts until the next different
1271    invocation, whereas with single-shift, the invocation affects the
1272    following character only and doesn't affect the locking-shift
1273    state.  Invocations are done by the following control characters or
1274    escape sequences:
1275
1276    ----------------------------------------------------------------------
1277    abbrev  function                  cntrl escape seq   description
1278    ----------------------------------------------------------------------
1279    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1280    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1281    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1282    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1283    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1284    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1285    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1286    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1287    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1288    ----------------------------------------------------------------------
1289    (*) These are not used by any known coding system.
1290
1291    Control characters for these functions are defined by macros
1292    ISO_CODE_XXX in `coding.h'.
1293
1294    Designations are done by the following escape sequences:
1295    ----------------------------------------------------------------------
1296    escape sequence      description
1297    ----------------------------------------------------------------------
1298    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1299    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1300    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1301    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1302    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1303    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1304    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1305    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1306    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1307    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1308    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1309    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1310    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1311    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1312    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1313    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1314    ----------------------------------------------------------------------
1315
1316    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1317    of dimension 1, chars 94, and final character <F>, etc...
1318
1319    Note (*): Although these designations are not allowed in ISO2022,
1320    Emacs accepts them on decoding, and produces them on encoding
1321    CHARS96 character sets in a coding system which is characterized as
1322    7-bit environment, non-locking-shift, and non-single-shift.
1323
1324    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1325    '(' can be omitted.  We refer to this as "short-form" hereafter.
1326
1327    Now you may notice that there are a lot of ways of encoding the
1328    same multilingual text in ISO2022.  Actually, there exist many
1329    coding systems such as Compound Text (used in X11's inter client
1330    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1331    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1332    localized platforms), and all of these are variants of ISO2022.
1333
1334    In addition to the above, Emacs handles two more kinds of escape
1335    sequences: ISO6429's direction specification and Emacs' private
1336    sequence for specifying character composition.
1337
1338    ISO6429's direction specification takes the following form:
1339         o CSI ']'      -- end of the current direction
1340         o CSI '0' ']'  -- end of the current direction
1341         o CSI '1' ']'  -- start of left-to-right text
1342         o CSI '2' ']'  -- start of right-to-left text
1343    The control character CSI (0x9B: control sequence introducer) is
1344    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1345
1346    Character composition specification takes the following form:
1347         o ESC '0' -- start relative composition
1348         o ESC '1' -- end composition
1349         o ESC '2' -- start rule-base composition (*)
1350         o ESC '3' -- start relative composition with alternate chars  (**)
1351         o ESC '4' -- start rule-base composition with alternate chars  (**)
1352   Since these are not standard escape sequences of any ISO standard,
1353   the use of them with these meanings is restricted to Emacs only.
1354
1355   (*) This form is used only in Emacs 20.5 and older versions,
1356   but the newer versions can safely decode it.
1357   (**) This form is used only in Emacs 21.1 and newer versions,
1358   and the older versions can't decode it.
1359
1360   Here's a list of example usages of these composition escape
1361   sequences (categorized by `enum composition_method').
1362
1363   COMPOSITION_RELATIVE:
1364         ESC 0 CHAR [ CHAR ] ESC 1
1365   COMPOSITION_WITH_RULE:
1366         ESC 2 CHAR [ RULE CHAR ] ESC 1
1367   COMPOSITION_WITH_ALTCHARS:
1368         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1369   COMPOSITION_WITH_RULE_ALTCHARS:
1370         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1371
1372 enum iso_code_class_type iso_code_class[256];
1373
1374 #define CHARSET_OK(idx, charset, c)                                     \
1375   (coding_system_table[idx]                                             \
1376    && (charset == CHARSET_ASCII                                         \
1377        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1378            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1379    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1380                                               charset)                  \
1381        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1382
1383 #define SHIFT_OUT_OK(idx) \
1384   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1385
1386 #define COMPOSITION_OK(idx)     \
1387   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1388
1389 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1390    Check if a text is encoded in ISO2022.  If it is, return an
1391    integer in which the appropriate flag bits, any of:
1392         CODING_CATEGORY_MASK_ISO_7
1393         CODING_CATEGORY_MASK_ISO_7_TIGHT
1394         CODING_CATEGORY_MASK_ISO_8_1
1395         CODING_CATEGORY_MASK_ISO_8_2
1396         CODING_CATEGORY_MASK_ISO_7_ELSE
1397         CODING_CATEGORY_MASK_ISO_8_ELSE
1398    are set.  If a code which should never appear in ISO2022 is found,
1399    returns 0.  */
1400
1401 static int
1402 detect_coding_iso2022 (src, src_end, multibytep)
1403      unsigned char *src, *src_end;
1404      int multibytep;
1405 {
1406   int mask = CODING_CATEGORY_MASK_ISO;
1407   int mask_found = 0;
1408   int reg[4], shift_out = 0, single_shifting = 0;
1409   int c, c1, charset;
1410   /* Dummy for ONE_MORE_BYTE.  */
1411   struct coding_system dummy_coding;
1412   struct coding_system *coding = &dummy_coding;
1413   Lisp_Object safe_chars;
1414
1415   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1416   while (mask && src < src_end)
1417     {
1418       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1419     retry:
1420       switch (c)
1421         {
1422         case ISO_CODE_ESC:
1423           if (inhibit_iso_escape_detection)
1424             break;
1425           single_shifting = 0;
1426           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1427           if (c >= '(' && c <= '/')
1428             {
1429               /* Designation sequence for a charset of dimension 1.  */
1430               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1431               if (c1 < ' ' || c1 >= 0x80
1432                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1433                 /* Invalid designation sequence.  Just ignore.  */
1434                 break;
1435               reg[(c - '(') % 4] = charset;
1436             }
1437           else if (c == '$')
1438             {
1439               /* Designation sequence for a charset of dimension 2.  */
1440               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1441               if (c >= '@' && c <= 'B')
1442                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1443                 reg[0] = charset = iso_charset_table[1][0][c];
1444               else if (c >= '(' && c <= '/')
1445                 {
1446                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1447                   if (c1 < ' ' || c1 >= 0x80
1448                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1449                     /* Invalid designation sequence.  Just ignore.  */
1450                     break;
1451                   reg[(c - '(') % 4] = charset;
1452                 }
1453               else
1454                 /* Invalid designation sequence.  Just ignore.  */
1455                 break;
1456             }
1457           else if (c == 'N' || c == 'O')
1458             {
1459               /* ESC <Fe> for SS2 or SS3.  */
1460               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1461               break;
1462             }
1463           else if (c >= '0' && c <= '4')
1464             {
1465               /* ESC <Fp> for start/end composition.  */
1466               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1467                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1468               else
1469                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1470               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1471                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1472               else
1473                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1474               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1475                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1476               else
1477                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1478               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1479                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1480               else
1481                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1482               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1483                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1484               else
1485                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1486               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1487                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1488               else
1489                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1490               break;
1491             }
1492           else
1493             /* Invalid escape sequence.  Just ignore.  */
1494             break;
1495
1496           /* We found a valid designation sequence for CHARSET.  */
1497           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1498           c = MAKE_CHAR (charset, 0, 0);
1499           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1500             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1501           else
1502             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1503           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1504             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1505           else
1506             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1507           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1508             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1509           else
1510             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1511           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1512             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1513           else
1514             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1515           break;
1516
1517         case ISO_CODE_SO:
1518           if (inhibit_iso_escape_detection)
1519             break;
1520           single_shifting = 0;
1521           if (shift_out == 0
1522               && (reg[1] >= 0
1523                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1524                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1525             {
1526               /* Locking shift out.  */
1527               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1528               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1529             }
1530           break;
1531
1532         case ISO_CODE_SI:
1533           if (inhibit_iso_escape_detection)
1534             break;
1535           single_shifting = 0;
1536           if (shift_out == 1)
1537             {
1538               /* Locking shift in.  */
1539               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1540               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1541             }
1542           break;
1543
1544         case ISO_CODE_CSI:
1545           single_shifting = 0;
1546         case ISO_CODE_SS2:
1547         case ISO_CODE_SS3:
1548           {
1549             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1550
1551             if (inhibit_iso_escape_detection)
1552               break;
1553             if (c != ISO_CODE_CSI)
1554               {
1555                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1556                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1557                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1558                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1559                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1560                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1561                 single_shifting = 1;
1562               }
1563             if (VECTORP (Vlatin_extra_code_table)
1564                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1565               {
1566                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1567                     & CODING_FLAG_ISO_LATIN_EXTRA)
1568                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1569                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1570                     & CODING_FLAG_ISO_LATIN_EXTRA)
1571                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1572               }
1573             mask &= newmask;
1574             mask_found |= newmask;
1575           }
1576           break;
1577
1578         default:
1579           if (c < 0x80)
1580             {
1581               single_shifting = 0;
1582               break;
1583             }
1584           else if (c < 0xA0)
1585             {
1586               single_shifting = 0;
1587               if (VECTORP (Vlatin_extra_code_table)
1588                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1589                 {
1590                   int newmask = 0;
1591
1592                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1593                       & CODING_FLAG_ISO_LATIN_EXTRA)
1594                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1595                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1596                       & CODING_FLAG_ISO_LATIN_EXTRA)
1597                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1598                   mask &= newmask;
1599                   mask_found |= newmask;
1600                 }
1601               else
1602                 return 0;
1603             }
1604           else
1605             {
1606               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1607                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1608               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1609               /* Check the length of succeeding codes of the range
1610                  0xA0..0FF.  If the byte length is odd, we exclude
1611                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1612                  when we are not single shifting.  */
1613               if (!single_shifting
1614                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1615                 {
1616                   int i = 1;
1617
1618                   c = -1;
1619                   while (src < src_end)
1620                     {
1621                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1622                       if (c < 0xA0)
1623                         break;
1624                       i++;
1625                     }
1626
1627                   if (i & 1 && src < src_end)
1628                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1629                   else
1630                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1631                   if (c >= 0)
1632                     /* This means that we have read one extra byte.  */
1633                     goto retry;
1634                 }
1635             }
1636           break;
1637         }
1638     }
1639  label_end_of_loop:
1640   return (mask & mask_found);
1641 }
1642
/* Produce the character of charset CHARSET whose 1st position code
   is C1 and whose 2nd position code is C2.  If the variable
   `translation_table' is nil, the character is built directly by
   MAKE_CHAR; otherwise return what translate_char yields for that
   table.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (! NILP (translation_table)                                   \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1652
/* Set designation state into CODING: designate to register REG the
   charset found in ISO_CHARSET_TABLE for DIMENSION, CHARS and
   FINAL_CHAR.

   Control jumps to `label_invalid_code' in these cases:
     - FINAL_CHAR is below '0' or not below 128;
     - the charset is unknown, or it is neither the one requested for
       REG nor covered by `safe_chars' (REG is then recorded as the
       last invalid designation register);
     - ASCII is designated to register 0 while the last invalid
       designation register is 0 (the record is then reset to -1).

   The macro uses `coding' and `safe_chars' from the expanding
   scope.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset, c;                                                        \
    int designation_ok;                                                    \
                                                                           \
    if (final_char < '0' || final_char >= 128)                             \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    c = MAKE_CHAR (charset, 0, 0);                                         \
    designation_ok                                                         \
      = (charset >= 0                                                      \
         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
             || CODING_SAFE_CHAR_P (safe_chars, c)));                      \
    if (! designation_ok)                                                  \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = reg;      \
        goto label_invalid_code;                                           \
      }                                                                    \
    if (coding->spec.iso2022.last_invalid_designation_register == 0        \
        && reg == 0                                                        \
        && charset == CHARSET_ASCII)                                       \
      {                                                                    \
        /* We should insert this designation sequence as is so             \
           that it is surely written back to a file.  */                   \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        goto label_invalid_code;                                           \
      }                                                                    \
    coding->spec.iso2022.last_invalid_designation_register = -1;           \
    /* With the direction mode bit set, prefer the reverse charset         \
       when there is one.  */                                              \
    if ((coding->mode & CODING_MODE_DIRECTION)                             \
        && CHARSET_REVERSE_CHARSET (charset) >= 0)                         \
      charset = CHARSET_REVERSE_CHARSET (charset);                         \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                   \
  } while (0)
1689
1690 /* Allocate a memory block for storing information about compositions.
1691    The block is chained to the already allocated blocks.  */
1692
1693 void
1694 coding_allocate_composition_data (coding, char_offset)
1695      struct coding_system *coding;
1696      int char_offset;
1697 {
1698   struct composition_data *cmp_data
1699     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1700
1701   cmp_data->char_offset = char_offset;
1702   cmp_data->used = 0;
1703   cmp_data->prev = coding->cmp_data;
1704   cmp_data->next = NULL;
1705   if (coding->cmp_data)
1706     coding->cmp_data->next = cmp_data;
1707   coding->cmp_data = cmp_data;
1708   coding->cmp_data_start = 0;
1709   coding->composing = COMPOSITION_NO;
1710 }
1711
/* Handle a composition start sequence; C1 is the byte after ESC:
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   The macro uses `coding' and `dst' from the expanding scope and may
   jump to `label_end_of_loop'.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition is disabled: emit the two bytes as is.  */          \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = c1 & 0x7f;                                                \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (COMPOSING_P (coding))                                         \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           one of the following two, the codes following the current       \
           escape sequence are actual characters stored in a buffer.  */   \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        /* Pick the composition method from the byte after ESC.  */        \
        if (c1 == '0')                                                     \
          coding->composing = COMPOSITION_RELATIVE;                        \
        else if (c1 == '2')                                                \
          coding->composing = COMPOSITION_WITH_RULE;                       \
        else if (c1 == '3')                                                \
          coding->composing = COMPOSITION_WITH_ALTCHARS;                   \
        else                                                               \
          coding->composing = COMPOSITION_WITH_RULE_ALTCHARS;              \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
  } while (0)
1764
/* Handle composition end sequence ESC 1.  While a composition is in
   progress, record its end at the current produced_char and leave
   composing mode.  Otherwise emit ESC and C1 to `dst' as is and count
   them as two produced characters.  The macro uses `coding' and
   `dst' from the expanding scope.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1;                                                    \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1781
/* Decode a composition rule from the byte C1 (and maybe one more byte
   from SRC, read into `c2') and store one encoded composition rule in
   coding->cmp_data.  C1 is modified: 32 is subtracted from it.  A
   value of C1 - 32 that is 93 or more stores the rule 0.  The macro
   uses `coding' and `c2' from the expanding scope.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
                                                                        \
    (c1) -= 32;                                                         \
    if (c1 >= 81)                                                       \
      {                                                                 \
        if (c1 < 93)            /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);          \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int quot = (c1) / 9;                                            \
        int rem = (c1) % 9;                                             \
                                                                        \
        /* In the old format the value 4 stands for 10.  */             \
        if (quot == 4)                                                  \
          quot = 10;                                                    \
        if (rem == 4)                                                   \
          rem = 10;                                                     \
        rule = COMPOSITION_ENCODE_RULE (quot, rem);                     \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1806
1807
1808 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1809
1810 static void
1811 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1812      struct coding_system *coding;
1813      unsigned char *source, *destination;
1814      int src_bytes, dst_bytes;
1815 {
1816   unsigned char *src = source;
1817   unsigned char *src_end = source + src_bytes;
1818   unsigned char *dst = destination;
1819   unsigned char *dst_end = destination + dst_bytes;
1820   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1821   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1822   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1823   /* SRC_BASE remembers the start position in source in each loop.
1824      The loop will be exited when there's not enough source code
1825      (within macro ONE_MORE_BYTE), or when there's not enough
1826      destination area to produce a character (within macro
1827      EMIT_CHAR).  */
1828   unsigned char *src_base;
1829   int c, charset;
1830   Lisp_Object translation_table;
1831   Lisp_Object safe_chars;
1832
1833   safe_chars = coding_safe_chars (coding->symbol);
1834
1835   if (NILP (Venable_character_translation))
1836     translation_table = Qnil;
1837   else
1838     {
1839       translation_table = coding->translation_table_for_decode;
1840       if (NILP (translation_table))
1841         translation_table = Vstandard_translation_table_for_decode;
1842     }
1843
1844   coding->result = CODING_FINISH_NORMAL;
1845
1846   while (1)
1847     {
1848       int c1, c2 = 0;
1849
1850       src_base = src;
1851       ONE_MORE_BYTE (c1);
1852
1853       /* We produce no character or one character.  */
1854       switch (iso_code_class [c1])
1855         {
1856         case ISO_0x20_or_0x7F:
1857           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1858             {
1859               DECODE_COMPOSITION_RULE (c1);
1860               continue;
1861             }
1862           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1863             {
1864               /* This is SPACE or DEL.  */
1865               charset = CHARSET_ASCII;
1866               break;
1867             }
1868           /* This is a graphic character, we fall down ...  */
1869
1870         case ISO_graphic_plane_0:
1871           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1872             {
1873               DECODE_COMPOSITION_RULE (c1);
1874               continue;
1875             }
1876           charset = charset0;
1877           break;
1878
1879         case ISO_0xA0_or_0xFF:
1880           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1881               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1882             goto label_invalid_code;
1883           /* This is a graphic character, we fall down ... */
1884
1885         case ISO_graphic_plane_1:
1886           if (charset1 < 0)
1887             goto label_invalid_code;
1888           charset = charset1;
1889           break;
1890
1891         case ISO_control_0:
1892           if (COMPOSING_P (coding))
1893             DECODE_COMPOSITION_END ('1');
1894
1895           /* All ISO2022 control characters in this class have the
1896              same representation in Emacs internal format.  */
1897           if (c1 == '\n'
1898               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1899               && (coding->eol_type == CODING_EOL_CR
1900                   || coding->eol_type == CODING_EOL_CRLF))
1901             {
1902               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1903               goto label_end_of_loop;
1904             }
1905           charset = CHARSET_ASCII;
1906           break;
1907
1908         case ISO_control_1:
1909           if (COMPOSING_P (coding))
1910             DECODE_COMPOSITION_END ('1');
1911           goto label_invalid_code;
1912
1913         case ISO_carriage_return:
1914           if (COMPOSING_P (coding))
1915             DECODE_COMPOSITION_END ('1');
1916
1917           if (coding->eol_type == CODING_EOL_CR)
1918             c1 = '\n';
1919           else if (coding->eol_type == CODING_EOL_CRLF)
1920             {
1921               ONE_MORE_BYTE (c1);
1922               if (c1 != ISO_CODE_LF)
1923                 {
1924                   src--;
1925                   c1 = '\r';
1926                 }
1927             }
1928           charset = CHARSET_ASCII;
1929           break;
1930
1931         case ISO_shift_out:
1932           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1933               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1934             goto label_invalid_code;
1935           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1936           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1937           continue;
1938
1939         case ISO_shift_in:
1940           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1941             goto label_invalid_code;
1942           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1943           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1944           continue;
1945
1946         case ISO_single_shift_2_7:
1947         case ISO_single_shift_2:
1948           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1949             goto label_invalid_code;
1950           /* SS2 is handled as an escape sequence of ESC 'N' */
1951           c1 = 'N';
1952           goto label_escape_sequence;
1953
1954         case ISO_single_shift_3:
1955           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1956             goto label_invalid_code;
1957           /* SS2 is handled as an escape sequence of ESC 'O' */
1958           c1 = 'O';
1959           goto label_escape_sequence;
1960
1961         case ISO_control_sequence_introducer:
1962           /* CSI is handled as an escape sequence of ESC '[' ...  */
1963           c1 = '[';
1964           goto label_escape_sequence;
1965
1966         case ISO_escape:
1967           ONE_MORE_BYTE (c1);
1968         label_escape_sequence:
1969           /* Escape sequences handled by Emacs are invocation,
1970              designation, direction specification, and character
1971              composition specification.  */
1972           switch (c1)
1973             {
1974             case '&':           /* revision of following character set */
1975               ONE_MORE_BYTE (c1);
1976               if (!(c1 >= '@' && c1 <= '~'))
1977                 goto label_invalid_code;
1978               ONE_MORE_BYTE (c1);
1979               if (c1 != ISO_CODE_ESC)
1980                 goto label_invalid_code;
1981               ONE_MORE_BYTE (c1);
1982               goto label_escape_sequence;
1983
1984             case '$':           /* designation of 2-byte character set */
1985               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1986                 goto label_invalid_code;
1987               ONE_MORE_BYTE (c1);
1988               if (c1 >= '@' && c1 <= 'B')
1989                 {       /* designation of JISX0208.1978, GB2312.1980,
1990                            or JISX0208.1980 */
1991                   DECODE_DESIGNATION (0, 2, 94, c1);
1992                 }
1993               else if (c1 >= 0x28 && c1 <= 0x2B)
1994                 {       /* designation of DIMENSION2_CHARS94 character set */
1995                   ONE_MORE_BYTE (c2);
1996                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1997                 }
1998               else if (c1 >= 0x2C && c1 <= 0x2F)
1999                 {       /* designation of DIMENSION2_CHARS96 character set */
2000                   ONE_MORE_BYTE (c2);
2001                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2002                 }
2003               else
2004                 goto label_invalid_code;
2005               /* We must update these variables now.  */
2006               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2007               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2008               continue;
2009
2010             case 'n':           /* invocation of locking-shift-2 */
2011               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2012                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2013                 goto label_invalid_code;
2014               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2015               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2016               continue;
2017
2018             case 'o':           /* invocation of locking-shift-3 */
2019               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2020                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2021                 goto label_invalid_code;
2022               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2023               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2024               continue;
2025
2026             case 'N':           /* invocation of single-shift-2 */
2027               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2028                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2029                 goto label_invalid_code;
2030               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2031               ONE_MORE_BYTE (c1);
2032               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2033                 goto label_invalid_code;
2034               break;
2035
2036             case 'O':           /* invocation of single-shift-3 */
2037               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2038                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2039                 goto label_invalid_code;
2040               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2041               ONE_MORE_BYTE (c1);
2042               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2043                 goto label_invalid_code;
2044               break;
2045
2046             case '0': case '2': case '3': case '4': /* start composition */
2047               DECODE_COMPOSITION_START (c1);
2048               continue;
2049
2050             case '1':           /* end composition */
2051               DECODE_COMPOSITION_END (c1);
2052               continue;
2053
2054             case '[':           /* specification of direction */
2055               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2056                 goto label_invalid_code;
2057               /* For the moment, nested direction is not supported.
2058                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2059                  left-to-right, and nonzero means right-to-left.  */
2060               ONE_MORE_BYTE (c1);
2061               switch (c1)
2062                 {
2063                 case ']':       /* end of the current direction */
2064                   coding->mode &= ~CODING_MODE_DIRECTION;
2065
2066                 case '0':       /* end of the current direction */
2067                 case '1':       /* start of left-to-right direction */
2068                   ONE_MORE_BYTE (c1);
2069                   if (c1 == ']')
2070                     coding->mode &= ~CODING_MODE_DIRECTION;
2071                   else
2072                     goto label_invalid_code;
2073                   break;
2074
2075                 case '2':       /* start of right-to-left direction */
2076                   ONE_MORE_BYTE (c1);
2077                   if (c1 == ']')
2078                     coding->mode |= CODING_MODE_DIRECTION;
2079                   else
2080                     goto label_invalid_code;
2081                   break;
2082
2083                 default:
2084                   goto label_invalid_code;
2085                 }
2086               continue;
2087
2088             case '%':
2089               if (COMPOSING_P (coding))
2090                 DECODE_COMPOSITION_END ('1');
2091               ONE_MORE_BYTE (c1);
2092               if (c1 == '/')
2093                 {
2094                   /* CTEXT extended segment:
2095                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2096                      We keep these bytes as is for the moment.
2097                      They may be decoded by post-read-conversion.  */
2098                   int dim, M, L;
2099                   int size, required;
2100                   int produced_chars;
2101
2102                   ONE_MORE_BYTE (dim);
2103                   ONE_MORE_BYTE (M);
2104                   ONE_MORE_BYTE (L);
2105                   size = ((M - 128) * 128) + (L - 128);
2106                   required = 8 + size * 2;
2107                   if (dst + required > (dst_bytes ? dst_end : src))
2108                     goto label_end_of_loop;
2109                   *dst++ = ISO_CODE_ESC;
2110                   *dst++ = '%';
2111                   *dst++ = '/';
2112                   *dst++ = dim;
2113                   produced_chars = 4;
2114                   dst += CHAR_STRING (M, dst), produced_chars++;
2115                   dst += CHAR_STRING (L, dst), produced_chars++;
2116                   while (size-- > 0)
2117                     {
2118                       ONE_MORE_BYTE (c1);
2119                       dst += CHAR_STRING (c1, dst), produced_chars++;
2120                     }
2121                   coding->produced_char += produced_chars;
2122                 }
2123               else if (c1 == 'G')
2124                 {
2125                   unsigned char *d = dst;
2126                   int produced_chars;
2127
2128                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2129                      ESC % G --UTF-8-BYTES-- ESC % @
2130                      We keep these bytes as is for the moment.
2131                      They may be decoded by post-read-conversion.  */
2132                   if (d + 6 > (dst_bytes ? dst_end : src))
2133                     goto label_end_of_loop;
2134                   *d++ = ISO_CODE_ESC;
2135                   *d++ = '%';
2136                   *d++ = 'G';
2137                   produced_chars = 3;
2138                   while (d + 1 < (dst_bytes ? dst_end : src))
2139                     {
2140                       ONE_MORE_BYTE (c1);
2141                       if (c1 == ISO_CODE_ESC
2142                           && src + 1 < src_end
2143                           && src[0] == '%'
2144                           && src[1] == '@')
2145                         {
2146                           src += 2;
2147                           break;
2148                         }
2149                       d += CHAR_STRING (c1, d), produced_chars++;
2150                     }
2151                   if (d + 3 > (dst_bytes ? dst_end : src))
2152                     goto label_end_of_loop;
2153                   *d++ = ISO_CODE_ESC;
2154                   *d++ = '%';
2155                   *d++ = '@';
2156                   dst = d;
2157                   coding->produced_char += produced_chars + 3;
2158                 }
2159               else
2160                 goto label_invalid_code;
2161               continue;
2162
2163             default:
2164               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2165                 goto label_invalid_code;
2166               if (c1 >= 0x28 && c1 <= 0x2B)
2167                 {       /* designation of DIMENSION1_CHARS94 character set */
2168                   ONE_MORE_BYTE (c2);
2169                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2170                 }
2171               else if (c1 >= 0x2C && c1 <= 0x2F)
2172                 {       /* designation of DIMENSION1_CHARS96 character set */
2173                   ONE_MORE_BYTE (c2);
2174                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2175                 }
2176               else
2177                 goto label_invalid_code;
2178               /* We must update these variables now.  */
2179               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2180               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2181               continue;
2182             }
2183         }
2184
2185       /* Now we know CHARSET and 1st position code C1 of a character.
2186          Produce a multibyte sequence for that character while getting
2187          2nd position code C2 if necessary.  */
2188       if (CHARSET_DIMENSION (charset) == 2)
2189         {
2190           ONE_MORE_BYTE (c2);
2191           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2192             /* C2 is not in a valid range.  */
2193             goto label_invalid_code;
2194         }
2195       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2196       EMIT_CHAR (c);
2197       continue;
2198
2199     label_invalid_code:
2200       coding->errors++;
2201       if (COMPOSING_P (coding))
2202         DECODE_COMPOSITION_END ('1');
2203       src = src_base;
2204       c = *src++;
2205       EMIT_CHAR (c);
2206     }
2207
2208  label_end_of_loop:
2209   coding->consumed = coding->consumed_char = src_base - source;
2210   coding->produced = dst - destination;
2211   return;
2212 }
2213
2214
2215 /* ISO2022 encoding stuff.  */
2216
2217 /*
2218    It is not enough to say just "ISO2022" on encoding, we have to
2219    specify more details.  In Emacs, each ISO2022 coding system
2220    variant has the following specifications:
2221         1. Initial designation to G0 through G3.
2222         2. Allows short-form designation?
2223         3. ASCII should be designated to G0 before control characters?
2224         4. ASCII should be designated to G0 at end of line?
2225         5. 7-bit environment or 8-bit environment?
2226         6. Use locking-shift?
2227         7. Use Single-shift?
2228    And the following two are only for Japanese:
2229         8. Use ASCII in place of JIS0201-1976-Roman?
2230         9. Use JISX0208-1983 in place of JISX0208-1978?
2231    These specifications are encoded in `coding->flags' as flag bits
2232    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2233    details.
2234 */
2235
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the new designation
   in CODING.  When CODING holds a revision number below 255 for
   CHARSET, the sequence ESC & <'@' + revision> is emitted first.
   The intermediate character is left out (short form) only for a
   94-character set of dimension other than 1 that goes to register 0
   and whose <final-char> is '@', 'A', or 'B', and only when CODING
   has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    /* Intermediate characters matching the size of CHARSET, one per    \
       graphic register.  */                                            \
    char *intermediate_chars                                            \
      = (CHARSET_CHARS (charset) == 94                                  \
         ? intermediate_char_94 : intermediate_char_96);                \
    int two_dimensional = CHARSET_DIMENSION (charset) != 1;             \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);   \
                                                                        \
    if (revision < 255)                                                 \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + revision;                                        \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (two_dimensional)                                                \
      *dst++ = '$';                                                     \
    /* The intermediate character is dropped only when every            \
       short-form condition holds.  */                                  \
    if (! two_dimensional                                               \
        || CHARSET_CHARS (charset) != 94                                \
        || ! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)             \
        || (reg) != 0                                                   \
        || final_char < '@' || final_char > 'B')                        \
      *dst++ = (unsigned char) (intermediate_chars[reg]);               \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2278
/* The two macros below emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3) at DST: the escape sequences
   ESC N and ESC O when CODING has CODING_FLAG_ISO_SEVEN_BITS set,
   the control codes ISO_CODE_SS2 and ISO_CODE_SS3 otherwise.  Both
   also raise the single-shifting flag of CODING.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
  } while (0)
2300
/* The four macros below emit the ISO2022 locking-shift functions at
   DST: shift-in (ISO_CODE_SI), shift-out (ISO_CODE_SO),
   locking-shift-2 (ESC n), and locking-shift-3 (ESC o).  Each one
   records in CODING which graphic register (0, 1, 2, or 3
   respectively) is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'n';                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'o';                               \
  } while (0)
2328
/* Emit at DST the one-byte code of a DIMENSION1 character whose
   character set is CHARSET and whose position-code is C1.  When
   CHARSET is reachable neither through a pending single shift nor
   on graphic plane 0 or 1, the necessary designation and invocation
   sequences are emitted first and the check is repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single shift is pending; this character uses it up.  */    \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          *dst++ = c1 | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* CHARSET is on graphic plane 0: high bit clear.  */           \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* CHARSET is on graphic plane 1: high bit set.  */             \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to a graphic register first if needed), then     \
       go round the loop again to emit the character itself.  */        \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2361
/* Emit at DST the two-byte code of a DIMENSION2 character whose
   character set is CHARSET and whose position-codes are C1 and C2.
   When CHARSET is reachable neither through a pending single shift
   nor on graphic plane 0 or 1, the necessary designation and
   invocation sequences are emitted first and the check is
   repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single shift is pending; this character uses it up.  */    \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* CHARSET is on graphic plane 0: high bits clear.  */          \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* CHARSET is on graphic plane 1: high bits set.  */            \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to a graphic register first if needed), then     \
       go round the loop again to emit the character itself.  */        \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2394
/* Produce codes for character C at DST.  C is split into its
   character set and position codes.  If that character set is not
   defined, the position codes are copied to DST as they are (the
   second one only when it is non-negative).  Otherwise the
   character goes through the DIMENSION1 or DIMENSION2 macro, with
   ASCII replaced by charset_latin_jisx0201 when CODING has
   CODING_FLAG_ISO_USE_ROMAN set, and charset_jisx0208 replaced by
   charset_jisx0208_1978 when it has CODING_FLAG_ISO_USE_OLDJIS
   set.  */

#define ENCODE_ISO_CHARACTER(c)                                 \
  do {                                                          \
    int charset, c1, c2;                                        \
                                                                \
    SPLIT_CHAR (c, charset, c1, c2);                            \
    if (! CHARSET_DEFINED_P (charset))                          \
      {                                                         \
        *dst++ = c1;                                            \
        if (c2 >= 0)                                            \
          *dst++ = c2;                                          \
      }                                                         \
    else if (CHARSET_DIMENSION (charset) != 1)                  \
      {                                                         \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)        \
            && charset == charset_jisx0208)                     \
          charset = charset_jisx0208_1978;                      \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);      \
      }                                                         \
    else                                                        \
      {                                                         \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)         \
            && charset == CHARSET_ASCII)                        \
          charset = charset_latin_jisx0201;                     \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);          \
      }                                                         \
  } while (0)
2424
2425
/* Instead of encoding character C, encode
   CODING_REPLACEMENT_CHARACTER in its place: twice when the width of
   C's character set is greater than 1, once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int replacements = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;    \
                                                                        \
    while (replacements-- > 0)                                          \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2434
2435
2436 /* Produce designation and invocation codes at a place pointed by DST
2437    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2438    Return new DST.  */
2439
2440 unsigned char *
2441 encode_invocation_designation (charset, coding, dst)
2442      int charset;
2443      struct coding_system *coding;
2444      unsigned char *dst;
2445 {
2446   int reg;                      /* graphic register number */
2447
2448   /* At first, check designations.  */
2449   for (reg = 0; reg < 4; reg++)
2450     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2451       break;
2452
2453   if (reg >= 4)
2454     {
2455       /* CHARSET is not yet designated to any graphic registers.  */
2456       /* At first check the requested designation.  */
2457       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2458       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2459         /* Since CHARSET requests no special designation, designate it
2460            to graphic register 0.  */
2461         reg = 0;
2462
2463       ENCODE_DESIGNATION (charset, reg, coding);
2464     }
2465
2466   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2467       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2468     {
2469       /* Since the graphic register REG is not invoked to any graphic
2470          planes, invoke it to graphic plane 0.  */
2471       switch (reg)
2472         {
2473         case 0:                 /* graphic register 0 */
2474           ENCODE_SHIFT_IN;
2475           break;
2476
2477         case 1:                 /* graphic register 1 */
2478           ENCODE_SHIFT_OUT;
2479           break;
2480
2481         case 2:                 /* graphic register 2 */
2482           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2483             ENCODE_SINGLE_SHIFT_2;
2484           else
2485             ENCODE_LOCKING_SHIFT_2;
2486           break;
2487
2488         case 3:                 /* graphic register 3 */
2489           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2490             ENCODE_SINGLE_SHIFT_3;
2491           else
2492             ENCODE_LOCKING_SHIFT_3;
2493           break;
2494         }
2495     }
2496
2497   return dst;
2498 }
2499
/* Emit at DST the two bytes that encode composition rule RULE.  RULE
   is decoded into two reference values by COMPOSITION_DECODE_RULE;
   the first is written offset by 32 + 81, the second by 32.  */

#define ENCODE_COMPOSITION_RULE(rule)           \
  do {                                          \
    int gref, nref;                             \
                                                \
    COMPOSITION_DECODE_RULE (rule, gref, nref); \
    dst[0] = 32 + 81 + gref;                    \
    dst[1] = 32 + nref;                         \
    dst += 2;                                   \
  } while (0)
2509
/* Emit at DST the sequence that starts a composition: ESC 0 for
   COMPOSITION_RELATIVE, ESC 3 for COMPOSITION_WITH_ALTCHARS, and
   ESC 4 for any other method.  DATA points to an array of integers
   describing the composition (see the comment in coding.h for its
   format); DATA[3] is stored as the composition method of CODING.
   For the ESC 3 and ESC 4 cases, CODING is also set up to walk the
   components that start at offset 4 of the current entry.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    coding->composing = data[3];                                        \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2529
/* Emit ESC 1 at DST to end the current composition.  CODING leaves
   the composing state and its composition-data cursor is advanced by
   DATA[0]; when that cursor reaches the end of the current
   composition-data block and another block follows, CODING moves on
   to the start of the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    if (coding->cmp_data_start == coding->cmp_data->used        \
        && coding->cmp_data->next)                              \
      {                                                         \
        /* This block is used up; continue with the next.  */   \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
2545
/* Emit the composition start sequence ESC 0 at DST and switch CODING
   to COMPOSITION_RELATIVE.  This does not begin a new composition:
   it marks the point where the components (alternate chars and
   composition rules) of the current composition have all been
   produced and the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
2557
/* The following three macros produce codes for indicating direction
   of text.  Each one stores bytes at DST, advances DST, and expects
   the variables `coding' and `dst' to be in scope where it is used.
   All three are statement macros, to be followed by a semicolon.  */

/* Produce a control sequence introducer: the two bytes ESC [ when
   CODING is limited to seven bits, the single byte ISO_CODE_CSI
   otherwise.  The seven-bit flag is tested with `&', as every other
   flag test in this file does; comparing with `==' would fail
   whenever any other flag bit is set as well.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Produce the introducer followed by `2 ]'.  The ISO2022 decoder in
   this file takes that sequence as the start of right-to-left
   direction.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

/* Produce the introducer followed by `0 ]'.  The ISO2022 decoder in
   this file takes that sequence as the end of the current direction,
   i.e. a return to left-to-right.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2573
/* Emit at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: a
   shift-in when graphic plane 0 does not hold register 0, then a
   designation for each register whose initial charset is specified
   (non-negative) and differs from the one currently designated.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reg < 4)                                                         \
      {                                                                     \
        int initial = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);    \
                                                                            \
        if (initial >= 0                                                    \
            && CODING_SPEC_ISO_DESIGNATION (coding, reg) != initial)        \
          ENCODE_DESIGNATION (initial, reg, coding);                        \
        reg++;                                                              \
      }                                                                     \
  } while (0)
2588
2589 /* Produce designation sequences of charsets in the line started from
2590    SRC to a place pointed by DST, and return updated DST.
2591
2592    If the current block ends before any end-of-line, we may fail to
2593    find all the necessary designations.  */
2594
2595 static unsigned char *
2596 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2597      struct coding_system *coding;
2598      Lisp_Object translation_table;
2599      unsigned char *src, *src_end, *dst;
2600 {
2601   int charset, c, found = 0, reg;
2602   /* Table of charsets to be designated to each graphic register.  */
2603   int r[4];
2604
2605   for (reg = 0; reg < 4; reg++)
2606     r[reg] = -1;
2607
2608   while (found < 4)
2609     {
2610       ONE_MORE_CHAR (c);
2611       if (c == '\n')
2612         break;
2613
2614       charset = CHAR_CHARSET (c);
2615       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2616       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2617         {
2618           found++;
2619           r[reg] = charset;
2620         }
2621     }
2622
2623  label_end_of_loop:
2624   if (found)
2625     {
2626       for (reg = 0; reg < 4; reg++)
2627         if (r[reg] >= 0
2628             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2629           ENCODE_DESIGNATION (r[reg], reg, coding);
2630     }
2631
2632   return dst;
2633 }
2634
2635 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2636
2637 static void
2638 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2639      struct coding_system *coding;
2640      unsigned char *source, *destination;
2641      int src_bytes, dst_bytes;
2642 {
2643   unsigned char *src = source;
2644   unsigned char *src_end = source + src_bytes;
2645   unsigned char *dst = destination;
2646   unsigned char *dst_end = destination + dst_bytes;
2647   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2648      from DST_END to assure overflow checking is necessary only at the
2649      head of loop.  */
2650   unsigned char *adjusted_dst_end = dst_end - 19;
2651   /* SRC_BASE remembers the start position in source in each loop.
2652      The loop will be exited when there's not enough source text to
2653      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2654      there's not enough destination area to produce encoded codes
2655      (within macro EMIT_BYTES).  */
2656   unsigned char *src_base;
2657   int c;
2658   Lisp_Object translation_table;
2659   Lisp_Object safe_chars;
2660
2661   if (coding->flags & CODING_FLAG_ISO_SAFE)
2662     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2663
2664   safe_chars = coding_safe_chars (coding->symbol);
2665
2666   if (NILP (Venable_character_translation))
2667     translation_table = Qnil;
2668   else
2669     {
2670       translation_table = coding->translation_table_for_encode;
2671       if (NILP (translation_table))
2672         translation_table = Vstandard_translation_table_for_encode;
2673     }
2674
2675   coding->consumed_char = 0;
2676   coding->errors = 0;
2677   while (1)
2678     {
2679       src_base = src;
2680
2681       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2682         {
2683           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2684           break;
2685         }
2686
2687       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2688           && CODING_SPEC_ISO_BOL (coding))
2689         {
2690           /* We have to produce designation sequences if any now.  */
2691           dst = encode_designation_at_bol (coding, translation_table,
2692                                            src, src_end, dst);
2693           CODING_SPEC_ISO_BOL (coding) = 0;
2694         }
2695
2696       /* Check composition start and end.  */
2697       if (coding->composing != COMPOSITION_DISABLED
2698           && coding->cmp_data_start < coding->cmp_data->used)
2699         {
2700           struct composition_data *cmp_data = coding->cmp_data;
2701           int *data = cmp_data->data + coding->cmp_data_start;
2702           int this_pos = cmp_data->char_offset + coding->consumed_char;
2703
2704           if (coding->composing == COMPOSITION_RELATIVE)
2705             {
2706               if (this_pos == data[2])
2707                 {
2708                   ENCODE_COMPOSITION_END (coding, data);
2709                   cmp_data = coding->cmp_data;
2710                   data = cmp_data->data + coding->cmp_data_start;
2711                 }
2712             }
2713           else if (COMPOSING_P (coding))
2714             {
2715               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2716               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2717                 /* We have consumed components of the composition.
2718                    What follows in SRC is the composition's base
2719                    text.  */
2720                 ENCODE_COMPOSITION_FAKE_START (coding);
2721               else
2722                 {
2723                   int c = cmp_data->data[coding->cmp_data_index++];
2724                   if (coding->composition_rule_follows)
2725                     {
2726                       ENCODE_COMPOSITION_RULE (c);
2727                       coding->composition_rule_follows = 0;
2728                     }
2729                   else
2730                     {
2731                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2732                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2733                         ENCODE_UNSAFE_CHARACTER (c);
2734                       else
2735                         ENCODE_ISO_CHARACTER (c);
2736                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2737                         coding->composition_rule_follows = 1;
2738                     }
2739                   continue;
2740                 }
2741             }
2742           if (!COMPOSING_P (coding))
2743             {
2744               if (this_pos == data[1])
2745                 {
2746                   ENCODE_COMPOSITION_START (coding, data);
2747                   continue;
2748                 }
2749             }
2750         }
2751
2752       ONE_MORE_CHAR (c);
2753
2754       /* Now encode the character C.  */
2755       if (c < 0x20 || c == 0x7F)
2756         {
2757           if (c == '\r')
2758             {
2759               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2760                 {
2761                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2762                     ENCODE_RESET_PLANE_AND_REGISTER;
2763                   *dst++ = c;
2764                   continue;
2765                 }
2766               /* fall down to treat '\r' as '\n' ...  */
2767               c = '\n';
2768             }
2769           if (c == '\n')
2770             {
2771               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2772                 ENCODE_RESET_PLANE_AND_REGISTER;
2773               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2774                 bcopy (coding->spec.iso2022.initial_designation,
2775                        coding->spec.iso2022.current_designation,
2776                        sizeof coding->spec.iso2022.initial_designation);
2777               if (coding->eol_type == CODING_EOL_LF
2778                   || coding->eol_type == CODING_EOL_UNDECIDED)
2779                 *dst++ = ISO_CODE_LF;
2780               else if (coding->eol_type == CODING_EOL_CRLF)
2781                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2782               else
2783                 *dst++ = ISO_CODE_CR;
2784               CODING_SPEC_ISO_BOL (coding) = 1;
2785             }
2786           else
2787             {
2788               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2789                 ENCODE_RESET_PLANE_AND_REGISTER;
2790               *dst++ = c;
2791             }
2792         }
2793       else if (ASCII_BYTE_P (c))
2794         ENCODE_ISO_CHARACTER (c);
2795       else if (SINGLE_BYTE_CHAR_P (c))
2796         {
2797           *dst++ = c;
2798           coding->errors++;
2799         }
2800       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2801                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2802         ENCODE_UNSAFE_CHARACTER (c);
2803       else
2804         ENCODE_ISO_CHARACTER (c);
2805
2806       coding->consumed_char++;
2807     }
2808
2809  label_end_of_loop:
2810   coding->consumed = src_base - source;
2811   coding->produced = coding->produced_char = dst - destination;
2812 }
2813
2814 \f
2815 /*** 4. SJIS and BIG5 handlers ***/
2816
2817 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2818    quite widely.  So, for the moment, Emacs supports them in the bare
2819    C code.  But, in the future, they may be supported only by CCL.  */
2820
2821 /* SJIS is a coding system encoding three character sets: ASCII, right
2822    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2823    as is.  A character of charset katakana-jisx0201 is encoded by
2824    "position-code + 0x80".  A character of charset japanese-jisx0208
2825    is encoded in 2-byte but two position-codes are divided and shifted
2826    so that it fits in the range below.
2827
2828    --- CODE RANGE of SJIS ---
2829    (character set)      (range)
2830    ASCII                0x00 .. 0x7F
2831    KATAKANA-JISX0201    0xA1 .. 0xDF
2832    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2833             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2834    -------------------------------
2835
2836 */
2837
2838 /* BIG5 is a coding system encoding two character sets: ASCII and
2839    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2840    character set and is encoded in two bytes.
2841
2842    --- CODE RANGE of BIG5 ---
2843    (character set)      (range)
2844    ASCII                0x00 .. 0x7F
2845    Big5 (1st byte)      0xA1 .. 0xFE
2846         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2847    --------------------------
2848
2849    Since the number of characters in Big5 is larger than maximum
2850    characters in Emacs' charset (96x96), it can't be handled as one
2851    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2852    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2853    contains frequently used characters and the latter contains less
2854    frequently used characters.  */
2855
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansions, so callers may
   pass arbitrary expressions (e.g. `code >> 8' and `code & 0xFF') as
   inputs.  Output arguments must be lvalues.  Outputs are assigned
   only after the inputs they could alias have been read, so the same
   variables may be given as both input and output (e.g.
   `DECODE_BIG5 (c1, c2, charset, c1, c2)').  Arguments are evaluated
   more than once; they must not have side effects.  */

/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes 0x40..0x7E (63 codes) or 0xA1..0xFE (94 codes).  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 to CHARSET and the position-codes
   C1, C2.  TEMP is the linear index of the character counted from
   code 0xA140; indices at or above row 0xC9 belong to
   `charset_big5_2' and are rebased to start from 0.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position-codes C1,
   C2 to the BIG5 byte pair B1, B2.  A column index below 0x3F maps to
   the 2nd-byte range starting at 0x40, the rest to the range starting
   at 0xA1 (0x3F + 0x62).  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2888
2889 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2890    Check if a text is encoded in SJIS.  If it is, return
2891    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2892
2893 static int
2894 detect_coding_sjis (src, src_end, multibytep)
2895      unsigned char *src, *src_end;
2896      int multibytep;
2897 {
2898   int c;
2899   /* Dummy for ONE_MORE_BYTE.  */
2900   struct coding_system dummy_coding;
2901   struct coding_system *coding = &dummy_coding;
2902
2903   while (1)
2904     {
2905       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2906       if (c < 0x80)
2907         continue;
2908       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2909         return 0;
2910       if (c <= 0x9F || c >= 0xE0)
2911         {
2912           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2913           if (c < 0x40 || c == 0x7F || c > 0xFC)
2914             return 0;
2915         }
2916     }
2917  label_end_of_loop:
2918   return CODING_CATEGORY_MASK_SJIS;
2919 }
2920
2921 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2922    Check if a text is encoded in BIG5.  If it is, return
2923    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2924
2925 static int
2926 detect_coding_big5 (src, src_end, multibytep)
2927      unsigned char *src, *src_end;
2928      int multibytep;
2929 {
2930   int c;
2931   /* Dummy for ONE_MORE_BYTE.  */
2932   struct coding_system dummy_coding;
2933   struct coding_system *coding = &dummy_coding;
2934
2935   while (1)
2936     {
2937       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2938       if (c < 0x80)
2939         continue;
2940       if (c < 0xA1 || c > 0xFE)
2941         return 0;
2942       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2943       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2944         return 0;
2945     }
2946  label_end_of_loop:
2947   return CODING_CATEGORY_MASK_BIG5;
2948 }
2949
2950 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2951    Check if a text is encoded in UTF-8.  If it is, return
2952    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2953
/* Classify one octet C of a UTF-8 sequence by its high-order bits:

     0xxxxxxx  complete one-octet character
     10xxxxxx  extra (non-leading) octet of a multi-octet sequence
     110xxxxx  leading octet of a 2-octet sequence
     1110xxxx  leading octet of a 3-octet sequence
     11110xxx  leading octet of a 4-octet sequence
     111110xx  leading octet of a 5-octet sequence
     1111110x  leading octet of a 6-octet sequence  */
#define UTF_8_1_OCTET_P(c)         (0x80 > (c))
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == (0xC0 & (c)))
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == (0xE0 & (c)))
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == (0xF0 & (c)))
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == (0xF8 & (c)))
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == (0xFC & (c)))
#define UTF_8_6_OCTET_LEADING_P(c) (0xFC == (0xFE & (c)))
2961
2962 static int
2963 detect_coding_utf_8 (src, src_end, multibytep)
2964      unsigned char *src, *src_end;
2965      int multibytep;
2966 {
2967   unsigned char c;
2968   int seq_maybe_bytes;
2969   /* Dummy for ONE_MORE_BYTE.  */
2970   struct coding_system dummy_coding;
2971   struct coding_system *coding = &dummy_coding;
2972
2973   while (1)
2974     {
2975       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2976       if (UTF_8_1_OCTET_P (c))
2977         continue;
2978       else if (UTF_8_2_OCTET_LEADING_P (c))
2979         seq_maybe_bytes = 1;
2980       else if (UTF_8_3_OCTET_LEADING_P (c))
2981         seq_maybe_bytes = 2;
2982       else if (UTF_8_4_OCTET_LEADING_P (c))
2983         seq_maybe_bytes = 3;
2984       else if (UTF_8_5_OCTET_LEADING_P (c))
2985         seq_maybe_bytes = 4;
2986       else if (UTF_8_6_OCTET_LEADING_P (c))
2987         seq_maybe_bytes = 5;
2988       else
2989         return 0;
2990
2991       do
2992         {
2993           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2994           if (!UTF_8_EXTRA_OCTET_P (c))
2995             return 0;
2996           seq_maybe_bytes--;
2997         }
2998       while (seq_maybe_bytes > 0);
2999     }
3000
3001  label_end_of_loop:
3002   return CODING_CATEGORY_MASK_UTF_8;
3003 }
3004
3005 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3006    Check if a text begins with a UTF-16 byte order mark.  If the
3007    first two bytes are 0xFF 0xFE, return CODING_CATEGORY_MASK_UTF_16_LE;
3008    if they are 0xFE 0xFF, return CODING_CATEGORY_MASK_UTF_16_BE;
3009    else return 0.  */
3010
/* Non-zero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Non-zero if VAL is a high surrogate, i.e. in 0xD800..0xDBFF.  The
   six high-order bits (mask 0xFC00) identify the surrogate kind;
   masking with 0xD800 itself would also match low surrogates and
   values such as 0xF900 or 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Non-zero if VAL is a low surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3020
3021 static int
3022 detect_coding_utf_16 (src, src_end, multibytep)
3023      unsigned char *src, *src_end;
3024      int multibytep;
3025 {
3026   unsigned char c1, c2;
3027   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3028   struct coding_system dummy_coding;
3029   struct coding_system *coding = &dummy_coding;
3030
3031   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3032   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3033
3034   if ((c1 == 0xFF) && (c2 == 0xFE))
3035     return CODING_CATEGORY_MASK_UTF_16_LE;
3036   else if ((c1 == 0xFE) && (c2 == 0xFF))
3037     return CODING_CATEGORY_MASK_UTF_16_BE;
3038
3039  label_end_of_loop:
3040   return 0;
3041 }
3042
3043 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3044    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3045
3046 static void
3047 decode_coding_sjis_big5 (coding, source, destination,
3048                          src_bytes, dst_bytes, sjis_p)
3049      struct coding_system *coding;
3050      unsigned char *source, *destination;
3051      int src_bytes, dst_bytes;
3052      int sjis_p;
3053 {
3054   unsigned char *src = source;
3055   unsigned char *src_end = source + src_bytes;
3056   unsigned char *dst = destination;
3057   unsigned char *dst_end = destination + dst_bytes;
3058   /* SRC_BASE remembers the start position in source in each loop.
3059      The loop will be exited when there's not enough source code
3060      (within macro ONE_MORE_BYTE), or when there's not enough
3061      destination area to produce a character (within macro
3062      EMIT_CHAR).  */
3063   unsigned char *src_base;
3064   Lisp_Object translation_table;
3065
3066   if (NILP (Venable_character_translation))
3067     translation_table = Qnil;
3068   else
3069     {
3070       translation_table = coding->translation_table_for_decode;
3071       if (NILP (translation_table))
3072         translation_table = Vstandard_translation_table_for_decode;
3073     }
3074
3075   coding->produced_char = 0;
3076   while (1)
3077     {
3078       int c, charset, c1, c2 = 0;
3079
3080       src_base = src;
3081       ONE_MORE_BYTE (c1);
3082
3083       if (c1 < 0x80)
3084         {
3085           charset = CHARSET_ASCII;
3086           if (c1 < 0x20)
3087             {
3088               if (c1 == '\r')
3089                 {
3090                   if (coding->eol_type == CODING_EOL_CRLF)
3091                     {
3092                       ONE_MORE_BYTE (c2);
3093                       if (c2 == '\n')
3094                         c1 = c2;
3095                       else
3096                         /* To process C2 again, SRC is subtracted by 1.  */
3097                         src--;
3098                     }
3099                   else if (coding->eol_type == CODING_EOL_CR)
3100                     c1 = '\n';
3101                 }
3102               else if (c1 == '\n'
3103                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3104                        && (coding->eol_type == CODING_EOL_CR
3105                            || coding->eol_type == CODING_EOL_CRLF))
3106                 {
3107                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3108                   goto label_end_of_loop;
3109                 }
3110             }
3111         }
3112       else
3113         {
3114           if (sjis_p)
3115             {
3116               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3117                 goto label_invalid_code;
3118               if (c1 <= 0x9F || c1 >= 0xE0)
3119                 {
3120                   /* SJIS -> JISX0208 */
3121                   ONE_MORE_BYTE (c2);
3122                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3123                     goto label_invalid_code;
3124                   DECODE_SJIS (c1, c2, c1, c2);
3125                   charset = charset_jisx0208;
3126                 }
3127               else
3128                 /* SJIS -> JISX0201-Kana */
3129                 charset = charset_katakana_jisx0201;
3130             }
3131           else
3132             {
3133               /* BIG5 -> Big5 */
3134               if (c1 < 0xA0 || c1 > 0xFE)
3135                 goto label_invalid_code;
3136               ONE_MORE_BYTE (c2);
3137               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3138                 goto label_invalid_code;
3139               DECODE_BIG5 (c1, c2, charset, c1, c2);
3140             }
3141         }
3142
3143       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3144       EMIT_CHAR (c);
3145       continue;
3146
3147     label_invalid_code:
3148       coding->errors++;
3149       src = src_base;
3150       c = *src++;
3151       EMIT_CHAR (c);
3152     }
3153
3154  label_end_of_loop:
3155   coding->consumed = coding->consumed_char = src_base - source;
3156   coding->produced = dst - destination;
3157   return;
3158 }
3159
3160 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3161    This function can encode charsets `ascii', `katakana-jisx0201',
3162    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3163    are sure that all these charsets are registered as official charset
3164    (i.e. do not have extended leading-codes).  Characters of other
3165    charsets are produced without any encoding.  If SJIS_P is 1, encode
3166    SJIS text, else encode BIG5 text.  */
3167
3168 static void
3169 encode_coding_sjis_big5 (coding, source, destination,
3170                          src_bytes, dst_bytes, sjis_p)
3171      struct coding_system *coding;
3172      unsigned char *source, *destination;
3173      int src_bytes, dst_bytes;
3174      int sjis_p;
3175 {
3176   unsigned char *src = source;
3177   unsigned char *src_end = source + src_bytes;
3178   unsigned char *dst = destination;
3179   unsigned char *dst_end = destination + dst_bytes;
3180   /* SRC_BASE remembers the start position in source in each loop.
3181      The loop will be exited when there's not enough source text to
3182      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3183      there's not enough destination area to produce encoded codes
3184      (within macro EMIT_BYTES).  */
3185   unsigned char *src_base;
3186   Lisp_Object translation_table;
3187
3188   if (NILP (Venable_character_translation))
3189     translation_table = Qnil;
3190   else
3191     {
3192       translation_table = coding->translation_table_for_encode;
3193       if (NILP (translation_table))
3194         translation_table = Vstandard_translation_table_for_encode;
3195     }
3196
3197   while (1)
3198     {
3199       int c, charset, c1, c2;
3200
3201       src_base = src;
3202       ONE_MORE_CHAR (c);
3203
3204       /* Now encode the character C.  */
3205       if (SINGLE_BYTE_CHAR_P (c))
3206         {
3207           switch (c)
3208             {
3209             case '\r':
3210               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3211                 {
3212                   EMIT_ONE_BYTE (c);
3213                   break;
3214                 }
3215               c = '\n';
3216             case '\n':
3217               if (coding->eol_type == CODING_EOL_CRLF)
3218                 {
3219                   EMIT_TWO_BYTES ('\r', c);
3220                   break;
3221                 }
3222               else if (coding->eol_type == CODING_EOL_CR)
3223                 c = '\r';
3224             default:
3225               EMIT_ONE_BYTE (c);
3226             }
3227         }
3228       else
3229         {
3230           SPLIT_CHAR (c, charset, c1, c2);
3231           if (sjis_p)
3232             {
3233               if (charset == charset_jisx0208
3234                   || charset == charset_jisx0208_1978)
3235                 {
3236                   ENCODE_SJIS (c1, c2, c1, c2);
3237                   EMIT_TWO_BYTES (c1, c2);
3238                 }
3239               else if (charset == charset_katakana_jisx0201)
3240                 EMIT_ONE_BYTE (c1 | 0x80);
3241               else if (charset == charset_latin_jisx0201)
3242                 EMIT_ONE_BYTE (c1);
3243               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3244                 {
3245                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3246                   if (CHARSET_WIDTH (charset) > 1)
3247                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3248                 }
3249               else
3250                 /* There's no way other than producing the internal
3251                    codes as is.  */
3252                 EMIT_BYTES (src_base, src);
3253             }
3254           else
3255             {
3256               if (charset == charset_big5_1 || charset == charset_big5_2)
3257                 {
3258                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3259                   EMIT_TWO_BYTES (c1, c2);
3260                 }
3261               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3262                 {
3263                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3264                   if (CHARSET_WIDTH (charset) > 1)
3265                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3266                 }
3267               else
3268                 /* There's no way other than producing the internal
3269                    codes as is.  */
3270                 EMIT_BYTES (src_base, src);
3271             }
3272         }
3273       coding->consumed_char++;
3274     }
3275
3276  label_end_of_loop:
3277   coding->consumed = src_base - source;
3278   coding->produced = coding->produced_char = dst - destination;
3279 }
3280
3281 \f
3282 /*** 5. CCL handlers ***/
3283
3284 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3285    Check if a text is encoded in a coding system of which
3286    encoder/decoder are written in CCL program.  If it is, return
3287    CODING_CATEGORY_MASK_CCL, else return 0.  */
3288
3289 static int
3290 detect_coding_ccl (src, src_end, multibytep)
3291      unsigned char *src, *src_end;
3292      int multibytep;
3293 {
3294   unsigned char *valid;
3295   int c;
3296   /* Dummy for ONE_MORE_BYTE.  */
3297   struct coding_system dummy_coding;
3298   struct coding_system *coding = &dummy_coding;
3299
3300   /* No coding system is assigned to coding-category-ccl.  */
3301   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3302     return 0;
3303
3304   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3305   while (1)
3306     {
3307       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3308       if (! valid[c])
3309         return 0;
3310     }
3311  label_end_of_loop:
3312   return CODING_CATEGORY_MASK_CCL;
3313 }
3314
3315 \f
3316 /*** 6. End-of-line handlers ***/
3317
3318 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3319
3320 static void
3321 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3322      struct coding_system *coding;
3323      unsigned char *source, *destination;
3324      int src_bytes, dst_bytes;
3325 {
3326   unsigned char *src = source;
3327   unsigned char *dst = destination;
3328   unsigned char *src_end = src + src_bytes;
3329   unsigned char *dst_end = dst + dst_bytes;
3330   Lisp_Object translation_table;
3331   /* SRC_BASE remembers the start position in source in each loop.
3332      The loop will be exited when there's not enough source code
3333      (within macro ONE_MORE_BYTE), or when there's not enough
3334      destination area to produce a character (within macro
3335      EMIT_CHAR).  */
3336   unsigned char *src_base;
3337   int c;
3338
3339   translation_table = Qnil;
3340   switch (coding->eol_type)
3341     {
3342     case CODING_EOL_CRLF:
3343       while (1)
3344         {
3345           src_base = src;
3346           ONE_MORE_BYTE (c);
3347           if (c == '\r')
3348             {
3349               ONE_MORE_BYTE (c);
3350               if (c != '\n')
3351                 {
3352                   src--;
3353                   c = '\r';
3354                 }
3355             }
3356           else if (c == '\n'
3357                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3358             {
3359               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3360               goto label_end_of_loop;
3361             }
3362           EMIT_CHAR (c);
3363         }
3364       break;
3365
3366     case CODING_EOL_CR:
3367       while (1)
3368         {
3369           src_base = src;
3370           ONE_MORE_BYTE (c);
3371           if (c == '\n')
3372             {
3373               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3374                 {
3375                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3376                   goto label_end_of_loop;
3377                 }
3378             }
3379           else if (c == '\r')
3380             c = '\n';
3381           EMIT_CHAR (c);
3382         }
3383       break;
3384
3385     default:                    /* no need for EOL handling */
3386       while (1)
3387         {
3388           src_base = src;
3389           ONE_MORE_BYTE (c);
3390           EMIT_CHAR (c);
3391         }
3392     }
3393
3394  label_end_of_loop:
3395   coding->consumed = coding->consumed_char = src_base - source;
3396   coding->produced = dst - destination;
3397   return;
3398 }
3399
3400 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3401    format of end-of-line according to `coding->eol_type'.  It also
3402    convert multibyte form 8-bit characters to unibyte if
3403    CODING->src_multibyte is nonzero.  If `coding->mode &
3404    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3405    also means end-of-line.  */
3406
3407 static void
3408 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3409      struct coding_system *coding;
3410      const unsigned char *source;
3411      unsigned char *destination;
3412      int src_bytes, dst_bytes;
3413 {
3414   const unsigned char *src = source;
3415   unsigned char *dst = destination;
3416   const unsigned char *src_end = src + src_bytes;
3417   unsigned char *dst_end = dst + dst_bytes;
3418   Lisp_Object translation_table;
3419   /* SRC_BASE remembers the start position in source in each loop.
3420      The loop will be exited when there's not enough source text to
3421      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3422      there's not enough destination area to produce encoded codes
3423      (within macro EMIT_BYTES).  */
3424   const unsigned char *src_base;
3425   unsigned char *tmp;
3426   int c;
3427   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3428
3429   translation_table = Qnil;
3430   if (coding->src_multibyte
3431       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3432     {
3433       src_end--;
3434       src_bytes--;
3435       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3436     }
3437
3438   if (coding->eol_type == CODING_EOL_CRLF)
3439     {
3440       while (src < src_end)
3441         {
3442           src_base = src;
3443           c = *src++;
3444           if (c >= 0x20)
3445             EMIT_ONE_BYTE (c);
3446           else if (c == '\n' || (c == '\r' && selective_display))
3447             EMIT_TWO_BYTES ('\r', '\n');
3448           else
3449             EMIT_ONE_BYTE (c);
3450         }
3451       src_base = src;
3452     label_end_of_loop:
3453       ;
3454     }
3455   else
3456     {
3457       if (!dst_bytes || src_bytes <= dst_bytes)
3458         {
3459           safe_bcopy (src, dst, src_bytes);
3460           src_base = src_end;
3461           dst += src_bytes;
3462         }
3463       else
3464         {
3465           if (coding->src_multibyte
3466               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3467             dst_bytes--;
3468           safe_bcopy (src, dst, dst_bytes);
3469           src_base = src + dst_bytes;
3470           dst = destination + dst_bytes;
3471           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3472         }
3473       if (coding->eol_type == CODING_EOL_CR)
3474         {
3475           for (tmp = destination; tmp < dst; tmp++)
3476             if (*tmp == '\n') *tmp = '\r';
3477         }
3478       else if (selective_display)
3479         {
3480           for (tmp = destination; tmp < dst; tmp++)
3481             if (*tmp == '\r') *tmp = '\n';
3482         }
3483     }
3484   if (coding->src_multibyte)
3485     dst = destination + str_as_unibyte (destination, dst - destination);
3486
3487   coding->consumed = src_base - source;
3488   coding->produced = dst - destination;
3489   coding->produced_char = coding->produced;
3490 }
3491
3492 \f
3493 /*** 7. C library functions ***/
3494
3495 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3496    has a property `coding-system'.  The value of this property is a
3497    vector of length 5 (called the coding-vector).  Among elements of
3498    this vector, the first (element[0]) and the fifth (element[4])
3499    carry important information for decoding/encoding.  Before
3500    decoding/encoding, this information should be set in fields of a
3501    structure of type `coding_system'.
3502
3503    The value of the property `coding-system' can be a symbol of another
3504    subsidiary coding-system.  In that case, Emacs gets coding-vector
3505    from that symbol.
3506
3507    `element[0]' contains information to be set in `coding->type'.  The
3508    values and their meanings are as follows:
3509
3510    0 -- coding_type_emacs_mule
3511    1 -- coding_type_sjis
3512    2 -- coding_type_iso2022
3513    3 -- coding_type_big5
3514    4 -- coding_type_ccl encoder/decoder written in CCL
3515    nil -- coding_type_no_conversion
3516    t -- coding_type_undecided (automatic conversion on decoding,
3517                                no-conversion on encoding)
3518
3519    `element[4]' contains information to be set in `coding->flags' and
3520    `coding->spec'.  The meaning varies by `coding->type'.
3521
3522    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3523    of length 32 (of which the first 13 sub-elements are used now).
3524    Meanings of these sub-elements are:
3525
3526    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3527         If the value is an integer of valid charset, the charset is
3528         assumed to be designated to graphic register N initially.
3529
3530         If the value is negative, its absolute value is a charset which
3531         reserves graphic register N, which means that the charset is
3532         not designated initially but should be designated to graphic
3533         register N just before encoding a character in that charset.
3534
3535         If the value is nil, graphic register N is never used on
3536         encoding.
3537
3538    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3539         Each value takes t or nil.  See the section ISO2022 of
3540         `coding.h' for more information.
3541
3542    If `coding->type' is `coding_type_big5', element[4] is t to denote
3543    BIG5-ETen or nil to denote BIG5-HKU.
3544
3545    If `coding->type' takes the other value, element[4] is ignored.
3546
3547    Emacs Lisp's coding systems also carry information about format of
3548    end-of-line in a value of property `eol-type'.  If the value is
3549    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3550    means CODING_EOL_CR.  If it is not integer, it should be a vector
3551    of subsidiary coding systems of which property `eol-type' has one
3552    of the above values.
3553
3554 */
3555
/* Extract information for decoding/encoding from the coding system
   symbol CODING_SYSTEM and set it in CODING.

   CODING is zero-cleared first, so whatever it held before (including
   the `cmp_data' pointer) is overwritten without being freed here.

   Return 0 on success.  If CODING_SYSTEM is nil, if its
   Qcoding_system property is not a five-element vector whose fourth
   element is a cons, or if any later check fails, CODING is set up so
   that no conversion is necessary (type `coding_type_no_conversion',
   category binary, end-of-line LF, no pre/post conversion functions)
   and -1 is returned.  */

int
setup_coding_system (coding_system, coding)
     Lisp_Object coding_system;
     struct coding_system *coding;
{
  Lisp_Object coding_spec, coding_type, eol_type, plist;
  Lisp_Object val;

  /* At first, zero clear all members.  */
  bzero (coding, sizeof (struct coding_system));

  /* Initialize some fields required for all kinds of coding systems.  */
  coding->symbol = coding_system;
  coding->heading_ascii = -1;
  coding->post_read_conversion = coding->pre_write_conversion = Qnil;
  coding->composing = COMPOSITION_DISABLED;
  coding->cmp_data = NULL;

  if (NILP (coding_system))
    goto label_invalid_coding_system;

  coding_spec = Fget (coding_system, Qcoding_system);

  /* The spec must be a vector of exactly five elements, and its
     element 3 (used as a property list below) must be a cons.  */
  if (!VECTORP (coding_spec)
      || XVECTOR (coding_spec)->size != 5
      || !CONSP (XVECTOR (coding_spec)->contents[3]))
    goto label_invalid_coding_system;

  /* Decide the end-of-line format.  A vector here means the format is
     not fixed yet and has to be detected; 1 selects CRLF, 2 selects
     CR, and anything else falls back to LF.
     NOTE(review): when inhibit_eol_conversion is nonzero (or the
     property is not an integer), XFASTINT is applied to a non-integer
     object; presumably the result is never 1 or 2 in that case so
     that LF is chosen -- confirm.  */
  eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
  if (VECTORP (eol_type))
    {
      coding->eol_type = CODING_EOL_UNDECIDED;
      coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
    }
  else if (XFASTINT (eol_type) == 1)
    {
      coding->eol_type = CODING_EOL_CRLF;
      coding->common_flags
        = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
    }
  else if (XFASTINT (eol_type) == 2)
    {
      coding->eol_type = CODING_EOL_CR;
      coding->common_flags
        = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
    }
  else
    coding->eol_type = CODING_EOL_LF;

  coding_type = XVECTOR (coding_spec)->contents[0];
  /* Try short cut.  When the type slot holds a symbol, t means
     "undecided" (detection required) and any other symbol means "no
     conversion".  Nothing else of the spec is examined in this
     case.  */
  if (SYMBOLP (coding_type))
    {
      if (EQ (coding_type, Qt))
        {
          coding->type = coding_type_undecided;
          coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
        }
      else
        coding->type = coding_type_no_conversion;
      /* Initialize this member.  Any thing other than
         CODING_CATEGORY_IDX_UTF_16_BE and
         CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
         special treatment in detect_eol.  */
      coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;

      return 0;
    }

  /* Get values of coding system properties:
     `post-read-conversion', `pre-write-conversion',
     `translation-table-for-decode', `translation-table-for-encode'.  */
  plist = XVECTOR (coding_spec)->contents[3];
  /* Pre & post conversion functions should be disabled if
     inhibit_pre_post_conversion is nonzero.  This is the case that a
     code conversion function is called while those functions are
     running.  */
  if (! inhibit_pre_post_conversion)
    {
      coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
      coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
    }
  /* A translation table may be given directly or through a symbol
     that carries it as a property.  Anything that does not end up
     being a char-table is treated as "no table".  */
  val = Fplist_get (plist, Qtranslation_table_for_decode);
  if (SYMBOLP (val))
    val = Fget (val, Qtranslation_table_for_decode);
  coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
  val = Fplist_get (plist, Qtranslation_table_for_encode);
  if (SYMBOLP (val))
    val = Fget (val, Qtranslation_table_for_encode);
  coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
  /* The coding category is mandatory: it must be non-nil and must
     carry an integer Qcoding_category_index property, otherwise the
     coding system is rejected.  */
  val = Fplist_get (plist, Qcoding_category);
  if (!NILP (val))
    {
      val = Fget (val, Qcoding_category_index);
      if (INTEGERP (val))
        coding->category_idx = XINT (val);
      else
        goto label_invalid_coding_system;
    }
  else
    goto label_invalid_coding_system;

  /* If the coding system has non-nil `composition' property, enable
     composition handling.  */
  val = Fplist_get (plist, Qcomposition);
  if (!NILP (val))
    coding->composing = COMPOSITION_NO;

  /* Dispatch on the numeric coding type.
     NOTE(review): CODING_TYPE is only known not to be a symbol here;
     XFASTINT is applied without an INTEGERP check -- confirm that the
     spec is validated elsewhere.  */
  switch (XFASTINT (coding_type))
    {
    case 0:
      coding->type = coding_type_emacs_mule;
      coding->common_flags
        |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
      /* Both masks were already set just above, so the two tests
         below do not change COMMON_FLAGS.  */
      if (!NILP (coding->post_read_conversion))
        coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
      if (!NILP (coding->pre_write_conversion))
        coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
      break;

    case 1:
      coding->type = coding_type_sjis;
      coding->common_flags
        |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
      break;

    case 2:
      coding->type = coding_type_iso2022;
      coding->common_flags
        |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
      {
        /* This VAL shadows the function-level VAL within this block.  */
        Lisp_Object val, temp;
        Lisp_Object *flags;
        /* REG_BITS collects, one bit per graphic register, the
           registers that were declared usable by any charset (t).  */
        int i, charset, reg_bits = 0;

        val = XVECTOR (coding_spec)->contents[4];

        /* The ISO2022 flag vector must have exactly 32 elements.  */
        if (!VECTORP (val) || XVECTOR (val)->size != 32)
          goto label_invalid_coding_system;

        /* Elements 4 through 16 are booleans, each mapped to one
           CODING_FLAG_ISO_XXX bit.  */
        flags = XVECTOR (val)->contents;
        coding->flags
          = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
             | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
             | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
             | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
             | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
             | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
             | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
             | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
             | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
             | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
             | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
             | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
             | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
             );

        /* Invoke graphic register 0 to plane 0.  */
        CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
        /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
        CODING_SPEC_ISO_INVOCATION (coding, 1)
          = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
        /* Not single shifting at first.  */
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
        /* Beginning of buffer should also be regarded as bol. */
        CODING_SPEC_ISO_BOL (coding) = 1;

        /* Start with revision number 255 for every charset, then
           install the revisions listed in Vcharset_revision_alist.
           An entry is used only if its car names a known charset and
           its cdr is an integer I with 0 <= I and I + '@' < 128.  */
        for (charset = 0; charset <= MAX_CHARSET; charset++)
          CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
        val = Vcharset_revision_alist;
        while (CONSP (val))
          {
            charset = get_charset_id (Fcar_safe (XCAR (val)));
            if (charset >= 0
                && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
                && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
              CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
            val = XCDR (val);
          }

        /* Check FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
           FLAGS[REG] can be one of below:
                integer CHARSET: CHARSET occupies register REG,
                t: designate nothing to REG initially, but can be used
                  by any charsets,
                list of integer, nil, or t: designate the first
                  element (if integer) to REG initially, the remaining
                  elements (if integer) is designated to REG on request,
                  if an element is t, REG can be used by any charsets,
                nil: REG is never used.
           Wherever an integer charset is accepted, a value that
           get_charset_id maps to a non-negative ID is accepted too.  */
        for (charset = 0; charset <= MAX_CHARSET; charset++)
          CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
            = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
        for (i = 0; i < 4; i++)
          {
            if ((INTEGERP (flags[i])
                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
                || (charset = get_charset_id (flags[i])) >= 0)
              {
                CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
                CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
              }
            else if (EQ (flags[i], Qt))
              {
                CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
                reg_bits |= 1 << i;
                coding->flags |= CODING_FLAG_ISO_DESIGNATION;
              }
            else if (CONSP (flags[i]))
              {
                Lisp_Object tail;
                tail = flags[i];

                coding->flags |= CODING_FLAG_ISO_DESIGNATION;
                /* The head of the list decides the initial
                   designation of register I.  */
                if ((INTEGERP (XCAR (tail))
                     && (charset = XINT (XCAR (tail)),
                         CHARSET_VALID_P (charset)))
                    || (charset = get_charset_id (XCAR (tail))) >= 0)
                  {
                    CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
                    CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
                  }
                else
                  CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
                /* The remaining elements only request register I.  */
                tail = XCDR (tail);
                while (CONSP (tail))
                  {
                    if ((INTEGERP (XCAR (tail))
                         && (charset = XINT (XCAR (tail)),
                             CHARSET_VALID_P (charset)))
                        || (charset = get_charset_id (XCAR (tail))) >= 0)
                      CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
                        = i;
                    else if (EQ (XCAR (tail), Qt))
                      reg_bits |= 1 << i;
                    tail = XCDR (tail);
                  }
              }
            else
              CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;

            /* The current designation starts out as the initial one.  */
            CODING_SPEC_ISO_DESIGNATION (coding, i)
              = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
          }

        /* Without locking shift, drop from REG_BITS the registers
           that cannot actually be reached.  */
        if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
          {
            /* REG 1 can be used only by locking shift in 7-bit env.  */
            if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
              reg_bits &= ~2;
            if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
              /* Without any shifting, only REG 0 and 1 can be used.  */
              reg_bits &= 3;
          }

        /* Give every defined charset that has no explicit request a
           default register chosen from the generally usable ones.  */
        if (reg_bits)
          for (charset = 0; charset <= MAX_CHARSET; charset++)
            {
              if (CHARSET_DEFINED_P (charset)
                  && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
                      == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
                {
                  /* There exist some default graphic registers to be
                     used by CHARSET.  */

                  /* We had better avoid designating a charset of
                     CHARS96 to REG 0 as far as possible.  */
                  if (CHARSET_CHARS (charset) == 96)
                    CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
                      = (reg_bits & 2
                         ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
                  else
                    CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
                      = (reg_bits & 1
                         ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
                }
            }
      }
      coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
      coding->spec.iso2022.last_invalid_designation_register = -1;
      break;

    case 3:
      coding->type = coding_type_big5;
      coding->common_flags
        |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
      /* Element 4 of the spec selects the BIG5 variant: nil for HKU,
         anything else for ETen.  */
      coding->flags
        = (NILP (XVECTOR (coding_spec)->contents[4])
           ? CODING_FLAG_BIG5_HKU
           : CODING_FLAG_BIG5_ETEN);
      break;

    case 4:
      coding->type = coding_type_ccl;
      coding->common_flags
        |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
      {
        /* Element 4 of the spec must be a cons whose car is passed to
           setup_ccl_program for the decoder and whose cdr is passed
           for the encoder; both calls must succeed.  */
        val = XVECTOR (coding_spec)->contents[4];
        if (! CONSP (val)
            || setup_ccl_program (&(coding->spec.ccl.decoder),
                                  XCAR (val)) < 0
            || setup_ccl_program (&(coding->spec.ccl.encoder),
                                  XCDR (val)) < 0)
          goto label_invalid_coding_system;

        /* Build the table of valid byte values from the Qvalid_codes
           property.  Each list element is either a single integer in
           0..255 or a cons (START . END) denoting an inclusive range
           within 0..255; any other element is silently ignored.  */
        bzero (coding->spec.ccl.valid_codes, 256);
        val = Fplist_get (plist, Qvalid_codes);
        if (CONSP (val))
          {
            Lisp_Object this;

            for (; CONSP (val); val = XCDR (val))
              {
                this = XCAR (val);
                if (INTEGERP (this)
                    && XINT (this) >= 0 && XINT (this) < 256)
                  coding->spec.ccl.valid_codes[XINT (this)] = 1;
                else if (CONSP (this)
                         && INTEGERP (XCAR (this))
                         && INTEGERP (XCDR (this)))
                  {
                    int start = XINT (XCAR (this));
                    int end = XINT (XCDR (this));

                    if (start >= 0 && start <= end && end < 256)
                      while (start <= end)
                        coding->spec.ccl.valid_codes[start++] = 1;
                  }
              }
          }
      }
      coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
      coding->spec.ccl.cr_carryover = 0;
      coding->spec.ccl.eight_bit_carryover[0] = 0;
      break;

    case 5:
      /* Raw text adds no flags beyond those set for end-of-line
         handling above.  */
      coding->type = coding_type_raw_text;
      break;

    default:
      goto label_invalid_coding_system;
    }
  return 0;

  /* Common failure exit: leave CODING describing a coding system that
     needs no conversion at all.  */
 label_invalid_coding_system:
  coding->type = coding_type_no_conversion;
  coding->category_idx = CODING_CATEGORY_IDX_BINARY;
  coding->common_flags = 0;
  coding->eol_type = CODING_EOL_LF;
  coding->pre_write_conversion = coding->post_read_conversion = Qnil;
  return -1;
}
3913
3914 /* Free memory blocks allocated for storing composition information.  */
3915
3916 void
3917 coding_free_composition_data (coding)
3918      struct coding_system *coding;
3919 {
3920   struct composition_data *cmp_data = coding->cmp_data, *next;
3921
3922   if (!cmp_data)
3923     return;
3924   /* Memory blocks are chained.  At first, rewind to the first, then,
3925      free blocks one by one.  */
3926   while (cmp_data->prev)
3927     cmp_data = cmp_data->prev;
3928   while (cmp_data)
3929     {
3930       next = cmp_data->next;
3931       xfree (cmp_data);
3932       cmp_data = next;
3933     }
3934   coding->cmp_data = NULL;
3935 }
3936
3937 /* Set `char_offset' member of all memory blocks pointed by
3938    coding->cmp_data to POS.  */
3939
3940 void
3941 coding_adjust_composition_offset (coding, pos)
3942      struct coding_system *coding;
3943      int pos;
3944 {
3945   struct composition_data *cmp_data;
3946
3947   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3948     cmp_data->char_offset = pos;
3949 }
3950
3951 /* Setup raw-text or one of its subsidiaries in the structure
3952    coding_system CODING according to the already setup value eol_type
3953    in CODING.  CODING should be setup for some coding system in
3954    advance.  */
3955
3956 void
3957 setup_raw_text_coding_system (coding)
3958      struct coding_system *coding;
3959 {
3960   if (coding->type != coding_type_raw_text)
3961     {
3962       coding->symbol = Qraw_text;
3963       coding->type = coding_type_raw_text;
3964       if (coding->eol_type != CODING_EOL_UNDECIDED)
3965         {
3966           Lisp_Object subsidiaries;
3967           subsidiaries = Fget (Qraw_text, Qeol_type);
3968
3969           if (VECTORP (subsidiaries)
3970               && XVECTOR (subsidiaries)->size == 3)
3971             coding->symbol
3972               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3973         }
3974       setup_coding_system (coding->symbol, coding);
3975     }
3976   return;
3977 }
3978
3979 /* Emacs has a mechanism to automatically detect a coding system if it
3980    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3981    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
3984
3985    o coding-category-emacs-mule
3986
3987         The category for a coding system which has the same code range
3988         as Emacs' internal format.  Assigned the coding-system (Lisp
3989         symbol) `emacs-mule' by default.
3990
3991    o coding-category-sjis
3992
3993         The category for a coding system which has the same code range
3994         as SJIS.  Assigned the coding-system (Lisp
3995         symbol) `japanese-shift-jis' by default.
3996
3997    o coding-category-iso-7
3998
3999         The category for a coding system which has the same code range
4000         as ISO2022 of 7-bit environment.  This doesn't use any locking
4001         shift and single shift functions.  This can encode/decode all
4002         charsets.  Assigned the coding-system (Lisp symbol)
4003         `iso-2022-7bit' by default.
4004
4005    o coding-category-iso-7-tight
4006
4007         Same as coding-category-iso-7 except that this can
4008         encode/decode only the specified charsets.
4009
4010    o coding-category-iso-8-1
4011
4012         The category for a coding system which has the same code range
4013         as ISO2022 of 8-bit environment and graphic plane 1 used only
4014         for DIMENSION1 charset.  This doesn't use any locking shift
4015         and single shift functions.  Assigned the coding-system (Lisp
4016         symbol) `iso-latin-1' by default.
4017
4018    o coding-category-iso-8-2
4019
4020         The category for a coding system which has the same code range
4021         as ISO2022 of 8-bit environment and graphic plane 1 used only
4022         for DIMENSION2 charset.  This doesn't use any locking shift
4023         and single shift functions.  Assigned the coding-system (Lisp
4024         symbol) `japanese-iso-8bit' by default.
4025
4026    o coding-category-iso-7-else
4027
4028         The category for a coding system which has the same code range
4029         as ISO2022 of 7-bit environment but uses locking shift or
4030         single shift functions.  Assigned the coding-system (Lisp
4031         symbol) `iso-2022-7bit-lock' by default.
4032
4033    o coding-category-iso-8-else
4034
4035         The category for a coding system which has the same code range
4036         as ISO2022 of 8-bit environment but uses locking shift or
4037         single shift functions.  Assigned the coding-system (Lisp
4038         symbol) `iso-2022-8bit-ss2' by default.
4039
4040    o coding-category-big5
4041
4042         The category for a coding system which has the same code range
4043         as BIG5.  Assigned the coding-system (Lisp symbol)
4044         `cn-big5' by default.
4045
4046    o coding-category-utf-8
4047
4048         The category for a coding system which has the same code range
4049         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4050         symbol) `utf-8' by default.
4051
4052    o coding-category-utf-16-be
4053
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in the order of BIG
4056         endian at the head.  Assigned the coding-system (Lisp symbol)
4057         `utf-16-be' by default.
4058
4059    o coding-category-utf-16-le
4060
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in the order of
4063         LITTLE endian at the head.  Assigned the coding-system (Lisp
4064         symbol) `utf-16-le' by default.
4065
4066    o coding-category-ccl
4067
4068         The category for a coding system of which encoder/decoder is
4069         written in CCL programs.  The default value is nil, i.e., no
4070         coding system is assigned.
4071
4072    o coding-category-binary
4073
4074         The category for a coding system not categorized in any of the
4075         above.  Assigned the coding-system (Lisp symbol)
4076         `no-conversion' by default.
4077
4078    Each of them is a Lisp symbol and the value is an actual
4079    `coding-system' (this is also a Lisp symbol) assigned by a user.
4080    What Emacs does actually is to detect a category of coding system.
4081    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4082    decide a single possible category, it selects a category of the
4083    highest priority.  Priorities of categories are also specified by a
4084    user in a Lisp variable `coding-category-list'.
4085
4086 */
4087
/* Table indexed by byte value.  detect_coding_mask skips every
   leading byte whose entry here is nonzero.  That function clears the
   entries for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC on entry and
   sets them to 1 again when such a byte turns out not to introduce
   valid ISO2022 code.  The remaining entries are presumably filled in
   by initialization code elsewhere -- TODO confirm.  */
static
int ascii_skip_code[256];
4090
/* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
   If it detects possible coding systems, return an integer in which
   appropriate flag bits are set.  Flag bits are defined by macros
   CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
   it should point the table `coding_priorities'.  In that case, only
   the flag bit for a coding system of the highest priority is set in
   the returned value (CODING_CATEGORY_MASK_RAW_TEXT if no entry of
   PRIORITIES matches).  If PRIORITIES is NULL, the bits for raw-text
   and binary are always included in the returned value.  If
   MULTIBYTEP is nonzero, 8-bit codes of the range 0x80..0x9F are in
   multibyte form.

   Return 0 if the text consists only of bytes that are skipped as
   ASCII.

   How many ASCII characters are at the head is returned as *SKIP.

   This function modifies the file-scope table `ascii_skip_code'.  */

static int
detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
     unsigned char *source;
     int src_bytes, *priorities, *skip;
     int multibytep;
{
  register unsigned char c;
  unsigned char *src = source, *src_end = source + src_bytes;
  unsigned int mask, utf16_examined_p, iso2022_examined_p;
  int i;

  /* At first, skip all ASCII characters and control characters except
     for three ISO2022 specific control characters.  */
  ascii_skip_code[ISO_CODE_SO] = 0;
  ascii_skip_code[ISO_CODE_SI] = 0;
  ascii_skip_code[ISO_CODE_ESC] = 0;

  /* Re-entered with SRC advanced whenever an ISO2022 control code
     turned out not to start valid ISO2022 code.  */
 label_loop_detect_coding:
  while (src < src_end && ascii_skip_code[*src]) src++;
  *skip = src - source;

  if (src >= src_end)
    /* We found nothing other than ASCII.  There's nothing to do.  */
    return 0;

  /* Note that SRC is not advanced here: it keeps pointing at the
     byte stored in C, and the detectors below start from that byte.  */
  c = *src;
  /* The text seems to be encoded in some multilingual coding system.
     Now, try to find in which coding system the text is encoded.  */
  if (c < 0x80)
    {
      /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
      /* C is an ISO2022 specific control code of C0.  */
      mask = detect_coding_iso2022 (src, src_end, multibytep);
      if (mask == 0)
        {
          /* No valid ISO2022 code follows C.  Try again, this time
             treating that kind of control code as a byte to skip.  */
          src++;
          if (c == ISO_CODE_ESC)
            ascii_skip_code[ISO_CODE_ESC] = 1;
          else
            ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
          goto label_loop_detect_coding;
        }
      if (priorities)
        {
          /* Report only the first category in priority order that the
             ISO2022 detector accepted.  */
          for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
            {
              if (mask & priorities[i])
                return priorities[i];
            }
          return CODING_CATEGORY_MASK_RAW_TEXT;
        }
    }
  else
    {
      /* Bit set of the categories worth testing, judged from C
         alone.  */
      int try;

      /* NOTE(review): src[1] is read without comparing against
         SRC_END; presumably safe because multibyte text always has a
         byte after this leading code -- confirm.  */
      if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
        c = src[1] - 0x20;

      if (c < 0xA0)
        {
          /* C is the first byte of SJIS character code,
             or a leading-code of Emacs' internal format (emacs-mule),
             or the first byte of UTF-16.  */
          try = (CODING_CATEGORY_MASK_SJIS
                  | CODING_CATEGORY_MASK_EMACS_MULE
                  | CODING_CATEGORY_MASK_UTF_16_BE
                  | CODING_CATEGORY_MASK_UTF_16_LE);

          /* Or, if C is a special latin extra code,
             or is an ISO2022 specific control code of C1 (SS2 or SS3),
             or is an ISO2022 control-sequence-introducer (CSI),
             we should also consider the possibility of ISO2022 codings.

             NOTE(review): SRC still points at the byte C was read
             from, so the `*src' and `src[1]' tests in the CSI clause
             examine that byte and its successor rather than the one
             or two bytes following CSI.  Since that byte is 0x80 or
             above, the clause looks unable to succeed as written --
             confirm whether src[1] and src[2] were intended.  */
          if ((VECTORP (Vlatin_extra_code_table)
               && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
              || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
              || (c == ISO_CODE_CSI
                  && (src < src_end
                      && (*src == ']'
                          || ((*src == '0' || *src == '1' || *src == '2')
                              && src + 1 < src_end
                              && src[1] == ']')))))
            try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
                     | CODING_CATEGORY_MASK_ISO_8BIT);
        }
      else
        /* C is a character of ISO2022 in graphic plane right,
           or a SJIS's 1-byte character code (i.e. JISX0201),
           or the first byte of BIG5's 2-byte code,
           or the first byte of UTF-8/16.  */
        try = (CODING_CATEGORY_MASK_ISO_8_ELSE
                | CODING_CATEGORY_MASK_ISO_8BIT
                | CODING_CATEGORY_MASK_SJIS
                | CODING_CATEGORY_MASK_BIG5
                | CODING_CATEGORY_MASK_UTF_8
                | CODING_CATEGORY_MASK_UTF_16_BE
                | CODING_CATEGORY_MASK_UTF_16_LE);

      /* Or, we may have to consider the possibility of CCL: a coding
         system is registered for the CCL category and it lists C
         among its valid codes.  */
      if (coding_system_table[CODING_CATEGORY_IDX_CCL]
          && (coding_system_table[CODING_CATEGORY_IDX_CCL]
              ->spec.ccl.valid_codes)[c])
        try |= CODING_CATEGORY_MASK_CCL;

      mask = 0;
      utf16_examined_p = iso2022_examined_p = 0;
      if (priorities)
        {
          /* Run the detectors in priority order and stop at the first
             category that is accepted.  The ISO2022 and UTF-16
             detectors each serve several categories, so the
             *_examined_p flags make sure each of them runs at most
             once.  At most one detector runs per iteration.  */
          for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
            {
              if (!iso2022_examined_p
                  && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
                {
                  mask |= detect_coding_iso2022 (src, src_end, multibytep);
                  iso2022_examined_p = 1;
                }
              else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
                mask |= detect_coding_sjis (src, src_end, multibytep);
              else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
                mask |= detect_coding_utf_8 (src, src_end, multibytep);
              else if (!utf16_examined_p
                       && (priorities[i] & try &
                           CODING_CATEGORY_MASK_UTF_16_BE_LE))
                {
                  mask |= detect_coding_utf_16 (src, src_end, multibytep);
                  utf16_examined_p = 1;
                }
              else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
                mask |= detect_coding_big5 (src, src_end, multibytep);
              else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
                mask |= detect_coding_emacs_mule (src, src_end, multibytep);
              else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
                mask |= detect_coding_ccl (src, src_end, multibytep);
              /* Raw-text and binary need no detector; they are
                 accepted as soon as their turn comes.  */
              else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
                mask |= CODING_CATEGORY_MASK_RAW_TEXT;
              else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
                mask |= CODING_CATEGORY_MASK_BINARY;
              if (mask & priorities[i])
                return priorities[i];
            }
          return CODING_CATEGORY_MASK_RAW_TEXT;
        }
      /* No priority table: run every detector selected in TRY and
         accumulate all the categories they accept.  */
      if (try & CODING_CATEGORY_MASK_ISO)
        mask |= detect_coding_iso2022 (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_SJIS)
        mask |= detect_coding_sjis (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_BIG5)
        mask |= detect_coding_big5 (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_UTF_8)
        mask |= detect_coding_utf_8 (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
        mask |= detect_coding_utf_16 (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_EMACS_MULE)
        mask |= detect_coding_emacs_mule (src, src_end, multibytep);
      if (try & CODING_CATEGORY_MASK_CCL)
        mask |= detect_coding_ccl (src, src_end, multibytep);
    }
  /* Reached only when PRIORITIES is NULL.  */
  return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
}
4262
4263 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4264    The information of the detected coding system is set in CODING.  */
4265
4266 void
4267 detect_coding (coding, src, src_bytes)
4268      struct coding_system *coding;
4269      const unsigned char *src;
4270      int src_bytes;
4271 {
4272   unsigned int idx;
4273   int skip, mask;
4274   Lisp_Object val;
4275
4276   val = Vcoding_category_list;
4277   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4278                              coding->src_multibyte);
4279   coding->heading_ascii = skip;
4280
4281   if (!mask) return;
4282
4283   /* We found a single coding system of the highest priority in MASK.  */
4284   idx = 0;
4285   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4286   if (! mask)
4287     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4288
4289   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4290
4291   if (coding->eol_type != CODING_EOL_UNDECIDED)
4292     {
4293       Lisp_Object tmp;
4294
4295       tmp = Fget (val, Qeol_type);
4296       if (VECTORP (tmp))
4297         val = XVECTOR (tmp)->contents[coding->eol_type];
4298     }
4299
4300   /* Setup this new coding system while preserving some slots.  */
4301   {
4302     int src_multibyte = coding->src_multibyte;
4303     int dst_multibyte = coding->dst_multibyte;
4304
4305     setup_coding_system (val, coding);
4306     coding->src_multibyte = src_multibyte;
4307     coding->dst_multibyte = dst_multibyte;
4308     coding->heading_ascii = skip;
4309   }
4310 }
4311
4312 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4313    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4314    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4315
4316    How many non-eol characters are at the head is returned as *SKIP.  */
4317
4318 #define MAX_EOL_CHECK_COUNT 3
4319
4320 static int
4321 detect_eol_type (source, src_bytes, skip)
4322      unsigned char *source;
4323      int src_bytes, *skip;
4324 {
4325   unsigned char *src = source, *src_end = src + src_bytes;
4326   unsigned char c;
4327   int total = 0;                /* How many end-of-lines are found so far.  */
4328   int eol_type = CODING_EOL_UNDECIDED;
4329   int this_eol_type;
4330
4331   *skip = 0;
4332
4333   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4334     {
4335       c = *src++;
4336       if (c == '\n' || c == '\r')
4337         {
4338           if (*skip == 0)
4339             *skip = src - 1 - source;
4340           total++;
4341           if (c == '\n')
4342             this_eol_type = CODING_EOL_LF;
4343           else if (src >= src_end || *src != '\n')
4344             this_eol_type = CODING_EOL_CR;
4345           else
4346             this_eol_type = CODING_EOL_CRLF, src++;
4347
4348           if (eol_type == CODING_EOL_UNDECIDED)
4349             /* This is the first end-of-line.  */
4350             eol_type = this_eol_type;
4351           else if (eol_type != this_eol_type)
4352             {
4353               /* The found type is different from what found before.  */
4354               eol_type = CODING_EOL_INCONSISTENT;
4355               break;
4356             }
4357         }
4358     }
4359
4360   if (*skip == 0)
4361     *skip = src_end - source;
4362   return eol_type;
4363 }
4364
4365 /* Like detect_eol_type, but detect EOL type in 2-octet
4366    big-endian/little-endian format for coding systems utf-16-be and
4367    utf-16-le.  */
4368
4369 static int
4370 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4371      unsigned char *source;
4372      int src_bytes, *skip, big_endian_p;
4373 {
4374   unsigned char *src = source, *src_end = src + src_bytes;
4375   unsigned int c1, c2;
4376   int total = 0;                /* How many end-of-lines are found so far.  */
4377   int eol_type = CODING_EOL_UNDECIDED;
4378   int this_eol_type;
4379   int msb, lsb;
4380
4381   if (big_endian_p)
4382     msb = 0, lsb = 1;
4383   else
4384     msb = 1, lsb = 0;
4385
4386   *skip = 0;
4387
4388   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4389     {
4390       c1 = (src[msb] << 8) | (src[lsb]);
4391       src += 2;
4392
4393       if (c1 == '\n' || c1 == '\r')
4394         {
4395           if (*skip == 0)
4396             *skip = src - 2 - source;
4397           total++;
4398           if (c1 == '\n')
4399             {
4400               this_eol_type = CODING_EOL_LF;
4401             }
4402           else
4403             {
4404               if ((src + 1) >= src_end)
4405                 {
4406                   this_eol_type = CODING_EOL_CR;
4407                 }
4408               else
4409                 {
4410                   c2 = (src[msb] << 8) | (src[lsb]);
4411                   if (c2 == '\n')
4412                     this_eol_type = CODING_EOL_CRLF, src += 2;
4413                   else
4414                     this_eol_type = CODING_EOL_CR;
4415                 }
4416             }
4417
4418           if (eol_type == CODING_EOL_UNDECIDED)
4419             /* This is the first end-of-line.  */
4420             eol_type = this_eol_type;
4421           else if (eol_type != this_eol_type)
4422             {
4423               /* The found type is different from what found before.  */
4424               eol_type = CODING_EOL_INCONSISTENT;
4425               break;
4426             }
4427         }
4428     }
4429
4430   if (*skip == 0)
4431     *skip = src_end - source;
4432   return eol_type;
4433 }
4434
4435 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4436    is encoded.  If it detects an appropriate format of end-of-line, it
4437    sets the information in *CODING.  */
4438
4439 void
4440 detect_eol (coding, src, src_bytes)
4441      struct coding_system *coding;
4442      const unsigned char *src;
4443      int src_bytes;
4444 {
4445   Lisp_Object val;
4446   int skip;
4447   int eol_type;
4448
4449   switch (coding->category_idx)
4450     {
4451     case CODING_CATEGORY_IDX_UTF_16_BE:
4452       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4453       break;
4454     case CODING_CATEGORY_IDX_UTF_16_LE:
4455       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4456       break;
4457     default:
4458       eol_type = detect_eol_type (src, src_bytes, &skip);
4459       break;
4460     }
4461
4462   if (coding->heading_ascii > skip)
4463     coding->heading_ascii = skip;
4464   else
4465     skip = coding->heading_ascii;
4466
4467   if (eol_type == CODING_EOL_UNDECIDED)
4468     return;
4469   if (eol_type == CODING_EOL_INCONSISTENT)
4470     {
4471 #if 0
4472       /* This code is suppressed until we find a better way to
4473          distinguish raw text file and binary file.  */
4474
4475       /* If we have already detected that the coding is raw-text, the
4476          coding should actually be no-conversion.  */
4477       if (coding->type == coding_type_raw_text)
4478         {
4479           setup_coding_system (Qno_conversion, coding);
4480           return;
4481         }
4482       /* Else, let's decode only text code anyway.  */
4483 #endif /* 0 */
4484       eol_type = CODING_EOL_LF;
4485     }
4486
4487   val = Fget (coding->symbol, Qeol_type);
4488   if (VECTORP (val) && XVECTOR (val)->size == 3)
4489     {
4490       int src_multibyte = coding->src_multibyte;
4491       int dst_multibyte = coding->dst_multibyte;
4492       struct composition_data *cmp_data = coding->cmp_data;
4493
4494       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4495       coding->src_multibyte = src_multibyte;
4496       coding->dst_multibyte = dst_multibyte;
4497       coding->heading_ascii = skip;
4498       coding->cmp_data = cmp_data;
4499     }
4500 }
4501
/* Slack (bytes) added to every conversion buffer size estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which decoding by CODING (a pointer to struct
   coding_system) may grow the text: 3 for ISO2022, the decoder's
   own buf_magnification for CCL, 2 otherwise.  CODING is evaluated
   more than once.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4510
4511 /* Return maximum size (bytes) of a buffer enough for decoding
4512    SRC_BYTES of text encoded in CODING.  */
4513
4514 int
4515 decoding_buffer_size (coding, src_bytes)
4516      struct coding_system *coding;
4517      int src_bytes;
4518 {
4519   return (src_bytes * DECODING_BUFFER_MAG (coding)
4520           + CONVERSION_BUFFER_EXTRA_ROOM);
4521 }
4522
4523 /* Return maximum size (bytes) of a buffer enough for encoding
4524    SRC_BYTES of text to CODING.  */
4525
4526 int
4527 encoding_buffer_size (coding, src_bytes)
4528      struct coding_system *coding;
4529      int src_bytes;
4530 {
4531   int magnification;
4532
4533   if (coding->type == coding_type_ccl)
4534     {
4535       magnification = coding->spec.ccl.encoder.buf_magnification;
4536       if (coding->eol_type == CODING_EOL_CRLF)
4537         magnification *= 2;
4538     }
4539   else if (CODING_REQUIRE_ENCODING (coding))
4540     magnification = 3;
4541   else
4542     magnification = 1;
4543
4544   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4545 }
4546
/* Working buffer for code conversion.  It is filled in by
   allocate_conversion_buffer, doubled by extend_conversion_buffer,
   and released by free_conversion_buffer.  */
struct conversion_buffer
{
  int size;                     /* size of data.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;          /* the memory itself; on the heap
                                   (xmalloc) unless on_stack is 1.  */
};
4554
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer):
   by alloca if LEN is less than MAX_ALLOCA, by xmalloc otherwise.
   BUF.on_stack records which one was used and BUF.size is set to
   LEN.  Both arguments are evaluated more than once, so they must
   not have side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4570
4571 /* Double the allocated memory for *BUF.  */
4572 static void
4573 extend_conversion_buffer (buf)
4574      struct conversion_buffer *buf;
4575 {
4576   if (buf->on_stack)
4577     {
4578       unsigned char *save = buf->data;
4579       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4580       bcopy (save, buf->data, buf->size);
4581       buf->on_stack = 0;
4582     }
4583   else
4584     {
4585       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4586     }
4587   buf->size *= 2;
4588 }
4589
4590 /* Free the allocated memory for BUF if it is not on stack.  */
4591 static void
4592 free_conversion_buffer (buf)
4593      struct conversion_buffer *buf;
4594 {
4595   if (!buf->on_stack)
4596     xfree (buf->data);
4597 }
4598
4599 int
4600 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4601      struct coding_system *coding;
4602      unsigned char *source, *destination;
4603      int src_bytes, dst_bytes, encodep;
4604 {
4605   struct ccl_program *ccl
4606     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4607   unsigned char *dst = destination;
4608
4609   ccl->suppress_error = coding->suppress_error;
4610   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4611   if (encodep)
4612     {
4613       /* On encoding, EOL format is converted within ccl_driver.  For
4614          that, setup proper information in the structure CCL.  */
4615       ccl->eol_type = coding->eol_type;
4616       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4617         ccl->eol_type = CODING_EOL_LF;
4618       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4619       ccl->eight_bit_control = coding->dst_multibyte;
4620     }
4621   else
4622     ccl->eight_bit_control = 1;
4623   ccl->multibyte = coding->src_multibyte;
4624   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4625     {
4626       /* Move carryover bytes to DESTINATION.  */
4627       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4628       while (*p)
4629         *dst++ = *p++;
4630       coding->spec.ccl.eight_bit_carryover[0] = 0;
4631       if (dst_bytes)
4632         dst_bytes -= dst - destination;
4633     }
4634
4635   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4636                                   &(coding->consumed))
4637                       + dst - destination);
4638
4639   if (encodep)
4640     {
4641       coding->produced_char = coding->produced;
4642       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4643     }
4644   else if (!ccl->eight_bit_control)
4645     {
4646       /* The produced bytes forms a valid multibyte sequence. */
4647       coding->produced_char
4648         = multibyte_chars_in_text (destination, coding->produced);
4649       coding->spec.ccl.eight_bit_carryover[0] = 0;
4650     }
4651   else
4652     {
4653       /* On decoding, the destination should always multibyte.  But,
4654          CCL program might have been generated an invalid multibyte
4655          sequence.  Here we make such a sequence valid as
4656          multibyte.  */
4657       int bytes
4658         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4659
4660       if ((coding->consumed < src_bytes
4661            || !ccl->last_block)
4662           && coding->produced >= 1
4663           && destination[coding->produced - 1] >= 0x80)
4664         {
4665           /* We should not convert the tailing 8-bit codes to
4666              multibyte form even if they doesn't form a valid
4667              multibyte sequence.  They may form a valid sequence in
4668              the next call.  */
4669           int carryover = 0;
4670
4671           if (destination[coding->produced - 1] < 0xA0)
4672             carryover = 1;
4673           else if (coding->produced >= 2)
4674             {
4675               if (destination[coding->produced - 2] >= 0x80)
4676                 {
4677                   if (destination[coding->produced - 2] < 0xA0)
4678                     carryover = 2;
4679                   else if (coding->produced >= 3
4680                            && destination[coding->produced - 3] >= 0x80
4681                            && destination[coding->produced - 3] < 0xA0)
4682                     carryover = 3;
4683                 }
4684             }
4685           if (carryover > 0)
4686             {
4687               BCOPY_SHORT (destination + coding->produced - carryover,
4688                            coding->spec.ccl.eight_bit_carryover,
4689                            carryover);
4690               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4691               coding->produced -= carryover;
4692             }
4693         }
4694       coding->produced = str_as_multibyte (destination, bytes,
4695                                            coding->produced,
4696                                            &(coding->produced_char));
4697     }
4698
4699   switch (ccl->status)
4700     {
4701     case CCL_STAT_SUSPEND_BY_SRC:
4702       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4703       break;
4704     case CCL_STAT_SUSPEND_BY_DST:
4705       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4706       break;
4707     case CCL_STAT_QUIT:
4708     case CCL_STAT_INVALID_CMD:
4709       coding->result = CODING_FINISH_INTERRUPT;
4710       break;
4711     default:
4712       coding->result = CODING_FINISH_NORMAL;
4713       break;
4714     }
4715   return coding->result;
4716 }
4717
4718 /* Decode EOL format of the text at PTR of BYTES length destructively
4719    according to CODING->eol_type.  This is called after the CCL
4720    program produced a decoded text at PTR.  If we do CRLF->LF
4721    conversion, update CODING->produced and CODING->produced_char.  */
4722
4723 static void
4724 decode_eol_post_ccl (coding, ptr, bytes)
4725      struct coding_system *coding;
4726      unsigned char *ptr;
4727      int bytes;
4728 {
4729   Lisp_Object val, saved_coding_symbol;
4730   unsigned char *pend = ptr + bytes;
4731   int dummy;
4732
4733   /* Remember the current coding system symbol.  We set it back when
4734      an inconsistent EOL is found so that `last-coding-system-used' is
4735      set to the coding system that doesn't specify EOL conversion.  */
4736   saved_coding_symbol = coding->symbol;
4737
4738   coding->spec.ccl.cr_carryover = 0;
4739   if (coding->eol_type == CODING_EOL_UNDECIDED)
4740     {
4741       /* Here, to avoid the call of setup_coding_system, we directly
4742          call detect_eol_type.  */
4743       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4744       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4745         coding->eol_type = CODING_EOL_LF;
4746       if (coding->eol_type != CODING_EOL_UNDECIDED)
4747         {
4748           val = Fget (coding->symbol, Qeol_type);
4749           if (VECTORP (val) && XVECTOR (val)->size == 3)
4750             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4751         }
4752       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4753     }
4754
4755   if (coding->eol_type == CODING_EOL_LF
4756       || coding->eol_type == CODING_EOL_UNDECIDED)
4757     {
4758       /* We have nothing to do.  */
4759       ptr = pend;
4760     }
4761   else if (coding->eol_type == CODING_EOL_CRLF)
4762     {
4763       unsigned char *pstart = ptr, *p = ptr;
4764
4765       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4766           && *(pend - 1) == '\r')
4767         {
4768           /* If the last character is CR, we can't handle it here
4769              because LF will be in the not-yet-decoded source text.
4770              Record that the CR is not yet processed.  */
4771           coding->spec.ccl.cr_carryover = 1;
4772           coding->produced--;
4773           coding->produced_char--;
4774           pend--;
4775         }
4776       while (ptr < pend)
4777         {
4778           if (*ptr == '\r')
4779             {
4780               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4781                 {
4782                   *p++ = '\n';
4783                   ptr += 2;
4784                 }
4785               else
4786                 {
4787                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4788                     goto undo_eol_conversion;
4789                   *p++ = *ptr++;
4790                 }
4791             }
4792           else if (*ptr == '\n'
4793                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4794             goto undo_eol_conversion;
4795           else
4796             *p++ = *ptr++;
4797           continue;
4798
4799         undo_eol_conversion:
4800           /* We have faced with inconsistent EOL format at PTR.
4801              Convert all LFs before PTR back to CRLFs.  */
4802           for (p--, ptr--; p >= pstart; p--)
4803             {
4804               if (*p == '\n')
4805                 *ptr-- = '\n', *ptr-- = '\r';
4806               else
4807                 *ptr-- = *p;
4808             }
4809           /*  If carryover is recorded, cancel it because we don't
4810               convert CRLF anymore.  */
4811           if (coding->spec.ccl.cr_carryover)
4812             {
4813               coding->spec.ccl.cr_carryover = 0;
4814               coding->produced++;
4815               coding->produced_char++;
4816               pend++;
4817             }
4818           p = ptr = pend;
4819           coding->eol_type = CODING_EOL_LF;
4820           coding->symbol = saved_coding_symbol;
4821         }
4822       if (p < pend)
4823         {
4824           /* As each two-byte sequence CRLF was converted to LF, (PEND
4825              - P) is the number of deleted characters.  */
4826           coding->produced -= pend - p;
4827           coding->produced_char -= pend - p;
4828         }
4829     }
4830   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4831     {
4832       unsigned char *p = ptr;
4833
4834       for (; ptr < pend; ptr++)
4835         {
4836           if (*ptr == '\r')
4837             *ptr = '\n';
4838           else if (*ptr == '\n'
4839                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4840             {
4841               for (; p < ptr; p++)
4842                 {
4843                   if (*p == '\n')
4844                     *p = '\r';
4845                 }
4846               ptr = pend;
4847               coding->eol_type = CODING_EOL_LF;
4848               coding->symbol = saved_coding_symbol;
4849             }
4850         }
4851     }
4852 }
4853
4854 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4855    decoding, it may detect coding system and format of end-of-line if
4856    those are not yet decided.  The source should be unibyte, the
4857    result is multibyte if CODING->dst_multibyte is nonzero, else
4858    unibyte.  */
4859
4860 int
4861 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4862      struct coding_system *coding;
4863      const unsigned char *source;
4864      unsigned char *destination;
4865      int src_bytes, dst_bytes;
4866 {
4867   int extra = 0;
4868
4869   if (coding->type == coding_type_undecided)
4870     detect_coding (coding, source, src_bytes);
4871
4872   if (coding->eol_type == CODING_EOL_UNDECIDED
4873       && coding->type != coding_type_ccl)
4874     {
4875       detect_eol (coding, source, src_bytes);
4876       /* We had better recover the original eol format if we
4877          encounter an inconsistent eol format while decoding.  */
4878       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4879     }
4880
4881   coding->produced = coding->produced_char = 0;
4882   coding->consumed = coding->consumed_char = 0;
4883   coding->errors = 0;
4884   coding->result = CODING_FINISH_NORMAL;
4885
4886   switch (coding->type)
4887     {
4888     case coding_type_sjis:
4889       decode_coding_sjis_big5 (coding, source, destination,
4890                                src_bytes, dst_bytes, 1);
4891       break;
4892
4893     case coding_type_iso2022:
4894       decode_coding_iso2022 (coding, source, destination,
4895                              src_bytes, dst_bytes);
4896       break;
4897
4898     case coding_type_big5:
4899       decode_coding_sjis_big5 (coding, source, destination,
4900                                src_bytes, dst_bytes, 0);
4901       break;
4902
4903     case coding_type_emacs_mule:
4904       decode_coding_emacs_mule (coding, source, destination,
4905                                 src_bytes, dst_bytes);
4906       break;
4907
4908     case coding_type_ccl:
4909       if (coding->spec.ccl.cr_carryover)
4910         {
4911           /* Put the CR which was not processed by the previous call
4912              of decode_eol_post_ccl in DESTINATION.  It will be
4913              decoded together with the following LF by the call to
4914              decode_eol_post_ccl below.  */
4915           *destination = '\r';
4916           coding->produced++;
4917           coding->produced_char++;
4918           dst_bytes--;
4919           extra = coding->spec.ccl.cr_carryover;
4920         }
4921       ccl_coding_driver (coding, source, destination + extra,
4922                          src_bytes, dst_bytes, 0);
4923       if (coding->eol_type != CODING_EOL_LF)
4924         {
4925           coding->produced += extra;
4926           coding->produced_char += extra;
4927           decode_eol_post_ccl (coding, destination, coding->produced);
4928         }
4929       break;
4930
4931     default:
4932       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4933     }
4934
4935   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4936       && coding->mode & CODING_MODE_LAST_BLOCK
4937       && coding->consumed == src_bytes)
4938     coding->result = CODING_FINISH_NORMAL;
4939
4940   if (coding->mode & CODING_MODE_LAST_BLOCK
4941       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4942     {
4943       const unsigned char *src = source + coding->consumed;
4944       unsigned char *dst = destination + coding->produced;
4945
4946       src_bytes -= coding->consumed;
4947       coding->errors++;
4948       if (COMPOSING_P (coding))
4949         DECODE_COMPOSITION_END ('1');
4950       while (src_bytes--)
4951         {
4952           int c = *src++;
4953           dst += CHAR_STRING (c, dst);
4954           coding->produced_char++;
4955         }
4956       coding->consumed = coding->consumed_char = src - source;
4957       coding->produced = dst - destination;
4958       coding->result = CODING_FINISH_NORMAL;
4959     }
4960
4961   if (!coding->dst_multibyte)
4962     {
4963       coding->produced = str_as_unibyte (destination, coding->produced);
4964       coding->produced_char = coding->produced;
4965     }
4966
4967   return coding->result;
4968 }
4969
4970 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4971    multibyteness of the source is CODING->src_multibyte, the
4972    multibyteness of the result is always unibyte.  */
4973
4974 int
4975 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4976      struct coding_system *coding;
4977      const unsigned char *source;
4978      unsigned char *destination;
4979      int src_bytes, dst_bytes;
4980 {
4981   coding->produced = coding->produced_char = 0;
4982   coding->consumed = coding->consumed_char = 0;
4983   coding->errors = 0;
4984   coding->result = CODING_FINISH_NORMAL;
4985
4986   switch (coding->type)
4987     {
4988     case coding_type_sjis:
4989       encode_coding_sjis_big5 (coding, source, destination,
4990                                src_bytes, dst_bytes, 1);
4991       break;
4992
4993     case coding_type_iso2022:
4994       encode_coding_iso2022 (coding, source, destination,
4995                              src_bytes, dst_bytes);
4996       break;
4997
4998     case coding_type_big5:
4999       encode_coding_sjis_big5 (coding, source, destination,
5000                                src_bytes, dst_bytes, 0);
5001       break;
5002
5003     case coding_type_emacs_mule:
5004       encode_coding_emacs_mule (coding, source, destination,
5005                                 src_bytes, dst_bytes);
5006       break;
5007
5008     case coding_type_ccl:
5009       ccl_coding_driver (coding, source, destination,
5010                          src_bytes, dst_bytes, 1);
5011       break;
5012
5013     default:
5014       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5015     }
5016
5017   if (coding->mode & CODING_MODE_LAST_BLOCK
5018       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5019     {
5020       const unsigned char *src = source + coding->consumed;
5021       unsigned char *dst = destination + coding->produced;
5022
5023       if (coding->type == coding_type_iso2022)
5024         ENCODE_RESET_PLANE_AND_REGISTER;
5025       if (COMPOSING_P (coding))
5026         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5027       if (coding->consumed < src_bytes)
5028         {
5029           int len = src_bytes - coding->consumed;
5030
5031           BCOPY_SHORT (src, dst, len);
5032           if (coding->src_multibyte)
5033             len = str_as_unibyte (dst, len);
5034           dst += len;
5035           coding->consumed = src_bytes;
5036         }
5037       coding->produced = coding->produced_char = dst - destination;
5038       coding->result = CODING_FINISH_NORMAL;
5039     }
5040
5041   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5042       && coding->consumed == src_bytes)
5043     coding->result = CODING_FINISH_NORMAL;
5044
5045   return coding->result;
5046 }
5047
5048 /* Scan text in the region between *BEG and *END (byte positions),
5049    skip characters which we don't have to decode by coding system
5050    CODING at the head and tail, then set *BEG and *END to the region
5051    of the text we actually have to convert.  The caller should move
5052    the gap out of the region in advance if the region is from a
5053    buffer.
5054
5055    If STR is not NULL, *BEG and *END are indices into STR.  */
5056
5057 static void
5058 shrink_decoding_region (beg, end, coding, str)
5059      int *beg, *end;
5060      struct coding_system *coding;
5061      unsigned char *str;
5062 {
5063   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5064   int eol_conversion;
5065   Lisp_Object translation_table;
5066
5067   if (coding->type == coding_type_ccl
5068       || coding->type == coding_type_undecided
5069       || coding->eol_type != CODING_EOL_LF
5070       || !NILP (coding->post_read_conversion)
5071       || coding->composing != COMPOSITION_DISABLED)
5072     {
5073       /* We can't skip any data.  */
5074       return;
5075     }
5076   if (coding->type == coding_type_no_conversion
5077       || coding->type == coding_type_raw_text
5078       || coding->type == coding_type_emacs_mule)
5079     {
5080       /* We need no conversion, but don't have to skip any data here.
5081          Decoding routine handles them effectively anyway.  */
5082       return;
5083     }
5084
5085   translation_table = coding->translation_table_for_decode;
5086   if (NILP (translation_table) && !NILP (Venable_character_translation))
5087     translation_table = Vstandard_translation_table_for_decode;
5088   if (CHAR_TABLE_P (translation_table))
5089     {
5090       int i;
5091       for (i = 0; i < 128; i++)
5092         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5093           break;
5094       if (i < 128)
5095         /* Some ASCII character should be translated.  We give up
5096            shrinking.  */
5097         return;
5098     }
5099
5100   if (coding->heading_ascii >= 0)
5101     /* Detection routine has already found how much we can skip at the
5102        head.  */
5103     *beg += coding->heading_ascii;
5104
5105   if (str)
5106     {
5107       begp_orig = begp = str + *beg;
5108       endp_orig = endp = str + *end;
5109     }
5110   else
5111     {
5112       begp_orig = begp = BYTE_POS_ADDR (*beg);
5113       endp_orig = endp = begp + *end - *beg;
5114     }
5115
5116   eol_conversion = (coding->eol_type == CODING_EOL_CR
5117                     || coding->eol_type == CODING_EOL_CRLF);
5118
5119   switch (coding->type)
5120     {
5121     case coding_type_sjis:
5122     case coding_type_big5:
5123       /* We can skip all ASCII characters at the head.  */
5124       if (coding->heading_ascii < 0)
5125         {
5126           if (eol_conversion)
5127             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5128           else
5129             while (begp < endp && *begp < 0x80) begp++;
5130         }
5131       /* We can skip all ASCII characters at the tail except for the
5132          second byte of SJIS or BIG5 code.  */
5133       if (eol_conversion)
5134         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5135       else
5136         while (begp < endp && endp[-1] < 0x80) endp--;
5137       /* Do not consider LF as ascii if preceded by CR, since that
5138          confuses eol decoding. */
5139       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5140         endp++;
5141       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5142         endp++;
5143       break;
5144
5145     case coding_type_iso2022:
5146       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5147         /* We can't skip any data.  */
5148         break;
5149       if (coding->heading_ascii < 0)
5150         {
5151           /* We can skip all ASCII characters at the head except for a
5152              few control codes.  */
5153           while (begp < endp && (c = *begp) < 0x80
5154                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5155                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5156                  && (!eol_conversion || c != ISO_CODE_LF))
5157             begp++;
5158         }
5159       switch (coding->category_idx)
5160         {
5161         case CODING_CATEGORY_IDX_ISO_8_1:
5162         case CODING_CATEGORY_IDX_ISO_8_2:
5163           /* We can skip all ASCII characters at the tail.  */
5164           if (eol_conversion)
5165             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5166           else
5167             while (begp < endp && endp[-1] < 0x80) endp--;
5168           /* Do not consider LF as ascii if preceded by CR, since that
5169              confuses eol decoding. */
5170           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5171             endp++;
5172           break;
5173
5174         case CODING_CATEGORY_IDX_ISO_7:
5175         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5176           {
5177             /* We can skip all characters at the tail except for 8-bit
5178                codes and ESC and the following 2-byte at the tail.  */
5179             unsigned char *eight_bit = NULL;
5180
5181             if (eol_conversion)
5182               while (begp < endp
5183                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5184                 {
5185                   if (!eight_bit && c & 0x80) eight_bit = endp;
5186                   endp--;
5187                 }
5188             else
5189               while (begp < endp
5190                      && (c = endp[-1]) != ISO_CODE_ESC)
5191                 {
5192                   if (!eight_bit && c & 0x80) eight_bit = endp;
5193                   endp--;
5194                 }
5195             /* Do not consider LF as ascii if preceded by CR, since that
5196                confuses eol decoding. */
5197             if (begp < endp && endp < endp_orig
5198                 && endp[-1] == '\r' && endp[0] == '\n')
5199               endp++;
5200             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5201               {
5202                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5203                   /* This is an ASCII designation sequence.  We can
5204                      surely skip the tail.  But, if we have
5205                      encountered an 8-bit code, skip only the codes
5206                      after that.  */
5207                   endp = eight_bit ? eight_bit : endp + 2;
5208                 else
5209                   /* Hmmm, we can't skip the tail.  */
5210                   endp = endp_orig;
5211               }
5212             else if (eight_bit)
5213               endp = eight_bit;
5214           }
5215         }
5216       break;
5217
5218     default:
5219       abort ();
5220     }
5221   *beg += begp - begp_orig;
5222   *end += endp - endp_orig;
5223   return;
5224 }
5225
5226 /* Like shrink_decoding_region but for encoding.  Skip the ASCII bytes
5227    at the head and tail of the region that need no conversion, and
5228    update *BEG and *END accordingly.  */
5229
5230 static void
5231 shrink_encoding_region (beg, end, coding, str)
5232      int *beg, *end;
5233      struct coding_system *coding;
5234      unsigned char *str;
5235 {
5236   unsigned char *head_orig, *head, *tail_orig, *tail;
5237   int eol_conversion;
5238   Lisp_Object table;
5239
5240   /* We can't skip any data for a CCL program, for CR or CRLF eol
5241      conversion, or when composition data has been recorded.  */
5242   if (coding->type == coding_type_ccl)
5243     return;
5244   if (coding->eol_type == CODING_EOL_CRLF || coding->eol_type == CODING_EOL_CR)
5245     return;
5246   if (coding->cmp_data && coding->cmp_data->used > 0)
5247     return;
5248
5249   /* These need no conversion, but we don't have to skip any data
5250      here.  Encoding routine handles them effectively anyway.  */
5251   switch (coding->type)
5252     {
5253     case coding_type_no_conversion:
5254     case coding_type_raw_text:
5255     case coding_type_emacs_mule:
5256     case coding_type_undecided:
5257       return;
5258     default:
5259       break;
5260     }
5261
5262   table = coding->translation_table_for_encode;
5263   if (NILP (table) && !NILP (Venable_character_translation))
5264     table = Vstandard_translation_table_for_encode;
5265   if (CHAR_TABLE_P (table))
5266     {
5267       /* If some ASCII character should be translated, give up.  */
5268       int ch = 0;
5269       while (ch < 128 && NILP (CHAR_TABLE_REF (table, ch)))
5270         ch++;
5271       if (ch < 128)
5272         return;
5273     }
5274
5275   if (str)
5276     {
5277       head_orig = str + *beg;
5278       tail_orig = str + *end;
5279     }
5280   else
5281     {
5282       head_orig = BYTE_POS_ADDR (*beg);
5283       tail_orig = head_orig + *end - *beg;
5284     }
5285   head = head_orig;
5286   tail = tail_orig;
5287
5288   eol_conversion = (coding->eol_type == CODING_EOL_CR
5289                     || coding->eol_type == CODING_EOL_CRLF);
5290
5291   /* Here, we don't have to check coding->pre_write_conversion because
5292      the caller is expected to have handled it already.  */
5293   switch (coding->type)
5294     {
5295     case coding_type_iso2022:
5296       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5297         break;
5298       /* fall down ... */
5299     case coding_type_sjis:
5300     case coding_type_big5:
5301       if (coding->type == coding_type_iso2022
5302           && (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL))
5303         {
5304           /* Skip leading ASCII only up to the last beginning of line.  */
5305           unsigned char *line_start = head;
5306
5307           for (; head < tail && *head < 0x80; head++)
5308             if (*head == '\n')
5309               line_start = head + 1;
5310           head = line_start;
5311         }
5312       else
5313         /* We can skip all ASCII characters at the head ...  */
5314         while (head < tail && *head < 0x80
5315                && !(eol_conversion && *head == '\n'))
5316           head++;
5317       /* ... and at the tail.  */
5318       while (head < tail && tail[-1] < 0x80
5319              && !(eol_conversion && tail[-1] == '\n'))
5320         tail--;
5321       break;
5322     default:
5323       abort ();
5324     }
5325
5326   *beg += head - head_orig;
5327   *end += tail - tail_orig;
5328 }
5329
5330 /* As shrinking conversion region requires some overhead, the macro
5331    below tries shrinking only if the length of conversion region
5332    (*END - *BEG) is greater than this value.  */
5333 static int shrink_conversion_region_threshhold = 1024;
5334
5335 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
5336   do {                                                                  \
5337     if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
5338       {                                                                 \
5339         if (encodep) shrink_encoding_region (beg, end, coding, str);    \
5340         else shrink_decoding_region (beg, end, coding, str);            \
5341       }                                                                 \
5342   } while (0)
5343
5344 static Lisp_Object
5345 code_convert_region_unwind (arg)
5346      Lisp_Object arg;           /* Value to restore.  */
5347 {
5348   Vlast_coding_system_used = arg;
5349   inhibit_pre_post_conversion = 0;      /* Allow conversion hooks again.  */
5350   return Qnil;
5351 }
5352
5353 /* Record all compositions found in the range FROM..TO of OBJ in the
5354    memory blocks pointed to by CODING->cmp_data.  OBJ is a buffer or
5355    a string, defaulting to the current buffer.  */
5356
5357 void
5358 coding_save_composition (coding, from, to, obj)
5359      struct coding_system *coding;
5360      int from, to;
5361      Lisp_Object obj;
5362 {
5363   Lisp_Object prop;
5364   int start, end;
5365
5366   if (coding->composing == COMPOSITION_DISABLED)
5367     return;
5368   if (!coding->cmp_data)
5369     coding_allocate_composition_data (coding, from);
5370   /* Find the first composition that ends within the range.  If it
5371      starts before FROM, skip it and look for the next one.  */
5372   if (!find_composition (from, to, &start, &end, &prop, obj) || end > to)
5373     return;
5374   if (start < from)
5375     if (!find_composition (end, to, &start, &end, &prop, obj) || end > to)
5376       return;
5377
5378   coding->composing = COMPOSITION_NO;
5379   for (;;)
5380     {
5381       if (COMPOSITION_VALID_P (start, end, prop))
5382         {
5383           enum composition_method method = COMPOSITION_METHOD (prop);
5384           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5385               >= COMPOSITION_DATA_SIZE)
5386             coding_allocate_composition_data (coding, from);
5387           /* A relative composition needs only its start and end
5388              positions; any other method needs its components too.  */
5389           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5390           if (method != COMPOSITION_RELATIVE)
5391             {
5392               Lisp_Object val = COMPOSITION_COMPONENTS (prop), ch;
5393               if (CONSP (val))
5394                 do
5395                   {
5396                     ch = XCAR (val), val = XCDR (val);
5397                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5398                   }
5399                 while (CONSP (val));
5400               else if (VECTORP (val) || STRINGP (val))
5401                 {
5402                   int i, len;
5403                   len = VECTORP (val) ? XVECTOR (val)->size : SCHARS (val);
5404                   for (i = 0; i < len; i++)
5405                     {
5406                       if (STRINGP (val))
5407                         ch = Faref (val, make_number (i));
5408                       else
5409                         ch = XVECTOR (val)->contents[i];
5410                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5411                     }
5412                 }
5413               else              /* INTEGERP (val) */
5414                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5415             }
5416           CODING_ADD_COMPOSITION_END (coding, end - from);
5417         }
5418       /* Go on to the next composition that ends within the range.  */
5419       start = end;
5420       if (start >= to
5421           || !find_composition (start, to, &start, &end, &prop, obj)
5422           || end > to)
5423         break;
5424     }
5425
5426   /* Make coding->cmp_data point to the first memory block.  */
5427   while (coding->cmp_data->prev)
5428     coding->cmp_data = coding->cmp_data->prev;
5429   coding->cmp_data_start = 0;
5430 }
5431
5432 /* Reflect the saved information about compositions to OBJ.
5433    CODING->cmp_data points to a memory block for the information.  OBJ
5434    is a buffer or a string, defaults to the current buffer.  */
5435
5436 void
5437 coding_restore_composition (coding, obj)
5438      struct coding_system *coding;
5439      Lisp_Object obj;
5440 {
5441   struct composition_data *cmp_data = coding->cmp_data;
5442
5443   if (!cmp_data)
5444     return;
5445
5446   while (cmp_data->prev)
5447     cmp_data = cmp_data->prev;
5448
5449   while (cmp_data)
5450     {
5451       int i;
5452
5453       /* A record is DATA[0] ints: a 4-int header, then components.  */
5454       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5455            i += cmp_data->data[i])
5456         {
5457           int *data = cmp_data->data + i;
5458           enum composition_method method;
5459           Lisp_Object components;
5460
5461           if (data[0] < 4 || i + data[0] > cmp_data->used)
5462             /* Invalid composition data.  */
5463             break;
5464           method = (enum composition_method) data[3];
5465           if (method == COMPOSITION_RELATIVE)
5466             components = Qnil;
5467           else
5468             {
5469               int len = data[0] - 4, j;
5470               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5471
5472               if (method == COMPOSITION_WITH_RULE_ALTCHARS && len % 2 == 0)
5473                 len --;
5474               if (len < 1 || len > (int) (sizeof args / sizeof args[0]))
5475                 /* Invalid composition data (too long would overrun ARGS).  */
5476                 break;
5477               for (j = 0; j < len; j++)
5478                 args[j] = make_number (data[4 + j]);
5479               components = (method == COMPOSITION_WITH_ALTCHARS
5480                             ? Fstring (len, args)
5481                             : Fvector (len, args));
5482             }
5483           compose_text (data[1], data[2], components, Qnil, obj);
5484         }
5485       cmp_data = cmp_data->next;
5486     }
5487 }
5488
5489 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5490    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5491    coding system CODING, and return the status code of code conversion
5492    (currently always 0, so this value has no meaning).
5493
5494    How many characters (and bytes) are converted to how many
5495    characters (and bytes) are recorded in members of the structure
5496    CODING.
5497
5498    If REPLACE is nonzero, we do various things as if the original text
5499    is deleted and a new text is inserted.  See the comments in
5500    replace_range (insdel.c) to know what we are doing.
5501
5502    If REPLACE is zero, it is assumed that the source text is unibyte.
5503    Otherwise, it is assumed that the source text is multibyte.  */
5504
5505 int
5506 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5507      int from, from_byte, to, to_byte, encodep, replace;
5508      struct coding_system *coding;
5509 {
5510   int len = to - from, len_byte = to_byte - from_byte;
5511   int nchars_del = 0, nbytes_del = 0;
5512   int require, inserted, inserted_byte;
5513   int head_skip, tail_skip, total_skip = 0;
5514   Lisp_Object saved_coding_symbol;
5515   int first = 1;
5516   unsigned char *src, *dst;
5517   Lisp_Object deletion;
5518   int orig_point = PT, orig_len = len;
5519   int prev_Z;
5520   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5521
5522   deletion = Qnil;
5523   saved_coding_symbol = coding->symbol;
5524
5525   if (from < PT && PT < to)
5526     {
5527       TEMP_SET_PT_BOTH (from, from_byte);
5528       orig_point = from;
5529     }
5530
5531   if (replace)
5532     {
5533       int saved_from = from;
5534       int saved_inhibit_modification_hooks;
5535
5536       prepare_to_modify_buffer (from, to, &from);
5537       if (saved_from != from)
5538         {
5539           to = from + len;
5540           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5541           len_byte = to_byte - from_byte;
5542         }
5543
5544       /* The code conversion routine can not preserve text properties
5545          for now.  So, we must remove all text properties in the
5546          region.  Here, we must suppress all modification hooks.  */
5547       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5548       inhibit_modification_hooks = 1;
5549       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5550       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5551     }
5552
5553   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5554     {
5555       /* We must detect encoding of text and eol format.  */
5556
5557       if (from < GPT && to > GPT)
5558         move_gap_both (from, from_byte);
5559       if (coding->type == coding_type_undecided)
5560         {
5561           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5562           if (coding->type == coding_type_undecided)
5563             {
5564               /* It seems that the text contains only ASCII, but we
5565                  should not leave it undecided because the deeper
5566                  decoding routine (decode_coding) tries to detect the
5567                  encodings again in vain.  */
5568               coding->type = coding_type_emacs_mule;
5569               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5570               /* As emacs-mule decoder will handle composition, we
5571                  need this setting to allocate coding->cmp_data
5572                  later.  */
5573               coding->composing = COMPOSITION_NO;
5574             }
5575         }
5576       if (coding->eol_type == CODING_EOL_UNDECIDED
5577           && coding->type != coding_type_ccl)
5578         {
5579           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5580           if (coding->eol_type == CODING_EOL_UNDECIDED)
5581             coding->eol_type = CODING_EOL_LF;
5582           /* We had better recover the original eol format if we
5583              encounter an inconsistent eol format while decoding.  */
5584           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5585         }
5586     }
5587
5588   /* Now we convert the text.  */
5589
5590   /* For encoding, we must process pre-write-conversion in advance.  */
5591   if (! inhibit_pre_post_conversion
5592       && encodep
5593       && SYMBOLP (coding->pre_write_conversion)
5594       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5595     {
5596       /* The function in pre-write-conversion may put a new text in a
5597          new buffer.  */
5598       struct buffer *prev = current_buffer;
5599       Lisp_Object new;
5600
5601       record_unwind_protect (code_convert_region_unwind,
5602                              Vlast_coding_system_used);
5603       /* We should not call any more pre-write/post-read-conversion
5604          functions while this pre-write-conversion is running.  */
5605       inhibit_pre_post_conversion = 1;
5606       call2 (coding->pre_write_conversion,
5607              make_number (from), make_number (to));
5608       inhibit_pre_post_conversion = 0;
5609       /* Discard the unwind protect.  */
5610       specpdl_ptr--;
5611
5612       if (current_buffer != prev)
5613         {
5614           len = ZV - BEGV;
5615           new = Fcurrent_buffer ();
5616           set_buffer_internal_1 (prev);
5617           del_range_2 (from, from_byte, to, to_byte, 0);
5618           TEMP_SET_PT_BOTH (from, from_byte);
5619           insert_from_buffer (XBUFFER (new), 1, len, 0);
5620           Fkill_buffer (new);
5621           if (orig_point >= to)
5622             orig_point += len - orig_len;
5623           else if (orig_point > from)
5624             orig_point = from;
5625           orig_len = len;
5626           to = from + len;
5627           from_byte = CHAR_TO_BYTE (from);
5628           to_byte = CHAR_TO_BYTE (to);
5629           len_byte = to_byte - from_byte;
5630           TEMP_SET_PT_BOTH (from, from_byte);
5631         }
5632     }
5633
5634   if (replace)
5635     {
5636       if (! EQ (current_buffer->undo_list, Qt))
5637         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5638       else
5639         {
5640           nchars_del = to - from;
5641           nbytes_del = to_byte - from_byte;
5642         }
5643     }
5644
5645   if (coding->composing != COMPOSITION_DISABLED)
5646     {
5647       if (encodep)
5648         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5649       else
5650         coding_allocate_composition_data (coding, from);
5651     }
5652
5653   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5654      if we must run CCL program or there are compositions to
5655      encode.  */
5656   if (coding->type != coding_type_ccl
5657       && (! coding->cmp_data || coding->cmp_data->used == 0))
5658     {
5659       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5660
5661       if (from < GPT && GPT < to)
5662         move_gap_both (from, from_byte);
5663       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5664       if (from_byte == to_byte
5665           && (encodep || NILP (coding->post_read_conversion))
5666           && ! CODING_REQUIRE_FLUSHING (coding))
5667         {
5668           coding->produced = len_byte;
5669           coding->produced_char = len;
5670           if (!replace)
5671             /* We must record and adjust for this new text now.  */
5672             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5673           coding_free_composition_data (coding);
5674           return 0;
5675         }
5676
5677       head_skip = from_byte - from_byte_orig;
5678       tail_skip = to_byte_orig - to_byte;
5679       total_skip = head_skip + tail_skip;
5680       from += head_skip;
5681       to -= tail_skip;
5682       len -= total_skip; len_byte -= total_skip;
5683     }
5684
5685   /* For conversion, we must put the gap before the text in addition to
5686      making the gap larger for efficient decoding.  The required gap
5687      size starts from 2000 which is the magic number used in make_gap.
5688      But, after one batch of conversion, it will be incremented if we
5689      find that it is not enough.  */
5690   require = 2000;
5691
5692   if (GAP_SIZE  < require)
5693     make_gap (require - GAP_SIZE);
5694   move_gap_both (from, from_byte);
5695
5696   inserted = inserted_byte = 0;
5697
5698   GAP_SIZE += len_byte;
5699   ZV -= len;
5700   Z -= len;
5701   ZV_BYTE -= len_byte;
5702   Z_BYTE -= len_byte;
5703
5704   if (GPT - BEG < BEG_UNCHANGED)
5705     BEG_UNCHANGED = GPT - BEG;
5706   if (Z - GPT < END_UNCHANGED)
5707     END_UNCHANGED = Z - GPT;
5708
5709   if (!encodep && coding->src_multibyte)
5710     {
5711       /* Decoding routines expects that the source text is unibyte.
5712          We must convert 8-bit characters of multibyte form to
5713          unibyte.  */
5714       int len_byte_orig = len_byte;
5715       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5716       if (len_byte < len_byte_orig)
5717         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5718                     len_byte);
5719       coding->src_multibyte = 0;
5720     }
5721
5722   for (;;)
5723     {
5724       int result;
5725
5726       /* The buffer memory is now:
5727          +--------+converted-text+---------+-------original-text-------+---+
5728          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5729                   |<---------------------- GAP ----------------------->|  */
5730       src = GAP_END_ADDR - len_byte;
5731       dst = GPT_ADDR + inserted_byte;
5732
5733       if (encodep)
5734         result = encode_coding (coding, src, dst, len_byte, 0);
5735       else
5736         {
5737           if (coding->composing != COMPOSITION_DISABLED)
5738             coding->cmp_data->char_offset = from + inserted;
5739           result = decode_coding (coding, src, dst, len_byte, 0);
5740         }
5741
5742       /* The buffer memory is now:
5743          +--------+-------converted-text----+--+------original-text----+---+
5744          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5745                   |<---------------------- GAP ----------------------->|  */
5746
5747       inserted += coding->produced_char;
5748       inserted_byte += coding->produced;
5749       len_byte -= coding->consumed;
5750
5751       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5752         {
5753           coding_allocate_composition_data (coding, from + inserted);
5754           continue;
5755         }
5756
5757       src += coding->consumed;
5758       dst += coding->produced;
5759
5760       if (result == CODING_FINISH_NORMAL)
5761         {
5762           src += len_byte;
5763           break;
5764         }
5765       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5766         {
5767           unsigned char *pend = dst, *p = pend - inserted_byte;
5768           Lisp_Object eol_type;
5769
5770           /* Encode LFs back to the original eol format (CR or CRLF).  */
5771           if (coding->eol_type == CODING_EOL_CR)
5772             {
5773               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5774             }
5775           else
5776             {
5777               int count = 0;
5778
5779               while (p < pend) if (*p++ == '\n') count++;
5780               if (src - dst < count)
5781                 {
5782                   /* We don't have sufficient room for encoding LFs
5783                      back to CRLF.  We must record converted and
5784                      not-yet-converted text back to the buffer
5785                      content, enlarge the gap, then record them out of
5786                      the buffer contents again.  */
5787                   int add = len_byte + inserted_byte;
5788
5789                   GAP_SIZE -= add;
5790                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5791                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5792                   make_gap (count - GAP_SIZE);
5793                   GAP_SIZE += add;
5794                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5795                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5796                   /* Don't forget to update SRC, DST, and PEND.  */
5797                   src = GAP_END_ADDR - len_byte;
5798                   dst = GPT_ADDR + inserted_byte;
5799                   pend = dst;
5800                 }
5801               inserted += count;
5802               inserted_byte += count;
5803               coding->produced += count;
5804               p = dst = pend + count;
5805               while (count)
5806                 {
5807                   *--p = *--pend;
5808                   if (*p == '\n') count--, *--p = '\r';
5809                 }
5810             }
5811
5812           /* Suppress eol-format conversion in the further conversion.  */
5813           coding->eol_type = CODING_EOL_LF;
5814
5815           /* Set the coding system symbol to that for Unix-like EOL.  */
5816           eol_type = Fget (saved_coding_symbol, Qeol_type);
5817           if (VECTORP (eol_type)
5818               && XVECTOR (eol_type)->size == 3
5819               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5820             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5821           else
5822             coding->symbol = saved_coding_symbol;
5823
5824           continue;
5825         }
5826       if (len_byte <= 0)
5827         {
5828           if (coding->type != coding_type_ccl
5829               || coding->mode & CODING_MODE_LAST_BLOCK)
5830             break;
5831           coding->mode |= CODING_MODE_LAST_BLOCK;
5832           continue;
5833         }
5834       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5835         {
5836           /* The source text ends in invalid codes.  Let's just
5837              make them valid buffer contents, and finish conversion.  */
5838           if (multibyte_p)
5839             {
5840               unsigned char *start = dst;
5841
5842               inserted += len_byte;
5843               while (len_byte--)
5844                 {
5845                   int c = *src++;
5846                   dst += CHAR_STRING (c, dst);
5847                 }
5848
5849               inserted_byte += dst - start;
5850             }
5851           else
5852             {
5853               inserted += len_byte;
5854               inserted_byte += len_byte;
5855               while (len_byte--)
5856                 *dst++ = *src++;
5857             }
5858           break;
5859         }
5860       if (result == CODING_FINISH_INTERRUPT)
5861         {
5862           /* The conversion procedure was interrupted by a user.  */
5863           break;
5864         }
5865       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5866       if (coding->consumed < 1)
5867         {
5868           /* It's quite strange to require more memory without
5869              consuming any bytes.  Perhaps CCL program bug.  */
5870           break;
5871         }
5872       if (first)
5873         {
5874           /* We have just done the first batch of conversion which was
5875              stopped because of insufficient gap.  Let's reconsider the
5876              required gap size (i.e. SRC - DST) now.
5877
5878              We have converted ORIG bytes (== coding->consumed) into
5879              NEW bytes (coding->produced).  To convert the remaining
5880              LEN_BYTE bytes, we may need REQUIRE bytes of gap, where:
5881                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5882                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5883              The division must be done in floating point.  */
5884           float ratio;
5885
5886           if (coding->produced <= coding->consumed)
5887             {
5888               /* Caused by a CCL-based coding system with eol-type CRLF.  */
5889               require = 0;
5890             }
5891           else
5892             {
5893               ratio = coding->produced - coding->consumed;
5894               ratio /= coding->consumed;
5895               require = len_byte * ratio;
5896             }
5897           first = 0;
5898         }
5899       if ((src - dst) < (require + 2000))
5900         {
5901           /* See the comment above the previous call of make_gap.  */
5902           int add = len_byte + inserted_byte;
5903
5904           GAP_SIZE -= add;
5905           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5906           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5907           make_gap (require + 2000);
5908           GAP_SIZE += add;
5909           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5910           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5911         }
5912     }
5913   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5914
5915   if (encodep && coding->dst_multibyte)
5916     {
5917       /* The output is unibyte.  We must convert 8-bit characters to
5918          multibyte form.  */
5919       if (inserted_byte * 2 > GAP_SIZE)
5920         {
5921           GAP_SIZE -= inserted_byte;
5922           ZV += inserted_byte; Z += inserted_byte;
5923           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5924           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5925           make_gap (inserted_byte - GAP_SIZE);
5926           GAP_SIZE += inserted_byte;
5927           ZV -= inserted_byte; Z -= inserted_byte;
5928           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5929           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5930         }
5931       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5932     }
5933
5934   /* If we shrank the conversion area, adjust it now.  */
5935   if (total_skip > 0)
5936     {
5937       if (tail_skip > 0)
5938         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5939       inserted += total_skip; inserted_byte += total_skip;
5940       GAP_SIZE += total_skip;
5941       GPT -= head_skip; GPT_BYTE -= head_skip;
5942       ZV -= total_skip; ZV_BYTE -= total_skip;
5943       Z -= total_skip; Z_BYTE -= total_skip;
5944       from -= head_skip; from_byte -= head_skip;
5945       to += tail_skip; to_byte += tail_skip;
5946     }
5947
5948   prev_Z = Z;
5949   if (! EQ (current_buffer->undo_list, Qt))
5950     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5951   else
5952     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5953                                  inserted, inserted_byte);
5954   inserted = Z - prev_Z;
5955
5956   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5957     coding_restore_composition (coding, Fcurrent_buffer ());
5958   coding_free_composition_data (coding);
5959
5960   if (! inhibit_pre_post_conversion
5961       && ! encodep && ! NILP (coding->post_read_conversion))
5962     {
5963       Lisp_Object val;
5964       Lisp_Object saved_coding_system;
5965
5966       if (from != PT)
5967         TEMP_SET_PT_BOTH (from, from_byte);
5968       prev_Z = Z;
5969       record_unwind_protect (code_convert_region_unwind,
5970                              Vlast_coding_system_used);
5971       saved_coding_system = Vlast_coding_system_used;
5972       Vlast_coding_system_used = coding->symbol;
5973       /* We should not call any more pre-write/post-read-conversion
5974          functions while this post-read-conversion is running.  */
5975       inhibit_pre_post_conversion = 1;
5976       val = call1 (coding->post_read_conversion, make_number (inserted));
5977       inhibit_pre_post_conversion = 0;
5978       coding->symbol = Vlast_coding_system_used;
5979       Vlast_coding_system_used = saved_coding_system;
5980       /* Discard the unwind protect.  */
5981       specpdl_ptr--;
5982       CHECK_NUMBER (val);
5983       inserted += Z - prev_Z;
5984     }
5985
5986   if (orig_point >= from)
5987     {
5988       if (orig_point >= from + orig_len)
5989         orig_point += inserted - orig_len;
5990       else
5991         orig_point = from;
5992       TEMP_SET_PT (orig_point);
5993     }
5994
5995   if (replace)
5996     {
5997       signal_after_change (from, to - from, inserted);
5998       update_compositions (from, from + inserted, CHECK_BORDER);
5999     }
6000
6001   {
6002     coding->consumed = to_byte - from_byte;
6003     coding->consumed_char = to - from;
6004     coding->produced = inserted_byte;
6005     coding->produced_char = inserted;
6006   }
6007
6008   return 0;
6009 }
6010
6011 /* Name (or base name) of work buffer for code conversion.
        set_conversion_work_buffer passes this value to
        Fget_buffer_create to find or create that buffer.  */
6012 static Lisp_Object Vcode_conversion_workbuf_name;
6013
6014 /* Set the current buffer to the working buffer prepared for
6015    code-conversion.  MULTIBYTE specifies the multibyteness of the
6016    buffer.
        The buffer named by Vcode_conversion_workbuf_name is found or
        created, then reset: its overlays are deleted, its directory is
        copied from the buffer that is current on entry, its read-only
        flag and file name are cleared, any narrowing is removed and
        all of its text is deleted.
        Return the work buffer.  It is left as the current buffer; this
        function does not switch back to the previous one.  */
6017
6018 static struct buffer *
6019 set_conversion_work_buffer (multibyte)
6020      int multibyte;
6021 {
6022   Lisp_Object buffer;
6023   struct buffer *buf;
6024
6025   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6026   buf = XBUFFER (buffer);
6027   delete_all_overlays (buf);
       /* current_buffer is still the caller's buffer at this point.  */
6028   buf->directory = current_buffer->directory;
6029   buf->read_only = Qnil;
6030   buf->filename = Qnil;
       /* Presumably t here turns off undo recording for the work
          buffer -- confirm against the undo code.  */
6031   buf->undo_list = Qt;
6032   eassert (buf->overlays_before == NULL);
6033   eassert (buf->overlays_after == NULL);
6034   set_buffer_internal (buf);
       /* Remove any narrowing so that the deletion below covers the
          whole buffer.  */
6035   if (BEG != BEGV || Z != ZV)
6036     Fwiden ();
6037   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
       /* The multibyteness is set only after the buffer has been
          emptied.  */
6038   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6039   return buf;
6040 }
6041
     /* Run a conversion function of CODING on the text of STR and return
        the resulting text as a string.  If ENCODEP is nonzero, call
        CODING->pre_write_conversion with the bounds of the text;
        otherwise call CODING->post_read_conversion with the number of
        characters, with point at the beginning of the text.  In the
        latter case Vlast_coding_system_used is set to CODING->symbol
        before the call and CODING->symbol takes its value afterwards.

        The function runs in the conversion work buffer, which is given
        the multibyteness of STR.  The buffer that was current on entry
        is made current again when the unwind-protect forms run.  */
6042 Lisp_Object
6043 run_pre_post_conversion_on_str (str, coding, encodep)
6044      Lisp_Object str;
6045      struct coding_system *coding;
6046      int encodep;
6047 {
6048   int count = SPECPDL_INDEX ();
6049   struct gcpro gcpro1, gcpro2;
6050   int multibyte = STRING_MULTIBYTE (str);
       /* NOTE(review): BUF is declared but never assigned or read in
          this function.  */
6051   struct buffer *buf;
6052   Lisp_Object old_deactivate_mark;
6053
       /* Restore the current buffer on exit, normal or not.  */
6054   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
       /* code_convert_region_unwind is defined elsewhere; presumably it
          restores Vlast_coding_system_used from its argument and resets
          inhibit_pre_post_conversion -- confirm.  */
6055   record_unwind_protect (code_convert_region_unwind,
6056                          Vlast_coding_system_used);
6057   /* It is not crucial to specbind this.  */
6058   old_deactivate_mark = Vdeactivate_mark;
6059   GCPRO2 (str, old_deactivate_mark);
6060
6061   /* We must insert the contents of STR as is without
6062      unibyte<->multibyte conversion.  For that, we adjust the
6063      multibyteness of the working buffer to that of STR.  */
6064   set_conversion_work_buffer (multibyte);
6065
6066   insert_from_string (str, 0, 0,
6067                       SCHARS (str), SBYTES (str), 0);
6068   UNGCPRO;
       /* Keep the conversion function from triggering further
          pre-write/post-read conversions while it runs.  */
6069   inhibit_pre_post_conversion = 1;
6070   if (encodep)
6071     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6072   else
6073     {
6074       Vlast_coding_system_used = coding->symbol;
6075       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6076       call1 (coding->post_read_conversion, make_number (Z - BEG));
           /* Pick up whatever the function left in
              Vlast_coding_system_used.  */
6077       coding->symbol = Vlast_coding_system_used;
6078     }
6079   inhibit_pre_post_conversion = 0;
6080   Vdeactivate_mark = old_deactivate_mark;
       /* The result is whatever text the work buffer holds now.  */
6081   str = make_buffer_string (BEG, Z, 1);
6082   return unbind_to (count, str);
6083 }
6084
6085
6086 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6087    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6088    is intended that this function is called from encode_terminal_code,
6089    the pre-write-conversion function is run by safe_call and thus
6090    "Error during redisplay: ..." is logged when an error occurs.
6091
6092    Store the resulting text in *STR and set CODING->produced_char and
6093    CODING->produced to the number of characters and bytes
6094    respectively.  If the size of *STR is too small, enlarge it by
6095    xrealloc and update *STR and *SIZE.

        The conversion runs in the conversion work buffer, which is
        given the multibyteness CODING->src_multibyte.  On return
        CODING->src_multibyte reflects the multibyteness the work
        buffer has after the conversion, Vdeactivate_mark and
        Vlast_coding_system_used have their values from entry, and the
        buffer that was current on entry is current again.

        The function name is spelled "conversin"; it is kept as is
        because callers refer to it by this name.  */
6096
6097 void
6098 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6099      unsigned char **str;
6100      int *size, nchars, nbytes;
6101      struct coding_system *coding;
6102 {
6103   struct gcpro gcpro1, gcpro2;
6104   struct buffer *cur = current_buffer;
6105   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6106   Lisp_Object args[3];
6107
6108   /* It is not crucial to specbind this.  */
6109   old_deactivate_mark = Vdeactivate_mark;
6110   old_last_coding_system_used = Vlast_coding_system_used;
6111   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6112
6113   /* We must insert the contents of STR as is without
6114      unibyte<->multibyte conversion.  For that, we adjust the
6115      multibyteness of the working buffer to that of STR.  */
6116   set_conversion_work_buffer (coding->src_multibyte);
6117   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6118   UNGCPRO;
       /* Keep the conversion function from triggering further
          pre-write/post-read conversions while it runs.  */
6119   inhibit_pre_post_conversion = 1;
       /* Call the function with the bounds of the inserted text.  */
6120   args[0] = coding->pre_write_conversion;
6121   args[1] = make_number (BEG);
6122   args[2] = make_number (Z);
6123   safe_call (3, args);
6124   inhibit_pre_post_conversion = 0;
6125   Vdeactivate_mark = old_deactivate_mark;
6126   Vlast_coding_system_used = old_last_coding_system_used;
       /* The work buffer is still current, so these measure the
          converted text.  */
6127   coding->produced_char = Z - BEG;
6128   coding->produced = Z_BYTE - BEG_BYTE;
       /* *STR is only ever grown, never shrunk.  */
6129   if (coding->produced > *size)
6130     {
6131       *size = coding->produced;
6132       *str = xrealloc (*str, *size);
6133     }
       /* If the gap lies strictly inside the text, move it to the
          beginning; presumably this makes the text one contiguous run
          starting at BEG_ADDR for the copy below.  */
6134   if (BEG < GPT && GPT < Z)
6135     move_gap (BEG);
6136   bcopy (BEG_ADDR, *str, coding->produced);
6137   coding->src_multibyte
6138     = ! NILP (current_buffer->enable_multibyte_characters);
6139   set_buffer_internal (cur);
6140 }
6141
6142
     /* Decode the text of STR according to CODING and return the result
        as a string.  If CODING requires detection, the text coding and
        the end-of-line format are first detected from the bytes of STR.

        If nothing has to be decoded and CODING has no usable
        post-read-conversion function, no decoder is run: the value is
        STR, possibly passed through Fstring_as_unibyte and
        Fstring_as_multibyte, and it is copied first unless NOCOPY is
        nonzero or one of those calls was made.  Otherwise the value is
        a newly made string, passed through the post-read-conversion
        function when there is one.

        CODING->consumed, CODING->consumed_char, CODING->produced and
        CODING->produced_char are set to the totals of the decoding
        step for the whole of STR.  */
6143 Lisp_Object
6144 decode_coding_string (str, coding, nocopy)
6145      Lisp_Object str;
6146      struct coding_system *coding;
6147      int nocopy;
6148 {
6149   int len;
6150   struct conversion_buffer buf;
6151   int from, to_byte;
6152   Lisp_Object saved_coding_symbol;
6153   int result;
6154   int require_decoding;
6155   int shrinked_bytes = 0;
6156   Lisp_Object newstr;
6157   int consumed, consumed_char, produced, produced_char;
6158
       /* FROM and TO_BYTE are byte offsets into STR delimiting the part
          that is actually handed to the decoder.  */
6159   from = 0;
6160   to_byte = SBYTES (str);
6161
6162   saved_coding_symbol = coding->symbol;
6163   coding->src_multibyte = STRING_MULTIBYTE (str);
6164   coding->dst_multibyte = 1;
6165   if (CODING_REQUIRE_DETECTION (coding))
6166     {
6167       /* See the comments in code_convert_region.  */
6168       if (coding->type == coding_type_undecided)
6169         {
6170           detect_coding (coding, SDATA (str), to_byte);
6171           if (coding->type == coding_type_undecided)
6172             {
6173               coding->type = coding_type_emacs_mule;
6174               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6175               /* As emacs-mule decoder will handle composition, we
6176                  need this setting to allocate coding->cmp_data
6177                  later.  */
6178               coding->composing = COMPOSITION_NO;
6179             }
6180         }
6181       if (coding->eol_type == CODING_EOL_UNDECIDED
6182           && coding->type != coding_type_ccl)
6183         {
               /* Remember the symbol as it is before detect_eol runs; it
                  is used below if the eol format turns out to be
                  inconsistent.  */
6184           saved_coding_symbol = coding->symbol;
6185           detect_eol (coding, SDATA (str), to_byte);
6186           if (coding->eol_type == CODING_EOL_UNDECIDED)
6187             coding->eol_type = CODING_EOL_LF;
6188           /* We had better recover the original eol format if we
6189              encounter an inconsistent eol format while decoding.  */
6190           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6191         }
6192     }
6193
6194   if (coding->type == coding_type_no_conversion
6195       || coding->type == coding_type_raw_text)
6196     coding->dst_multibyte = 0;
6197
6198   require_decoding = CODING_REQUIRE_DECODING (coding);
6199
6200   if (STRING_MULTIBYTE (str))
6201     {
6202       /* Decoding routines expect the source text to be unibyte.  */
6203       str = Fstring_as_unibyte (str);
6204       to_byte = SBYTES (str);
           /* STR is no longer the caller's string (presumably a fresh
              one), so it need not be copied again before returning.  */
6205       nocopy = 1;
6206       coding->src_multibyte = 0;
6207     }
6208
6209   /* Try to skip the leading and trailing ASCII characters.  */
6210   if (require_decoding && coding->type != coding_type_ccl)
6211     {
6212       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6213                                 0);
6214       if (from == to_byte)
6215         require_decoding = 0;
           /* Number of bytes skipped at both ends together.  */
6216       shrinked_bytes = from + (SBYTES (str) - to_byte);
6217     }
6218
       /* Fast path: nothing to decode and no post-read-conversion
          function that is a symbol with a function definition.  */
6219   if (!require_decoding
6220       && !(SYMBOLP (coding->post_read_conversion)
6221            && !NILP (Ffboundp (coding->post_read_conversion))))
6222     {
6223       coding->consumed = SBYTES (str);
6224       coding->consumed_char = SCHARS (str);
6225       if (coding->dst_multibyte)
6226         {
6227           str = Fstring_as_multibyte (str);
6228           nocopy = 1;
6229         }
6230       coding->produced = SBYTES (str);
6231       coding->produced_char = SCHARS (str);
6232       return (nocopy ? str : Fcopy_sequence (str));
6233     }
6234
6235   if (coding->composing != COMPOSITION_DISABLED)
6236     coding_allocate_composition_data (coding, from);
6237   len = decoding_buffer_size (coding, to_byte - from);
6238   allocate_conversion_buffer (buf, len);
6239
       /* Call decode_coding repeatedly, each time continuing where the
          previous call stopped, and accumulate the per-call counts that
          it leaves in CODING.  */
6240   consumed = consumed_char = produced = produced_char = 0;
6241   while (1)
6242     {
6243       result = decode_coding (coding, SDATA (str) + from + consumed,
6244                               buf.data + produced, to_byte - from - consumed,
6245                               buf.size - produced);
6246       consumed += coding->consumed;
6247       consumed_char += coding->consumed_char;
6248       produced += coding->produced;
6249       produced_char += coding->produced_char;
           /* Stop when decoding finished, or when the source ran short
              and this call made no progress at all.  */
6250       if (result == CODING_FINISH_NORMAL
6251           || (result == CODING_FINISH_INSUFFICIENT_SRC
6252               && coding->consumed == 0))
6253         break;
6254       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6255         coding_allocate_composition_data (coding, from + produced_char);
6256       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6257         extend_conversion_buffer (&buf);
6258       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6259         {
6260           Lisp_Object eol_type;
6261
6262           /* Recover the original EOL format.  */
6263           if (coding->eol_type == CODING_EOL_CR)
6264             {
                   /* Same length: turn each LF produced so far back
                      into CR in place.  */
6265               unsigned char *p;
6266               for (p = buf.data; p < buf.data + produced; p++)
6267                 if (*p == '\n') *p = '\r';
6268             }
6269           else if (coding->eol_type == CODING_EOL_CRLF)
6270             {
                   /* Each LF produced so far grows back into CR LF, so
                      count them first to know how much room is needed.  */
6271               int num_eol = 0;
6272               unsigned char *p0, *p1;
6273               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6274                 if (*p0 == '\n') num_eol++;
                   /* NOTE(review): the buffer is extended only once
                      here; this assumes a single extension always makes
                      room for NUM_EOL more bytes -- confirm against
                      extend_conversion_buffer.  */
6275               if (produced + num_eol >= buf.size)
6276                 extend_conversion_buffer (&buf);
                   /* Expand in place, copying from the end towards the
                      beginning so that bytes not yet moved are never
                      overwritten.  */
6277               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6278                 {
6279                   *--p1 = *--p0;
6280                   if (*p0 == '\n') *--p1 = '\r';
6281                 }
6282               produced += num_eol;
6283               produced_char += num_eol;
6284             }
6285           /* Suppress eol-format conversion in the further conversion.  */
6286           coding->eol_type = CODING_EOL_LF;
6287
6288           /* Set the coding system symbol to that for Unix-like EOL.  */
6289           eol_type = Fget (saved_coding_symbol, Qeol_type);
6290           if (VECTORP (eol_type)
6291               && XVECTOR (eol_type)->size == 3
6292               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6293             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6294           else
6295             coding->symbol = saved_coding_symbol;
6296
6297
6298         }
6299     }
6300
6301   coding->consumed = consumed;
6302   coding->consumed_char = consumed_char;
6303   coding->produced = produced;
6304   coding->produced_char = produced_char;
6305
       /* Build the result: the skipped leading bytes, the decoded text,
          then the skipped trailing bytes.  */
6306   if (coding->dst_multibyte)
6307     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6308                                            produced + shrinked_bytes);
6309   else
6310     newstr = make_uninit_string (produced + shrinked_bytes);
6311   if (from > 0)
6312     STRING_COPYIN (newstr, 0, SDATA (str), from);
6313   STRING_COPYIN (newstr, from, buf.data, produced);
6314   if (shrinked_bytes > from)
6315     STRING_COPYIN (newstr, from + produced,
6316                    SDATA (str) + to_byte,
6317                    shrinked_bytes - from);
6318   free_conversion_buffer (&buf);
6319
       /* The skipped bytes are ASCII, so each of them counts as one
          byte and one character on both the source and the result
          side.  */
6320   coding->consumed += shrinked_bytes;
6321   coding->consumed_char += shrinked_bytes;
6322   coding->produced += shrinked_bytes;
6323   coding->produced_char += shrinked_bytes;
6324
6325   if (coding->cmp_data && coding->cmp_data->used)
6326     coding_restore_composition (coding, newstr);
6327   coding_free_composition_data (coding);
6328
       /* Same test as for the fast path above: run the function only if
          it is a symbol with a function definition.  */
6329   if (SYMBOLP (coding->post_read_conversion)
6330       && !NILP (Ffboundp (coding->post_read_conversion)))
6331     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6332
6333   return newstr;
6334 }
6335
     /* Encode the text of STR according to CODING and return the result
        as a string.  If CODING has a pre-write-conversion function that
        is a symbol with a function definition, STR is first passed
        through it.

        If CODING requires no encoding, the value is STR, converted by
        Fstring_as_unibyte when it is multibyte; it is copied first
        unless NOCOPY is nonzero or that conversion was made.
        Otherwise the value is a string newly made from the encoded
        bytes.

        CODING->consumed, CODING->consumed_char, CODING->produced and
        CODING->produced_char are set as noted in the body.  */
6336 Lisp_Object
6337 encode_coding_string (str, coding, nocopy)
6338      Lisp_Object str;
6339      struct coding_system *coding;
6340      int nocopy;
6341 {
6342   int len;
6343   struct conversion_buffer buf;
6344   int from, to, to_byte;
6345   int result;
6346   int shrinked_bytes = 0;
6347   Lisp_Object newstr;
6348   int consumed, consumed_char, produced, produced_char;
6349
6350   if (SYMBOLP (coding->pre_write_conversion)
6351       && !NILP (Ffboundp (coding->pre_write_conversion)))
6352     str = run_pre_post_conversion_on_str (str, coding, 1);
6353
       /* FROM and TO_BYTE are byte offsets into STR delimiting the part
          handed to the encoder; TO is the length of STR in characters
          and is used only for saving compositions.  */
6354   from = 0;
6355   to = SCHARS (str);
6356   to_byte = SBYTES (str);
6357
6358   /* Encoding routines determine the multibyteness of the source text
6359      by coding->src_multibyte.  */
6360   coding->src_multibyte = STRING_MULTIBYTE (str);
6361   coding->dst_multibyte = 0;
6362   if (! CODING_REQUIRE_ENCODING (coding))
6363     {
6364       coding->consumed = SBYTES (str);
6365       coding->consumed_char = SCHARS (str);
6366       if (STRING_MULTIBYTE (str))
6367         {
6368           str = Fstring_as_unibyte (str);
6369           nocopy = 1;
6370         }
6371       coding->produced = SBYTES (str);
6372       coding->produced_char = SCHARS (str);
6373       return (nocopy ? str : Fcopy_sequence (str));
6374     }
6375
6376   if (coding->composing != COMPOSITION_DISABLED)
6377     coding_save_composition (coding, from, to, str);
6378
6379   /* Try to skip the leading and trailing ASCII characters.  We can't
6380      skip them if we must run CCL program or there are compositions to
6381      encode.  */
6382   if (coding->type != coding_type_ccl
6383       && (! coding->cmp_data || coding->cmp_data->used == 0))
6384     {
6385       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6386                                 1);
6387       if (from == to_byte)
6388         {
               /* NOTE(review): on this path STR is returned as it is
                  and CODING->consumed, consumed_char, produced and
                  produced_char are not assigned by this function --
                  confirm that callers do not read them here.  */
6389           coding_free_composition_data (coding);
6390           return (nocopy ? str : Fcopy_sequence (str));
6391         }
           /* Number of bytes skipped at both ends together.  */
6392       shrinked_bytes = from + (SBYTES (str) - to_byte);
6393     }
6394
6395   len = encoding_buffer_size (coding, to_byte - from);
6396   allocate_conversion_buffer (buf, len);
6397
       /* Call encode_coding repeatedly, each time continuing where the
          previous call stopped, and accumulate the per-call counts that
          it leaves in CODING.  */
6398   consumed = consumed_char = produced = produced_char = 0;
6399   while (1)
6400     {
6401       result = encode_coding (coding, SDATA (str) + from + consumed,
6402                               buf.data + produced, to_byte - from - consumed,
6403                               buf.size - produced);
6404       consumed += coding->consumed;
6405       consumed_char += coding->consumed_char;
6406       produced += coding->produced;
6407       produced_char += coding->produced_char;
           /* Stop when encoding finished or was interrupted, or when
              the source ran short and this call made no progress.  */
6408       if (result == CODING_FINISH_NORMAL
6409           || result == CODING_FINISH_INTERRUPT
6410           || (result == CODING_FINISH_INSUFFICIENT_SRC
6411               && coding->consumed == 0))
6412         break;
6413       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6414       extend_conversion_buffer (&buf);
6415     }
6416
       /* These totals cover only the part between FROM and TO_BYTE;
          unlike in decode_coding_string, the skipped bytes are not
          added to them afterwards.  */
6417   coding->consumed = consumed;
6418   coding->consumed_char = consumed_char;
6419   coding->produced = produced;
6420   coding->produced_char = produced_char;
6421
       /* Build the result: the skipped leading bytes, the encoded
          bytes, then the skipped trailing bytes.  */
6422   newstr = make_uninit_string (produced + shrinked_bytes);
6423   if (from > 0)
6424     STRING_COPYIN (newstr, 0, SDATA (str), from);
6425   STRING_COPYIN (newstr, from, buf.data, produced);
6426   if (shrinked_bytes > from)
6427     STRING_COPYIN (newstr, from + produced,
6428                    SDATA (str) + to_byte,
6429                    shrinked_bytes - from);
6430
6431   free_conversion_buffer (&buf);
6432   coding_free_composition_data (coding);
6433
6434   return newstr;
6435 }
6436
6437 \f
6438 #ifdef emacs
6439 /*** 8. Emacs Lisp library functions ***/
6440
6441 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6442        doc: /* Return t if OBJECT is nil or a coding-system.
6443 See the documentation of `make-coding-system' for information
6444 about coding-system objects.  */)
6445      (obj)
6446      Lisp_Object obj;
6447 {
6448   if (NILP (obj))
6449     return Qt;
6450   if (!SYMBOLP (obj))
6451     return Qnil;
       /* A symbol that still carries a define form is accepted without
          looking at its coding-spec vector.  */
6452   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6453     return Qt;
6454   /* Get coding-spec vector for OBJ.  */
6455   obj = Fget (obj, Qcoding_system);
       /* Only a vector of exactly five elements is accepted.  */
6456   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6457           ? Qt : Qnil);
6458 }
6459
6460 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6461        Sread_non_nil_coding_system, 1, 1, 0,
6462        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6463      (prompt)
6464      Lisp_Object prompt;
6465 {
6466   Lisp_Object val;
       /* Ask again for as long as the user enters an empty string, so
          that the value is never the empty name.  Completion is done
          against Vcoding_system_alist, and the input history is kept in
          Qcoding_system_history.  */
6467   do
6468     {
6469       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6470                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6471     }
6472   while (SCHARS (val) == 0);
       /* Return the symbol whose name is the string that was read.  */
6473   return (Fintern (val, Qnil));
6474 }
6475
6476 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6477        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6478 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6479      (prompt, default_coding_system)
6480      Lisp_Object prompt, default_coding_system;
6481 {
6482   Lisp_Object val;
       /* The default is handed to Fcompleting_read as a string, so a
          symbol is replaced by its name.
          NOTE(review): nil is a symbol too, so an omitted default also
          goes through SYMBOL_NAME here -- confirm that this is
          intended.  */
6483   if (SYMBOLP (default_coding_system))
6484     default_coding_system = SYMBOL_NAME (default_coding_system);
6485   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6486                           Qt, Qnil, Qcoding_system_history,
6487                           default_coding_system, Qnil);
       /* An empty result yields nil; anything else is interned.  */
6488   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6489 }
6490
6491 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6492        1, 1, 0,
6493        doc: /* Check validity of CODING-SYSTEM.
6494 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6495 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6496 The value of this property should be a vector of length 5.  */)
6497      (coding_system)
6498      Lisp_Object coding_system;
6499 {
6500   Lisp_Object define_form;
6501
       /* If CODING-SYSTEM carries a define form, evaluate that form now,
          before the validity test below.  */
6502   define_form = Fget (coding_system, Qcoding_system_define_form);
6503   if (! NILP (define_form))
6504     {
           /* Clear the property before evaluating the form, so that the
              form is not picked up again by a nested or later call.  */
6505       Fput (coding_system, Qcoding_system_define_form, Qnil);
6506       safe_eval (define_form);
6507     }
6508   if (!NILP (Fcoding_system_p (coding_system)))
6509     return coding_system;
       /* Keep signaling in case Fsignal returns.  */
6510   while (1)
6511     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6512 }
6513 \f
6514 Lisp_Object
6515 detect_coding_system (src, src_bytes, highest, multibytep)
6516      const unsigned char *src;
6517      int src_bytes, highest;
6518      int multibytep;
6519 {
       /* Detect how the SRC_BYTES bytes at SRC are encoded.  MULTIBYTEP
          is passed on to detect_coding_mask.

          If HIGHEST is zero, return a list of the possible coding
          systems in the order of Vcoding_category_list.  If HIGHEST is
          nonzero, return only the first such coding system, or nil if
          no category in Vcoding_category_list matches what was
          detected.

          When detect_coding_mask reports nothing, the value is
          `undecided' (or a list of it).  Whenever a definite
          end-of-line format is detected, each coding system is replaced
          by the corresponding element of its `eol-type' vector, if it
          has one.  */
6520   int coding_mask, eol_type;
6521   Lisp_Object val, tmp;
6522   int dummy;
6523
6524   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6525   eol_type  = detect_eol_type (src, src_bytes, &dummy);
       /* An inconsistent end-of-line format is treated like an
          undetected one.  */
6526   if (eol_type == CODING_EOL_INCONSISTENT)
6527     eol_type = CODING_EOL_UNDECIDED;
6528
6529   if (!coding_mask)
6530     {
6531       val = Qundecided;
6532       if (eol_type != CODING_EOL_UNDECIDED)
6533         {
6534           Lisp_Object val2;
6535           val2 = Fget (Qundecided, Qeol_type);
6536           if (VECTORP (val2))
6537             val = XVECTOR (val2)->contents[eol_type];
6538         }
6539       return (highest ? val : Fcons (val, Qnil));
6540     }
6541
6542   /* At first, gather possible coding systems in VAL.  */
6543   val = Qnil;
6544   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6545     {
6546       Lisp_Object category_val, category_index;
6547
6548       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6549       category_val = Fsymbol_value (XCAR (tmp));
           /* Keep the category's coding system if the category has one
              and its bit is set in CODING_MASK.  */
6550       if (!NILP (category_val)
6551           && NATNUMP (category_index)
6552           && (coding_mask & (1 << XFASTINT (category_index))))
6553         {
6554           val = Fcons (category_val, val);
               /* Only the first match is wanted.  */
6555           if (highest)
6556             break;
6557         }
6558     }
       /* The list was built by consing at the front; put it back into
          the order of Vcoding_category_list.  */
6559   if (!highest)
6560     val = Fnreverse (val);
6561
6562   /* Then, replace the elements with subsidiary coding systems.  */
6563   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6564     {
6565       if (eol_type != CODING_EOL_UNDECIDED
6566           && eol_type != CODING_EOL_INCONSISTENT)
6567         {
6568           Lisp_Object eol;
6569           eol = Fget (XCAR (tmp), Qeol_type);
6570           if (VECTORP (eol))
6571             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6572         }
6573     }
       /* VAL is still nil if no category matched CODING_MASK; do not
          take the car of a non-cons in that case.  */
6574   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
6575 }
6576
6577 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6578        2, 3, 0,
6579        doc: /* Detect how the byte sequence in the region is encoded.
6580 Return a list of possible coding systems used on decoding a byte
6581 sequence containing the bytes in the region between START and END when
6582 the coding system `undecided' is specified.  The list is ordered by
6583 priority decided in the current language environment.
6584
6585 If only ASCII characters are found, it returns a list of single element
6586 `undecided' or its subsidiary coding system according to a detected
6587 end-of-line format.
6588
6589 If optional argument HIGHEST is non-nil, return the coding system of
6590 highest priority.  */)
6591      (start, end, highest)
6592      Lisp_Object start, end, highest;
6593 {
6594   int from, to;
6595   int from_byte, to_byte;
6596   int include_anchor_byte = 0;
6597
6598   CHECK_NUMBER_COERCE_MARKER (start);
6599   CHECK_NUMBER_COERCE_MARKER (end);
6600
6601   validate_region (&start, &end);
6602   from = XINT (start), to = XINT (end);
6603   from_byte = CHAR_TO_BYTE (from);
6604   to_byte = CHAR_TO_BYTE (to);
6605
       /* If the region starts before the gap and reaches or passes it,
          move the gap to the end of the region; presumably this makes
          the bytes of the region contiguous in memory.  */
6606   if (from < GPT && to >= GPT)
6607     move_gap_both (to, to_byte);
6608   /* If an anchor byte `\0' follows the region, we include it in
6609      the detecting source.  Then code detectors can handle the trailing
6610      byte sequence more accurately.
6611
6612      Fix me: This is not a perfect solution.  It is better that we
6613      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6614   */
6615   if (to == Z || (to == GPT && GAP_SIZE > 0))
6616     include_anchor_byte = 1;
       /* The multibyteness passed on is that of the current buffer.  */
6617   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6618                                to_byte - from_byte + include_anchor_byte,
6619                                !NILP (highest),
6620                                !NILP (current_buffer
6621                                       ->enable_multibyte_characters));
6622 }
6623
6624 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6625        1, 2, 0,
6626        doc: /* Detect how the byte sequence in STRING is encoded.
6627 Return a list of possible coding systems used on decoding a byte
6628 sequence containing the bytes in STRING when the coding system
6629 `undecided' is specified.  The list is ordered by priority decided in
6630 the current language environment.
6631
6632 If only ASCII characters are found, it returns a list of single element
6633 `undecided' or its subsidiary coding system according to a detected
6634 end-of-line format.
6635
6636 If optional argument HIGHEST is non-nil, return the coding system of
6637 highest priority.  */)
6638      (string, highest)
6639      Lisp_Object string, highest;
6640 {
6641   CHECK_STRING (string);
6642
       /* All the work is done by detect_coding_system on the bytes of
          STRING; the multibyteness passed on is that of STRING.  */
6643   return detect_coding_system (SDATA (string),
6644                                /* "+ 1" is to include the anchor byte
6645                                   `\0'.  With this, code detectors can
6646                                   handle the trailing bytes more
6647                                   accurately.  */
6648                                SBYTES (string) + 1,
6649                                !NILP (highest),
6650                                STRING_MULTIBYTE (string));
6651 }
6652
6653 /*  Subroutine for Ffind_coding_systems_region_internal.
6654
6655     Return a list of coding systems that safely encode the multibyte
6656     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6657     possible coding systems.  If it is nil, no candidate is left and
6658     the text is only scanned for non-ASCII single byte chars.
6659
6660     WORK_TABLE is a char-table of which element is set to t once the
6661     element is looked up.
6662
6663     If a non-ASCII single byte char is found, set
6664     *single_byte_char_found to 1.

         SAFE_CODINGS is modified destructively: an element that cannot
         encode some character of the text is removed from the list, and
         an element of the form ( CODING . SAFE-CHARS ) may be replaced
         by its expanded five-element form.  The value is what remains
         of the list.  */
6665
6666 static Lisp_Object
6667 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6668      unsigned char *p, *pend;
6669      Lisp_Object safe_codings, work_table;
6670      int *single_byte_char_found;
6671 {
6672   int c, len;
6673   Lisp_Object val, ch;
6674   Lisp_Object prev, tail;
6675
       /* With no candidate left there is nothing to narrow down.  */
6676   if (NILP (safe_codings))
6677     goto done_safe_codings;
6678   while (p < pend)
6679     {
6680       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6681       p += len;
6682       if (ASCII_BYTE_P (c))
6683         /* We can ignore ASCII characters here.  */
6684         continue;
6685       if (SINGLE_BYTE_CHAR_P (c))
6686         *single_byte_char_found = 1;
6687       /* Check the safe coding systems for C.  */
6688       ch = make_number (c);
6689       val = Faref (work_table, ch);
6690       if (EQ (val, Qt))
6691         /* This element was already checked.  Ignore it.  */
6692         continue;
6693       /* Remember that we checked this element.  */
6694       Faset (work_table, ch, Qt);
6695
           /* Walk the candidates.  PREV is the last cell that was kept;
              it is used to unlink a rejected cell that is not the head
              of the list.  */
6696       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6697         {
6698           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6699           int encodable;
6700
6701           elt = XCAR (tail);
6702           if (CONSP (XCDR (elt)))
6703             {
6704               /* This entry has this format now:
6705                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6706                           ACCEPT-LATIN-EXTRA ) */
6707               val = XCDR (elt);
6708               encodable = ! NILP (Faref (XCAR (val), ch));
6709               if (! encodable)
6710                 {
                       /* Fetch the extra data only when SAFE-CHARS
                          alone did not accept C.  */
6711                   val = XCDR (val);
6712                   translation_table = XCAR (val);
6713                   hash_table = XCAR (XCDR (val));
6714                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6715                 }
6716             }
6717           else
6718             {
6719               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6720               encodable = ! NILP (Faref (XCDR (elt), ch));
6721               if (! encodable)
6722                 {
6723                   /* Transform the format to:
6724                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6725                        ACCEPT-LATIN-EXTRA )  */
6726                   val = Fget (XCAR (elt), Qcoding_system);
6727                   translation_table
6728                     = Fplist_get (AREF (val, 3),
6729                                   Qtranslation_table_for_encode);
6730                   if (SYMBOLP (translation_table))
6731                     translation_table = Fget (translation_table,
6732                                               Qtranslation_table);
6733                   hash_table
6734                     = (CHAR_TABLE_P (translation_table)
6735                        ? XCHAR_TABLE (translation_table)->extras[1]
6736                        : Qnil);
                       /* Presumably 2 is the ISO 2022 type code and
                          element 16 of the flags vector is the
                          "accept latin extra code" flag -- confirm
                          against the coding-spec layout.  */
6737                   accept_latin_extra
6738                     = ((EQ (AREF (val, 0), make_number (2))
6739                         && VECTORP (AREF (val, 4)))
6740                        ? AREF (AREF (val, 4), 16)
6741                        : Qnil);
6742                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6743                                         translation_table, hash_table,
6744                                         accept_latin_extra));
6745                 }
6746             }
6747
               /* The three extra variables are set on every path that
                  leaves ENCODABLE zero, so they are read only when
                  initialized.  C is still accepted if the translation
                  table or the hash table has an entry for it, or if it
                  is a single byte char listed in
                  Vlatin_extra_code_table and the coding system accepts
                  such codes.  */
6748           if (! encodable
6749               && ((CHAR_TABLE_P (translation_table)
6750                    && ! NILP (Faref (translation_table, ch)))
6751                   || (HASH_TABLE_P (hash_table)
6752                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6753                   || (SINGLE_BYTE_CHAR_P (c)
6754                       && ! NILP (accept_latin_extra)
6755                       && VECTORP (Vlatin_extra_code_table)
6756                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6757             encodable = 1;
6758           if (encodable)
6759             prev = tail;
6760           else
6761             {
6762               /* Exclude this coding system from SAFE_CODINGS.  */
6763               if (EQ (tail, safe_codings))
6764                 {
                       /* Dropping the head: just advance the list.  */
6765                   safe_codings = XCDR (safe_codings);
6766                   if (NILP (safe_codings))
6767                     goto done_safe_codings;
6768                 }
6769               else
6770                 XSETCDR (prev, XCDR (tail));
6771             }
6772         }
6773     }
6774
6775  done_safe_codings:
6776   /* If the above loop was terminated before P reaches PEND, it means
6777      SAFE_CODINGS was set to nil.  If we have not yet found a
6778      non-ASCII single-byte char, check it now.  */
6779   if (! *single_byte_char_found)
6780     while (p < pend)
6781       {
6782         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6783         p += len;
6784         if (! ASCII_BYTE_P (c)
6785             && SINGLE_BYTE_CHAR_P (c))
6786           {
6787             *single_byte_char_found = 1;
6788             break;
6789           }
6790       }
6791   return safe_codings;
6792 }
6793
6794 DEFUN ("find-coding-systems-region-internal",
6795        Ffind_coding_systems_region_internal,
6796        Sfind_coding_systems_region_internal, 2, 2, 0,
6797        doc: /* Internal use only.  */)
6798      (start, end)
6799      Lisp_Object start, end;
6800 {
       /* Find the coding systems that can encode a text.  If START is a
          string, the text is that string and END is not looked at;
          otherwise START and END are positions delimiting a region of
          the current buffer.

          Return t if the text is unibyte or contains only ASCII bytes.
          Otherwise return a list of coding system symbols: the
          candidates from Vcoding_system_safe_chars that survive
          find_safe_codings, followed by either raw-text, emacs-mule and
          no-conversion (when a non-ASCII single byte char was found) or
          a copy of the car of Vcoding_system_safe_chars.  */
6801   Lisp_Object work_table, safe_codings;
6802   int non_ascii_p = 0;
6803   int single_byte_char_found = 0;
       /* The text is scanned as up to two byte ranges, P1..P1END and
          P2..P2END; the second one is empty unless the buffer gap lies
          inside the region.  */
6804   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6805
6806   if (STRINGP (start))
6807     {
6808       if (!STRING_MULTIBYTE (start))
6809         return Qt;
6810       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6811       p2 = p2end = p1end;
           /* More bytes than characters means that some character
              takes more than one byte.  */
6812       if (SCHARS (start) != SBYTES (start))
6813         non_ascii_p = 1;
6814     }
6815   else
6816     {
6817       int from, to, stop;
6818
6819       CHECK_NUMBER_COERCE_MARKER (start);
6820       CHECK_NUMBER_COERCE_MARKER (end);
6821       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6822         args_out_of_range (start, end);
6823       if (NILP (current_buffer->enable_multibyte_characters))
6824         return Qt;
           /* FROM, TO and STOP are byte positions.  STOP is where the
              first range ends: at the gap if the gap is strictly inside
              the region, else at TO.  */
6825       from = CHAR_TO_BYTE (XINT (start));
6826       to = CHAR_TO_BYTE (XINT (end));
6827       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6828       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6829       if (stop == to)
6830         p2 = p2end = p1end;
6831       else
6832         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6833       if (XINT (end) - XINT (start) != to - from)
6834         non_ascii_p = 1;
6835     }
6836
6837   if (!non_ascii_p)
6838     {
6839       /* We are sure that the text contains no multibyte character.
6840          Check if it contains eight-bit-graphic.  */
           /* NOTE(review): this assignment is repeated by the
              initializer of the loop on the next line.  */
6841       p = p1;
6842       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6843       if (p == p1end)
6844         {
6845           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6846           if (p == p2end)
6847             return Qt;
6848         }
6849     }
6850
6851   /* The text contains non-ASCII characters.  */
6852
       /* Work on a copy of the candidate list, because
          find_safe_codings unlinks and replaces elements
          destructively.  */
6853   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6854   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6855
6856   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6857                                     &single_byte_char_found);
6858   if (p2 < p2end)
6859     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6860                                       &single_byte_char_found);
       /* NOTE(review): SAFE_CODINGS starts out as a copy made by
          Fcopy_sequence, so this EQ test looks like it can succeed only
          when the candidate list is empty -- confirm the intent.  */
6861   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6862     safe_codings = Qt;
6863   else
6864     {
6865       /* Turn safe_codings to a list of coding systems... */
6866       Lisp_Object val;
6867
6868       if (single_byte_char_found)
6869         /* ... and append these for eight-bit chars.  */
6870         val = Fcons (Qraw_text,
6871                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6872       else
6873         /* ... and append generic coding systems.  */
6874         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6875
           /* Push the coding system symbol of each surviving entry onto
              the front of VAL; this reverses their order.  */
6876       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6877         val = Fcons (XCAR (XCAR (safe_codings)), val);
6878       safe_codings = val;
6879     }
6880
6881   return safe_codings;
6882 }
6883
6884
6885 /* Search from position POS for such characters that are unencodable
6886    accoding to SAFE_CHARS, and return a list of their positions.  P
6887    points where in the memory the character at POS exists.  Limit the
6888    search at PEND or when Nth unencodable characters are found.
6889
6890    If SAFE_CHARS is a char table, an element for an unencodable
6891    character is nil.
6892
6893    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6894
6895    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6896    eight-bit-graphic characters are unencodable.  */
6897
6898 static Lisp_Object
6899 unencodable_char_position (safe_chars, pos, p, pend, n)
6900      Lisp_Object safe_chars;
6901      int pos;
6902      unsigned char *p, *pend;
6903      int n;
6904 {
6905   Lisp_Object pos_list;
6906
6907   pos_list = Qnil;
6908   while (p < pend)
6909     {
6910       int len;
6911       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6912
6913       if (c >= 128
6914           && (CHAR_TABLE_P (safe_chars)
6915               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6916               : (NILP (safe_chars) || c < 256)))
6917         {
6918           pos_list = Fcons (make_number (pos), pos_list);
6919           if (--n <= 0)
6920             break;
6921         }
6922       pos++;
6923       p += len;
6924     }
6925   return Fnreverse (pos_list);
6926 }
6927
6928
6929 DEFUN ("unencodable-char-position", Funencodable_char_position,
6930        Sunencodable_char_position, 3, 5, 0,
6931        doc: /*
6932 Return position of first un-encodable character in a region.
6933 START and END specfiy the region and CODING-SYSTEM specifies the
6934 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6935
6936 If optional 4th argument COUNT is non-nil, it specifies at most how
6937 many un-encodable characters to search.  In this case, the value is a
6938 list of positions.
6939
6940 If optional 5th argument STRING is non-nil, it is a string to search
6941 for un-encodable characters.  In that case, START and END are indexes
6942 to the string.  */)
6943      (start, end, coding_system, count, string)
6944      Lisp_Object start, end, coding_system, count, string;
6945 {
6946   int n;
6947   Lisp_Object safe_chars;
6948   struct coding_system coding;
6949   Lisp_Object positions;
6950   int from, to;
6951   unsigned char *p, *pend;
6952
6953   if (NILP (string))
6954     {
6955       validate_region (&start, &end);
6956       from = XINT (start);
6957       to = XINT (end);
6958       if (NILP (current_buffer->enable_multibyte_characters))
6959         return Qnil;
6960       p = CHAR_POS_ADDR (from);
6961       if (to == GPT)
6962         pend = GPT_ADDR;
6963       else
6964         pend = CHAR_POS_ADDR (to);
6965     }
6966   else
6967     {
6968       CHECK_STRING (string);
6969       CHECK_NATNUM (start);
6970       CHECK_NATNUM (end);
6971       from = XINT (start);
6972       to = XINT (end);
6973       if (from > to
6974           || to > SCHARS (string))
6975         args_out_of_range_3 (string, start, end);
6976       if (! STRING_MULTIBYTE (string))
6977         return Qnil;
6978       p = SDATA (string) + string_char_to_byte (string, from);
6979       pend = SDATA (string) + string_char_to_byte (string, to);
6980     }
6981
6982   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6983
6984   if (NILP (count))
6985     n = 1;
6986   else
6987     {
6988       CHECK_NATNUM (count);
6989       n = XINT (count);
6990     }
6991
6992   if (coding.type == coding_type_no_conversion
6993       || coding.type == coding_type_raw_text)
6994     return Qnil;
6995
6996   if (coding.type == coding_type_undecided)
6997     safe_chars = Qnil;
6998   else
6999     safe_chars = coding_safe_chars (coding_system);
7000
7001   if (STRINGP (string)
7002       || from >= GPT || to <= GPT)
7003     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7004   else
7005     {
7006       Lisp_Object args[2];
7007
7008       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7009       n -= XINT (Flength (args[0]));
7010       if (n <= 0)
7011         positions = args[0];
7012       else
7013         {
7014           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7015                                                pend, n);
7016           positions = Fappend (2, args);
7017         }
7018     }
7019
7020   return  (NILP (count) ? Fcar (positions) : positions);
7021 }
7022
7023
7024 Lisp_Object
7025 code_convert_region1 (start, end, coding_system, encodep)
7026      Lisp_Object start, end, coding_system;
7027      int encodep;
7028 {
7029   struct coding_system coding;
7030   int from, to;
7031
7032   CHECK_NUMBER_COERCE_MARKER (start);
7033   CHECK_NUMBER_COERCE_MARKER (end);
7034   CHECK_SYMBOL (coding_system);
7035
7036   validate_region (&start, &end);
7037   from = XFASTINT (start);
7038   to = XFASTINT (end);
7039
7040   if (NILP (coding_system))
7041     return make_number (to - from);
7042
7043   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7044     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7045
7046   coding.mode |= CODING_MODE_LAST_BLOCK;
7047   coding.src_multibyte = coding.dst_multibyte
7048     = !NILP (current_buffer->enable_multibyte_characters);
7049   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7050                        &coding, encodep, 1);
7051   Vlast_coding_system_used = coding.symbol;
7052   return make_number (coding.produced_char);
7053 }
7054
7055 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7056        3, 3, "r\nzCoding system: ",
7057        doc: /* Decode the current region from the specified coding system.
7058 When called from a program, takes three arguments:
7059 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7060 This function sets `last-coding-system-used' to the precise coding system
7061 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7062 not fully specified.)
7063 It returns the length of the decoded text.  */)
7064      (start, end, coding_system)
7065      Lisp_Object start, end, coding_system;
7066 {
7067   return code_convert_region1 (start, end, coding_system, 0);
7068 }
7069
7070 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7071        3, 3, "r\nzCoding system: ",
7072        doc: /* Encode the current region into the specified coding system.
7073 When called from a program, takes three arguments:
7074 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7075 This function sets `last-coding-system-used' to the precise coding system
7076 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7077 not fully specified.)
7078 It returns the length of the encoded text.  */)
7079      (start, end, coding_system)
7080      Lisp_Object start, end, coding_system;
7081 {
7082   return code_convert_region1 (start, end, coding_system, 1);
7083 }
7084
7085 Lisp_Object
7086 code_convert_string1 (string, coding_system, nocopy, encodep)
7087      Lisp_Object string, coding_system, nocopy;
7088      int encodep;
7089 {
7090   struct coding_system coding;
7091
7092   CHECK_STRING (string);
7093   CHECK_SYMBOL (coding_system);
7094
7095   if (NILP (coding_system))
7096     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7097
7098   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7099     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7100
7101   coding.mode |= CODING_MODE_LAST_BLOCK;
7102   string = (encodep
7103             ? encode_coding_string (string, &coding, !NILP (nocopy))
7104             : decode_coding_string (string, &coding, !NILP (nocopy)));
7105   Vlast_coding_system_used = coding.symbol;
7106
7107   return string;
7108 }
7109
7110 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7111        2, 3, 0,
7112        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7113 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7114 if the decoding operation is trivial.
7115 This function sets `last-coding-system-used' to the precise coding system
7116 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7117 not fully specified.)  */)
7118      (string, coding_system, nocopy)
7119      Lisp_Object string, coding_system, nocopy;
7120 {
7121   return code_convert_string1 (string, coding_system, nocopy, 0);
7122 }
7123
7124 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7125        2, 3, 0,
7126        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7127 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7128 if the encoding operation is trivial.
7129 This function sets `last-coding-system-used' to the precise coding system
7130 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7131 not fully specified.)  */)
7132      (string, coding_system, nocopy)
7133      Lisp_Object string, coding_system, nocopy;
7134 {
7135   return code_convert_string1 (string, coding_system, nocopy, 1);
7136 }
7137
7138 /* Encode or decode STRING according to CODING_SYSTEM.
7139    Do not set Vlast_coding_system_used.
7140
7141    This function is called only from macros DECODE_FILE and
7142    ENCODE_FILE, thus we ignore character composition.  */
7143
7144 Lisp_Object
7145 code_convert_string_norecord (string, coding_system, encodep)
7146      Lisp_Object string, coding_system;
7147      int encodep;
7148 {
7149   struct coding_system coding;
7150
7151   CHECK_STRING (string);
7152   CHECK_SYMBOL (coding_system);
7153
7154   if (NILP (coding_system))
7155     return string;
7156
7157   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7158     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7159
7160   coding.composing = COMPOSITION_DISABLED;
7161   coding.mode |= CODING_MODE_LAST_BLOCK;
7162   return (encodep
7163           ? encode_coding_string (string, &coding, 1)
7164           : decode_coding_string (string, &coding, 1));
7165 }
7166 \f
7167 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7168        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7169 Return the corresponding character.  */)
7170      (code)
7171      Lisp_Object code;
7172 {
7173   unsigned char c1, c2, s1, s2;
7174   Lisp_Object val;
7175
7176   CHECK_NUMBER (code);
7177   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7178   if (s1 == 0)
7179     {
7180       if (s2 < 0x80)
7181         XSETFASTINT (val, s2);
7182       else if (s2 >= 0xA0 || s2 <= 0xDF)
7183         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7184       else
7185         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7186     }
7187   else
7188     {
7189       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7190           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7191         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7192       DECODE_SJIS (s1, s2, c1, c2);
7193       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7194     }
7195   return val;
7196 }
7197
7198 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7199        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7200 Return the corresponding code in SJIS.  */)
7201      (ch)
7202      Lisp_Object ch;
7203 {
7204   int charset, c1, c2, s1, s2;
7205   Lisp_Object val;
7206
7207   CHECK_NUMBER (ch);
7208   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7209   if (charset == CHARSET_ASCII)
7210     {
7211       val = ch;
7212     }
7213   else if (charset == charset_jisx0208
7214            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7215     {
7216       ENCODE_SJIS (c1, c2, s1, s2);
7217       XSETFASTINT (val, (s1 << 8) | s2);
7218     }
7219   else if (charset == charset_katakana_jisx0201
7220            && c1 > 0x20 && c2 < 0xE0)
7221     {
7222       XSETFASTINT (val, c1 | 0x80);
7223     }
7224   else
7225     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7226   return val;
7227 }
7228
7229 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7230        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7231 Return the corresponding character.  */)
7232      (code)
7233      Lisp_Object code;
7234 {
7235   int charset;
7236   unsigned char b1, b2, c1, c2;
7237   Lisp_Object val;
7238
7239   CHECK_NUMBER (code);
7240   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7241   if (b1 == 0)
7242     {
7243       if (b2 >= 0x80)
7244         error ("Invalid BIG5 code: %x", XFASTINT (code));
7245       val = code;
7246     }
7247   else
7248     {
7249       if ((b1 < 0xA1 || b1 > 0xFE)
7250           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7251         error ("Invalid BIG5 code: %x", XFASTINT (code));
7252       DECODE_BIG5 (b1, b2, charset, c1, c2);
7253       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7254     }
7255   return val;
7256 }
7257
7258 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7259        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7260 Return the corresponding character code in Big5.  */)
7261      (ch)
7262      Lisp_Object ch;
7263 {
7264   int charset, c1, c2, b1, b2;
7265   Lisp_Object val;
7266
7267   CHECK_NUMBER (ch);
7268   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7269   if (charset == CHARSET_ASCII)
7270     {
7271       val = ch;
7272     }
7273   else if ((charset == charset_big5_1
7274             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7275            || (charset == charset_big5_2
7276                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7277     {
7278       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7279       XSETFASTINT (val, (b1 << 8) | b2);
7280     }
7281   else
7282     error ("Can't encode to Big5: %d", XFASTINT (ch));
7283   return val;
7284 }
7285 \f
7286 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7287        Sset_terminal_coding_system_internal, 1, 1, 0,
7288        doc: /* Internal use only.  */)
7289      (coding_system)
7290      Lisp_Object coding_system;
7291 {
7292   CHECK_SYMBOL (coding_system);
7293   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7294   /* We had better not send unsafe characters to terminal.  */
7295   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7296   /* Character composition should be disabled.  */
7297   terminal_coding.composing = COMPOSITION_DISABLED;
7298   /* Error notification should be suppressed.  */
7299   terminal_coding.suppress_error = 1;
7300   terminal_coding.src_multibyte = 1;
7301   terminal_coding.dst_multibyte = 0;
7302   return Qnil;
7303 }
7304
7305 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7306        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7307        doc: /* Internal use only.  */)
7308      (coding_system)
7309      Lisp_Object coding_system;
7310 {
7311   CHECK_SYMBOL (coding_system);
7312   setup_coding_system (Fcheck_coding_system (coding_system),
7313                        &safe_terminal_coding);
7314   /* Character composition should be disabled.  */
7315   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7316   /* Error notification should be suppressed.  */
7317   safe_terminal_coding.suppress_error = 1;
7318   safe_terminal_coding.src_multibyte = 1;
7319   safe_terminal_coding.dst_multibyte = 0;
7320   return Qnil;
7321 }
7322
7323 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7324        Sterminal_coding_system, 0, 0, 0,
7325        doc: /* Return coding system specified for terminal output.  */)
7326      ()
7327 {
7328   return terminal_coding.symbol;
7329 }
7330
7331 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7332        Sset_keyboard_coding_system_internal, 1, 1, 0,
7333        doc: /* Internal use only.  */)
7334      (coding_system)
7335      Lisp_Object coding_system;
7336 {
7337   CHECK_SYMBOL (coding_system);
7338   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7339   /* Character composition should be disabled.  */
7340   keyboard_coding.composing = COMPOSITION_DISABLED;
7341   return Qnil;
7342 }
7343
7344 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7345        Skeyboard_coding_system, 0, 0, 0,
7346        doc: /* Return coding system specified for decoding keyboard input.  */)
7347      ()
7348 {
7349   return keyboard_coding.symbol;
7350 }
7351
7352 \f
7353 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7354        Sfind_operation_coding_system,  1, MANY, 0,
7355        doc: /* Choose a coding system for an operation based on the target name.
7356 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7357 DECODING-SYSTEM is the coding system to use for decoding
7358 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7359 for encoding (in case OPERATION does encoding).
7360
7361 The first argument OPERATION specifies an I/O primitive:
7362   For file I/O, `insert-file-contents' or `write-region'.
7363   For process I/O, `call-process', `call-process-region', or `start-process'.
7364   For network I/O, `open-network-stream'.
7365
7366 The remaining arguments should be the same arguments that were passed
7367 to the primitive.  Depending on which primitive, one of those arguments
7368 is selected as the TARGET.  For example, if OPERATION does file I/O,
7369 whichever argument specifies the file name is TARGET.
7370
7371 TARGET has a meaning which depends on OPERATION:
7372   For file I/O, TARGET is a file name.
7373   For process I/O, TARGET is a process name.
7374   For network I/O, TARGET is a service name or a port number
7375
7376 This function looks up what specified for TARGET in,
7377 `file-coding-system-alist', `process-coding-system-alist',
7378 or `network-coding-system-alist' depending on OPERATION.
7379 They may specify a coding system, a cons of coding systems,
7380 or a function symbol to call.
7381 In the last case, we call the function with one argument,
7382 which is a list of all the arguments given to this function.
7383
7384 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7385      (nargs, args)
7386      int nargs;
7387      Lisp_Object *args;
7388 {
7389   Lisp_Object operation, target_idx, target, val;
7390   register Lisp_Object chain;
7391
7392   if (nargs < 2)
7393     error ("Too few arguments");
7394   operation = args[0];
7395   if (!SYMBOLP (operation)
7396       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7397     error ("Invalid first argument");
7398   if (nargs < 1 + XINT (target_idx))
7399     error ("Too few arguments for operation: %s",
7400            SDATA (SYMBOL_NAME (operation)));
7401   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7402      argument to write-region) is string, it must be treated as a
7403      target file name.  */
7404   if (EQ (operation, Qwrite_region)
7405       && nargs > 5
7406       && STRINGP (args[5]))
7407     target_idx = make_number (4);
7408   target = args[XINT (target_idx) + 1];
7409   if (!(STRINGP (target)
7410         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7411     error ("Invalid argument %d", XINT (target_idx) + 1);
7412
7413   chain = ((EQ (operation, Qinsert_file_contents)
7414             || EQ (operation, Qwrite_region))
7415            ? Vfile_coding_system_alist
7416            : (EQ (operation, Qopen_network_stream)
7417               ? Vnetwork_coding_system_alist
7418               : Vprocess_coding_system_alist));
7419   if (NILP (chain))
7420     return Qnil;
7421
7422   for (; CONSP (chain); chain = XCDR (chain))
7423     {
7424       Lisp_Object elt;
7425       elt = XCAR (chain);
7426
7427       if (CONSP (elt)
7428           && ((STRINGP (target)
7429                && STRINGP (XCAR (elt))
7430                && fast_string_match (XCAR (elt), target) >= 0)
7431               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7432         {
7433           val = XCDR (elt);
7434           /* Here, if VAL is both a valid coding system and a valid
7435              function symbol, we return VAL as a coding system.  */
7436           if (CONSP (val))
7437             return val;
7438           if (! SYMBOLP (val))
7439             return Qnil;
7440           if (! NILP (Fcoding_system_p (val)))
7441             return Fcons (val, val);
7442           if (! NILP (Ffboundp (val)))
7443             {
7444               val = call1 (val, Flist (nargs, args));
7445               if (CONSP (val))
7446                 return val;
7447               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7448                 return Fcons (val, val);
7449             }
7450           return Qnil;
7451         }
7452     }
7453   return Qnil;
7454 }
7455
7456 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7457        Supdate_coding_systems_internal, 0, 0, 0,
7458        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7459 When values of any coding categories are changed, you must
7460 call this function.  */)
7461      ()
7462 {
7463   int i;
7464
7465   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7466     {
7467       Lisp_Object val;
7468
7469       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7470       if (!NILP (val))
7471         {
7472           if (! coding_system_table[i])
7473             coding_system_table[i] = ((struct coding_system *)
7474                                       xmalloc (sizeof (struct coding_system)));
7475           setup_coding_system (val, coding_system_table[i]);
7476         }
7477       else if (coding_system_table[i])
7478         {
7479           xfree (coding_system_table[i]);
7480           coding_system_table[i] = NULL;
7481         }
7482     }
7483
7484   return Qnil;
7485 }
7486
7487 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7488        Sset_coding_priority_internal, 0, 0, 0,
7489        doc: /* Update internal database for the current value of `coding-category-list'.
7490 This function is internal use only.  */)
7491      ()
7492 {
7493   int i = 0, idx;
7494   Lisp_Object val;
7495
7496   val = Vcoding_category_list;
7497
7498   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7499     {
7500       if (! SYMBOLP (XCAR (val)))
7501         break;
7502       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7503       if (idx >= CODING_CATEGORY_IDX_MAX)
7504         break;
7505       coding_priorities[i++] = (1 << idx);
7506       val = XCDR (val);
7507     }
7508   /* If coding-category-list is valid and contains all coding
7509      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7510      the following code saves Emacs from crashing.  */
7511   while (i < CODING_CATEGORY_IDX_MAX)
7512     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7513
7514   return Qnil;
7515 }
7516
7517 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7518        Sdefine_coding_system_internal, 1, 1, 0,
7519        doc: /* Register CODING-SYSTEM as a base coding system.
7520 This function is internal use only.  */)
7521      (coding_system)
7522      Lisp_Object coding_system;
7523 {
7524   Lisp_Object safe_chars, slot;
7525
7526   if (NILP (Fcheck_coding_system (coding_system)))
7527     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7528   safe_chars = coding_safe_chars (coding_system);
7529   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7530     error ("No valid safe-chars property for %s",
7531            SDATA (SYMBOL_NAME (coding_system)));
7532   if (EQ (safe_chars, Qt))
7533     {
7534       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7535         XSETCAR (Vcoding_system_safe_chars,
7536                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7537     }
7538   else
7539     {
7540       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7541       if (NILP (slot))
7542         XSETCDR (Vcoding_system_safe_chars,
7543                  nconc2 (XCDR (Vcoding_system_safe_chars),
7544                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7545       else
7546         XSETCDR (slot, safe_chars);
7547     }
7548   return Qnil;
7549 }
7550
7551 #endif /* emacs */
7552
7553 \f
7554 /*** 9. Post-amble ***/
7555
7556 void
7557 init_coding_once ()
7558 {
7559   int i;
7560
7561   /* Emacs' internal format specific initialize routine.  */
7562   for (i = 0; i <= 0x20; i++)
7563     emacs_code_class[i] = EMACS_control_code;
7564   emacs_code_class[0x0A] = EMACS_linefeed_code;
7565   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7566   for (i = 0x21 ; i < 0x7F; i++)
7567     emacs_code_class[i] = EMACS_ascii_code;
7568   emacs_code_class[0x7F] = EMACS_control_code;
7569   for (i = 0x80; i < 0xFF; i++)
7570     emacs_code_class[i] = EMACS_invalid_code;
7571   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7572   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7573   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7574   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7575
7576   /* ISO2022 specific initialize routine.  */
7577   for (i = 0; i < 0x20; i++)
7578     iso_code_class[i] = ISO_control_0;
7579   for (i = 0x21; i < 0x7F; i++)
7580     iso_code_class[i] = ISO_graphic_plane_0;
7581   for (i = 0x80; i < 0xA0; i++)
7582     iso_code_class[i] = ISO_control_1;
7583   for (i = 0xA1; i < 0xFF; i++)
7584     iso_code_class[i] = ISO_graphic_plane_1;
7585   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7586   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7587   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7588   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7589   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7590   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7591   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7592   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7593   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7594   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7595
7596   setup_coding_system (Qnil, &keyboard_coding);
7597   setup_coding_system (Qnil, &terminal_coding);
7598   setup_coding_system (Qnil, &safe_terminal_coding);
7599   setup_coding_system (Qnil, &default_buffer_file_coding);
7600
7601   bzero (coding_system_table, sizeof coding_system_table);
7602
7603   bzero (ascii_skip_code, sizeof ascii_skip_code);
7604   for (i = 0; i < 128; i++)
7605     ascii_skip_code[i] = 1;
7606
7607 #if defined (MSDOS) || defined (WINDOWSNT)
7608   system_eol_type = CODING_EOL_CRLF;
7609 #else
7610   system_eol_type = CODING_EOL_LF;
7611 #endif
7612
7613   inhibit_pre_post_conversion = 0;
7614 }
7615
7616 #ifdef emacs
7617
7618 void
7619 syms_of_coding ()
7620 {
7621   staticpro (&Vcode_conversion_workbuf_name);
7622   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7623
7624   Qtarget_idx = intern ("target-idx");
7625   staticpro (&Qtarget_idx);
7626
7627   Qcoding_system_history = intern ("coding-system-history");
7628   staticpro (&Qcoding_system_history);
7629   Fset (Qcoding_system_history, Qnil);
7630
7631   /* Target FILENAME is the first argument.  */
7632   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7633   /* Target FILENAME is the third argument.  */
7634   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7635
7636   Qcall_process = intern ("call-process");
7637   staticpro (&Qcall_process);
7638   /* Target PROGRAM is the first argument.  */
7639   Fput (Qcall_process, Qtarget_idx, make_number (0));
7640
7641   Qcall_process_region = intern ("call-process-region");
7642   staticpro (&Qcall_process_region);
7643   /* Target PROGRAM is the third argument.  */
7644   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7645
7646   Qstart_process = intern ("start-process");
7647   staticpro (&Qstart_process);
7648   /* Target PROGRAM is the third argument.  */
7649   Fput (Qstart_process, Qtarget_idx, make_number (2));
7650
7651   Qopen_network_stream = intern ("open-network-stream");
7652   staticpro (&Qopen_network_stream);
7653   /* Target SERVICE is the fourth argument.  */
7654   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7655
7656   Qcoding_system = intern ("coding-system");
7657   staticpro (&Qcoding_system);
7658
7659   Qeol_type = intern ("eol-type");
7660   staticpro (&Qeol_type);
7661
7662   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7663   staticpro (&Qbuffer_file_coding_system);
7664
7665   Qpost_read_conversion = intern ("post-read-conversion");
7666   staticpro (&Qpost_read_conversion);
7667
7668   Qpre_write_conversion = intern ("pre-write-conversion");
7669   staticpro (&Qpre_write_conversion);
7670
7671   Qno_conversion = intern ("no-conversion");
7672   staticpro (&Qno_conversion);
7673
7674   Qundecided = intern ("undecided");
7675   staticpro (&Qundecided);
7676
7677   Qcoding_system_p = intern ("coding-system-p");
7678   staticpro (&Qcoding_system_p);
7679
7680   Qcoding_system_error = intern ("coding-system-error");
7681   staticpro (&Qcoding_system_error);
7682
7683   Fput (Qcoding_system_error, Qerror_conditions,
7684         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7685   Fput (Qcoding_system_error, Qerror_message,
7686         build_string ("Invalid coding system"));
7687
7688   Qcoding_category = intern ("coding-category");
7689   staticpro (&Qcoding_category);
7690   Qcoding_category_index = intern ("coding-category-index");
7691   staticpro (&Qcoding_category_index);
7692
7693   Vcoding_category_table
7694     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7695   staticpro (&Vcoding_category_table);
7696   {
7697     int i;
7698     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7699       {
7700         XVECTOR (Vcoding_category_table)->contents[i]
7701           = intern (coding_category_name[i]);
7702         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7703               Qcoding_category_index, make_number (i));
7704       }
7705   }
7706
7707   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7708   staticpro (&Vcoding_system_safe_chars);
7709
7710   Qtranslation_table = intern ("translation-table");
7711   staticpro (&Qtranslation_table);
7712   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7713
7714   Qtranslation_table_id = intern ("translation-table-id");
7715   staticpro (&Qtranslation_table_id);
7716
7717   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7718   staticpro (&Qtranslation_table_for_decode);
7719
7720   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7721   staticpro (&Qtranslation_table_for_encode);
7722
7723   Qsafe_chars = intern ("safe-chars");
7724   staticpro (&Qsafe_chars);
7725
7726   Qchar_coding_system = intern ("char-coding-system");
7727   staticpro (&Qchar_coding_system);
7728
7729   /* Intern this now in case it isn't already done.
7730      Setting this variable twice is harmless.
7731      But don't staticpro it here--that is done in alloc.c.  */
7732   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7733   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7734   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7735
7736   Qvalid_codes = intern ("valid-codes");
7737   staticpro (&Qvalid_codes);
7738
7739   Qemacs_mule = intern ("emacs-mule");
7740   staticpro (&Qemacs_mule);
7741
7742   Qraw_text = intern ("raw-text");
7743   staticpro (&Qraw_text);
7744
7745   Qutf_8 = intern ("utf-8");
7746   staticpro (&Qutf_8);
7747
7748   Qcoding_system_define_form = intern ("coding-system-define-form");
7749   staticpro (&Qcoding_system_define_form);
7750
7751   defsubr (&Scoding_system_p);
7752   defsubr (&Sread_coding_system);
7753   defsubr (&Sread_non_nil_coding_system);
7754   defsubr (&Scheck_coding_system);
7755   defsubr (&Sdetect_coding_region);
7756   defsubr (&Sdetect_coding_string);
7757   defsubr (&Sfind_coding_systems_region_internal);
7758   defsubr (&Sunencodable_char_position);
7759   defsubr (&Sdecode_coding_region);
7760   defsubr (&Sencode_coding_region);
7761   defsubr (&Sdecode_coding_string);
7762   defsubr (&Sencode_coding_string);
7763   defsubr (&Sdecode_sjis_char);
7764   defsubr (&Sencode_sjis_char);
7765   defsubr (&Sdecode_big5_char);
7766   defsubr (&Sencode_big5_char);
7767   defsubr (&Sset_terminal_coding_system_internal);
7768   defsubr (&Sset_safe_terminal_coding_system_internal);
7769   defsubr (&Sterminal_coding_system);
7770   defsubr (&Sset_keyboard_coding_system_internal);
7771   defsubr (&Skeyboard_coding_system);
7772   defsubr (&Sfind_operation_coding_system);
7773   defsubr (&Supdate_coding_systems_internal);
7774   defsubr (&Sset_coding_priority_internal);
7775   defsubr (&Sdefine_coding_system_internal);
7776
7777   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7778                doc: /* List of coding systems.
7779
7780 Do not alter the value of this variable manually.  This variable should be
7781 updated by the functions `make-coding-system' and
7782 `define-coding-system-alias'.  */);
7783   Vcoding_system_list = Qnil;
7784
7785   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7786                doc: /* Alist of coding system names.
7787 Each element is one element list of coding system name.
7788 This variable is given to `completing-read' as TABLE argument.
7789
7790 Do not alter the value of this variable manually.  This variable should be
7791 updated by the functions `make-coding-system' and
7792 `define-coding-system-alias'.  */);
7793   Vcoding_system_alist = Qnil;
7794
7795   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7796                doc: /* List of coding-categories (symbols) ordered by priority.
7797
7798 On detecting a coding system, Emacs tries code detection algorithms
7799 associated with each coding-category one by one in this order.  When
7800 one algorithm agrees with a byte sequence of source text, the coding
7801 system bound to the corresponding coding-category is selected.  */);
7802   {
7803     int i;
7804
7805     Vcoding_category_list = Qnil;
7806     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7807       Vcoding_category_list
7808         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7809                  Vcoding_category_list);
7810   }
7811
7812   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7813                doc: /* Specify the coding system for read operations.
7814 It is useful to bind this variable with `let', but do not set it globally.
7815 If the value is a coding system, it is used for decoding on read operation.
7816 If not, an appropriate element is used from one of the coding system alists:
7817 There are three such tables, `file-coding-system-alist',
7818 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7819   Vcoding_system_for_read = Qnil;
7820
7821   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7822                doc: /* Specify the coding system for write operations.
7823 Programs bind this variable with `let', but you should not set it globally.
7824 If the value is a coding system, it is used for encoding of output,
7825 when writing it to a file and when sending it to a file or subprocess.
7826
7827 If this does not specify a coding system, an appropriate element
7828 is used from one of the coding system alists:
7829 There are three such tables, `file-coding-system-alist',
7830 `process-coding-system-alist', and `network-coding-system-alist'.
7831 For output to files, if the above procedure does not specify a coding system,
7832 the value of `buffer-file-coding-system' is used.  */);
7833   Vcoding_system_for_write = Qnil;
7834
7835   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7836                doc: /* Coding system used in the latest file or process I/O.
7837 Also set by `encode-coding-region', `decode-coding-region',
7838 `encode-coding-string' and `decode-coding-string'.  */);
7839   Vlast_coding_system_used = Qnil;
7840
7841   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7842                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7843 See info node `Coding Systems' and info node `Text and Binary' concerning
7844 such conversion.  */);
7845   inhibit_eol_conversion = 0;
7846
7847   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7848                doc: /* Non-nil means process buffer inherits coding system of process output.
7849 Bind it to t if the process output is to be treated as if it were a file
7850 read from some filesystem.  */);
7851   inherit_process_coding_system = 0;
7852
7853   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7854                doc: /* Alist to decide a coding system to use for a file I/O operation.
7855 The format is ((PATTERN . VAL) ...),
7856 where PATTERN is a regular expression matching a file name,
7857 VAL is a coding system, a cons of coding systems, or a function symbol.
7858 If VAL is a coding system, it is used for both decoding and encoding
7859 the file contents.
7860 If VAL is a cons of coding systems, the car part is used for decoding,
7861 and the cdr part is used for encoding.
7862 If VAL is a function symbol, the function must return a coding system
7863 or a cons of coding systems which are used as above.  The function gets
7864 the arguments with which `find-operation-coding-system' was called.
7865
7866 See also the function `find-operation-coding-system'
7867 and the variable `auto-coding-alist'.  */);
7868   Vfile_coding_system_alist = Qnil;
7869
7870   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7871     doc: /* Alist to decide a coding system to use for a process I/O operation.
7872 The format is ((PATTERN . VAL) ...),
7873 where PATTERN is a regular expression matching a program name,
7874 VAL is a coding system, a cons of coding systems, or a function symbol.
7875 If VAL is a coding system, it is used for both decoding what received
7876 from the program and encoding what sent to the program.
7877 If VAL is a cons of coding systems, the car part is used for decoding,
7878 and the cdr part is used for encoding.
7879 If VAL is a function symbol, the function must return a coding system
7880 or a cons of coding systems which are used as above.
7881
7882 See also the function `find-operation-coding-system'.  */);
7883   Vprocess_coding_system_alist = Qnil;
7884
7885   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7886     doc: /* Alist to decide a coding system to use for a network I/O operation.
7887 The format is ((PATTERN . VAL) ...),
7888 where PATTERN is a regular expression matching a network service name
7889 or is a port number to connect to,
7890 VAL is a coding system, a cons of coding systems, or a function symbol.
7891 If VAL is a coding system, it is used for both decoding what received
7892 from the network stream and encoding what sent to the network stream.
7893 If VAL is a cons of coding systems, the car part is used for decoding,
7894 and the cdr part is used for encoding.
7895 If VAL is a function symbol, the function must return a coding system
7896 or a cons of coding systems which are used as above.
7897
7898 See also the function `find-operation-coding-system'.  */);
7899   Vnetwork_coding_system_alist = Qnil;
7900
7901   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7902                doc: /* Coding system to use with system messages.
7903 Also used for decoding keyboard input on X Window system.  */);
7904   Vlocale_coding_system = Qnil;
7905
7906   /* The eol mnemonics are reset in startup.el system-dependently.  */
7907   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7908                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7909   eol_mnemonic_unix = build_string (":");
7910
7911   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7912                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7913   eol_mnemonic_dos = build_string ("\\");
7914
7915   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7916                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7917   eol_mnemonic_mac = build_string ("/");
7918
7919   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7920                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7921   eol_mnemonic_undecided = build_string (":");
7922
7923   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7924                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7925   Venable_character_translation = Qt;
7926
7927   DEFVAR_LISP ("standard-translation-table-for-decode",
7928                &Vstandard_translation_table_for_decode,
7929                doc: /* Table for translating characters while decoding.  */);
7930   Vstandard_translation_table_for_decode = Qnil;
7931
7932   DEFVAR_LISP ("standard-translation-table-for-encode",
7933                &Vstandard_translation_table_for_encode,
7934                doc: /* Table for translating characters while encoding.  */);
7935   Vstandard_translation_table_for_encode = Qnil;
7936
7937   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7938                doc: /* Alist of charsets vs revision numbers.
7939 While encoding, if a charset (car part of an element) is found,
7940 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7941   Vcharset_revision_alist = Qnil;
7942
7943   DEFVAR_LISP ("default-process-coding-system",
7944                &Vdefault_process_coding_system,
7945                doc: /* Cons of coding systems used for process I/O by default.
7946 The car part is used for decoding a process output,
7947 the cdr part is used for encoding a text to be sent to a process.  */);
7948   Vdefault_process_coding_system = Qnil;
7949
7950   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7951                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7952 This is a vector of length 256.
7953 If Nth element is non-nil, the existence of code N in a file
7954 \(or output of subprocess) doesn't prevent it to be detected as
7955 a coding system of ISO 2022 variant which has a flag
7956 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7957 or reading output of a subprocess.
7958 Only 128th through 159th elements has a meaning.  */);
7959   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7960
7961   DEFVAR_LISP ("select-safe-coding-system-function",
7962                &Vselect_safe_coding_system_function,
7963                doc: /* Function to call to select safe coding system for encoding a text.
7964
7965 If set, this function is called to force a user to select a proper
7966 coding system which can encode the text in the case that a default
7967 coding system used in each operation can't encode the text.
7968
7969 The default value is `select-safe-coding-system' (which see).  */);
7970   Vselect_safe_coding_system_function = Qnil;
7971
7972   DEFVAR_BOOL ("coding-system-require-warning",
7973                &coding_system_require_warning,
7974                doc: /* Internal use only.
7975 If non-nil, on writing a file, `select-safe-coding-system-function' is
7976 called even if `coding-system-for-write' is non-nil.  The command
7977 `universal-coding-system-argument' binds this variable to t temporarily.  */);
7978   coding_system_require_warning = 0;
7979
7980
7981   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7982                &inhibit_iso_escape_detection,
7983                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7984
7985 By default, on reading a file, Emacs tries to detect how the text is
7986 encoded.  This code detection is sensitive to escape sequences.  If
7987 the sequence is valid as ISO2022, the code is determined as one of
7988 the ISO2022 encodings, and the file is decoded by the corresponding
7989 coding system (e.g. `iso-2022-7bit').
7990
7991 However, there may be a case that you want to read escape sequences in
7992 a file as is.  In such a case, you can set this variable to non-nil.
7993 Then, as the code detection ignores any escape sequences, no file is
7994 detected as encoded in some ISO2022 encoding.  The result is that all
7995 escape sequences become visible in a buffer.
7996
7997 The default value is nil, and it is strongly recommended not to change
7998 it.  That is because many Emacs Lisp source files that contain
7999 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8000 in Emacs's distribution, and they won't be decoded correctly on
8001 reading if you suppress escape sequence detection.
8002
8003 The other way to read escape sequences in a file without decoding is
8004 to explicitly specify some coding system that doesn't use ISO2022's
8005 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8006   inhibit_iso_escape_detection = 0;
8007
8008   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8009                doc: /* Char table for translating self-inserting characters.
8010 This is applied to the result of input methods, not their input.  See also
8011 `keyboard-translate-table'.  */);
8012     Vtranslation_table_for_input = Qnil;
8013 }
8014
8015 char *
8016 emacs_strerror (error_number)
8017      int error_number;
8018 {
8019   char *str;
8020
8021   synchronize_system_messages_locale ();
8022   str = strerror (error_number);
8023
8024   if (! NILP (Vlocale_coding_system))
8025     {
8026       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8027                                                       Vlocale_coding_system,
8028                                                       0);
8029       str = (char *) SDATA (dec);
8030     }
8031
8032   return str;
8033 }
8034
8035 #endif /* emacs */
8036
8037 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8038    (do not change this comment) */