src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995,97,1998,2002,2003  Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002,2003  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", we mean converting text from some other coding
  46   system to Emacs' internal format (emacs-mule), and when we say
  47   "encode", we mean converting text from emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  Before a coding system is actually used, however,
  90   the information about it is loaded into a structure of type `struct
  91   coding_system' for rapid processing.  See section 7 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      const unsigned char *source;
 151      unsigned char *destination;
 152      int src_bytes, dst_bytes;
 153 {
 154   ...
 155 }
 156 #endif
 157
 158 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 159
 160   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 161   internal multibyte format to CODING.  The resulting unibyte text
 162   goes to a place pointed to by DESTINATION, the length of which
 163   should not exceed DST_BYTES.
 164
 165   These functions set the information about original and encoded texts
 166   in the members `produced', `produced_char', `consumed', and
 167   `consumed_char' of the structure *CODING.  They also set the member
 168   `result' to one of CODING_FINISH_XXX indicating how the encoding
 169   finished.
 170
 171   DST_BYTES zero means that the source area and destination area are
 172   overlapped, which means that we can produce encoded text until it
  173   reaches the head of the not-yet-encoded source text.
 174
 175   Below is a template for these functions.  */
 176 #if 0
 177 static void
 178 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 179      struct coding_system *coding;
 180      unsigned char *source, *destination;
 181      int src_bytes, dst_bytes;
 182 {
 183   ...
 184 }
 185 #endif
 186
 187 /*** COMMONLY USED MACROS ***/
 188
  189 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  190    get one and two bytes from the source text respectively.  If there
  191    are not enough bytes in the source, they set `coding->result' to
  192    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.  The
  193    caller should set variables `coding', `src' and `src_end' to
  194    appropriate pointers in advance.  These macros are called from decoding
  195    routines `decode_coding_XXX', thus the source text is assumed unibyte.  */
 196
 197 #define ONE_MORE_BYTE(c1)                                       \
 198   do {                                                          \
 199     if (src >= src_end)                                         \
 200       {                                                         \
 201         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 202         goto label_end_of_loop;                                 \
 203       }                                                         \
 204     c1 = *src++;                                                \
 205   } while (0)
 206
 207 #define TWO_MORE_BYTES(c1, c2)                                  \
 208   do {                                                          \
 209     if (src + 1 >= src_end)                                     \
 210       {                                                         \
 211         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 212         goto label_end_of_loop;                                 \
 213       }                                                         \
 214     c1 = *src++;                                                \
 215     c2 = *src++;                                                \
 216   } while (0)
 217
 218
 219 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 220    form if MULTIBYTEP is nonzero.  */
 221
 222 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 223   do {                                                          \
 224     if (src >= src_end)                                         \
 225       {                                                         \
 226         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 227         goto label_end_of_loop;                                 \
 228       }                                                         \
 229     c1 = *src++;                                                \
 230     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 231       c1 = *src++ - 0x20;                                       \
 232   } while (0)
 233
  234 /* Set C to the next character at the source text pointed to by `src'.
  235    If there are not enough characters in the source, jump to
  236    `label_end_of_loop'.  The caller should set variables `coding',
  237    `src', `src_end', and `translation_table' to appropriate values
  238    in advance.  This macro is used in encoding routines
  239    `encode_coding_XXX', thus it assumes that the source text is in
  240    multibyte form except for 8-bit characters.  8-bit characters are
  241    in multibyte form if coding->src_multibyte is nonzero, else they
  242    are represented by a single byte.  */
 243
 244 #define ONE_MORE_CHAR(c)                                        \
 245   do {                                                          \
 246     int len = src_end - src;                                    \
 247     int bytes;                                                  \
 248     if (len <= 0)                                               \
 249       {                                                         \
 250         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 251         goto label_end_of_loop;                                 \
 252       }                                                         \
 253     if (coding->src_multibyte                                   \
 254         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 255       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 256     else                                                        \
 257       c = *src, bytes = 1;                                      \
 258     if (!NILP (translation_table))                              \
 259       c = translate_char (translation_table, c, -1, 0, 0);      \
 260     src += bytes;                                               \
 261   } while (0)
 262
 263
 264 /* Produce a multibyte form of character C to `dst'.  Jump to
 265    `label_end_of_loop' if there's not enough space at `dst'.
 266
 267    If we are now in the middle of a composition sequence, the decoded
 268    character may be ALTCHAR (for the current composition).  In that
 269    case, the character goes to coding->cmp_data->data instead of
 270    `dst'.
 271
 272    This macro is used in decoding routines.  */
 273
  274 #define EMIT_CHAR(c)                                                    \
  275   do {  /* First, store C at `dst' if the composition state allows.  */ \
  276     if (! COMPOSING_P (coding)                                          \
  277         || coding->composing == COMPOSITION_RELATIVE                    \
  278         || coding->composing == COMPOSITION_WITH_RULE)                  \
  279       {  /* Not composing, or the method is RELATIVE or WITH_RULE.  */  \
  280         int bytes = CHAR_BYTES (c);                                     \
  281         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
  282           {  /* No room; with dst_bytes zero the limit is `src'.  */    \
  283             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
  284             goto label_end_of_loop;                                     \
  285           }                                                             \
  286         dst += CHAR_STRING (c, dst);                                    \
  287         coding->produced_char++;                                        \
  288       }                                                                 \
  289     /* Second, record C as a component when composing, except RELATIVE.  */ \
  290     if (COMPOSING_P (coding)                                            \
  291         && coding->composing != COMPOSITION_RELATIVE)                   \
  292       {                                                                 \
  293         CODING_ADD_COMPOSITION_COMPONENT (coding, c); /* May reset coding->composing.  */ \
  294         coding->composition_rule_follows                                \
  295           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
  296       }                                                                 \
  297   } while (0)
 298
 299
 300 #define EMIT_ONE_BYTE(c)                                        \
 301   do {                                                          \
 302     if (dst >= (dst_bytes ? dst_end : src))                     \
 303       {                                                         \
 304         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 305         goto label_end_of_loop;                                 \
 306       }                                                         \
 307     *dst++ = c;                                                 \
 308   } while (0)
 309
 310 #define EMIT_TWO_BYTES(c1, c2)                                  \
 311   do {                                                          \
 312     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 313       {                                                         \
 314         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 315         goto label_end_of_loop;                                 \
 316       }                                                         \
 317     *dst++ = c1, *dst++ = c2;                                   \
 318   } while (0)
 319
 320 #define EMIT_BYTES(from, to)                                    \
 321   do {                                                          \
 322     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 323       {                                                         \
 324         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 325         goto label_end_of_loop;                                 \
 326       }                                                         \
 327     while (from < to)                                           \
 328       *dst++ = *from++;                                         \
 329   } while (0)
 330
 331 \f
 332 /*** 1. Preamble ***/
 333
 334 #ifdef emacs
 335 #include <config.h>
 336 #endif
 337
 338 #include <stdio.h>
 339
 340 #ifdef emacs
 341
 342 #include "lisp.h"
 343 #include "buffer.h"
 344 #include "charset.h"
 345 #include "composite.h"
 346 #include "ccl.h"
 347 #include "coding.h"
 348 #include "window.h"
 349 #include "intervals.h"
 350 #include "frame.h"
 351 #include "termhooks.h"
 352
 353 #else  /* not emacs */
 354
 355 #include "mulelib.h"
 356
 357 #endif /* not emacs */
 358
  359 Lisp_Object Qcoding_system, Qeol_type;  /* Qcoding_system: property holding a symbol's coding spec.  */
  360 Lisp_Object Qbuffer_file_coding_system;
  361 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  362 Lisp_Object Qno_conversion, Qundecided;
  363 Lisp_Object Qcoding_system_history;
  364 Lisp_Object Qsafe_chars;  /* Key looked up in a coding spec's plist by coding_safe_chars.  */
  365 Lisp_Object Qvalid_codes;
  366
  367 extern Lisp_Object Qinsert_file_contents, Qwrite_region;  /* Declared extern; presumably defined in another file.  */
  368 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
  369 Lisp_Object Qstart_process, Qopen_network_stream;
  370 Lisp_Object Qtarget_idx;
  371
  372 /* If a symbol has this property, the property value is a form that is
  373    evaluated to define the symbol as a coding system.  */
  374 Lisp_Object Qcoding_system_define_form;
  375
  376 Lisp_Object Vselect_safe_coding_system_function;
  377
  378 int coding_system_require_warning;
  379
  380 /* Mnemonic strings for the Unix, DOS and Mac end-of-line formats.  */
  381 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  382 /* Mnemonic string indicating that the end-of-line format is not yet
  383    decided.  */
  384 Lisp_Object eol_mnemonic_undecided;
  385
  386 /* End-of-line format decided by the system.  This is CODING_EOL_LF on
  387    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
  388 int system_eol_type;
 389
  390 #ifdef emacs
  391
  392 /* Information about which coding system is safe for which chars.
  393    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
  394
  395    GENERIC-LIST is a list of generic coding systems, i.e. those which
  396    can encode any character.
  397
  398    NON-GENERIC-ALIST is an alist mapping each non-generic coding system
  399    to the char table that contains its safe chars.  */
  400 Lisp_Object Vcoding_system_safe_chars;
  401
  402 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  403
  404 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  405
  406 /* The coding systems emacs-mule and raw-text convert only the
  407    end-of-line format.  */
  408 Lisp_Object Qemacs_mule, Qraw_text;
  409
  410 Lisp_Object Qutf_8;
  411
  412 /* Coding systems are passed between Emacs Lisp programs and internal C
  413    routines through the following three variables.  */
  414 /* Coding system for reading files and receiving data from a process.  */
  415 Lisp_Object Vcoding_system_for_read;
  416 /* Coding system for writing files and sending data to a process.  */
  417 Lisp_Object Vcoding_system_for_write;
  418 /* Coding system actually used in the latest I/O.  */
  419 Lisp_Object Vlast_coding_system_used;
  420
  421 /* A vector of length 256 which contains information about special
  422    Latin codes (especially for dealing with Microsoft codes).  */
  423 Lisp_Object Vlatin_extra_code_table;
  424
  425 /* Flag to inhibit code conversion of end-of-line format.  */
  426 int inhibit_eol_conversion;
  427
  428 /* Flag to inhibit ISO2022 escape sequence detection.  */
  429 int inhibit_iso_escape_detection;
  430
  431 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  432 int inherit_process_coding_system;
  433
  434 /* Coding system used to encode text for terminal display when the
  435    terminal coding system is nil.  */
  436 struct coding_system safe_terminal_coding;
  437
  438 /* Default coding system used to write a file.  */
  439 struct coding_system default_buffer_file_coding;
  440
  441 Lisp_Object Vfile_coding_system_alist;
  442 Lisp_Object Vprocess_coding_system_alist;
  443 Lisp_Object Vnetwork_coding_system_alist;
  444
  445 Lisp_Object Vlocale_coding_system;
  446
  447 #endif /* emacs */
 448
  449 Lisp_Object Qcoding_category, Qcoding_category_index;
  450
  451 /* List of symbols `coding-category-xxx' ordered by priority.  */
  452 Lisp_Object Vcoding_category_list;
  453
  454 /* Table of coding categories (Lisp symbols).  */
  455 Lisp_Object Vcoding_category_table;
  456
  457 /* Table of symbol names, one per coding category (presumably indexed by CODING_CATEGORY_IDX_XXX -- confirm in coding.h).  */
  458 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  459   "coding-category-emacs-mule",
  460   "coding-category-sjis",
  461   "coding-category-iso-7",
  462   "coding-category-iso-7-tight",
  463   "coding-category-iso-8-1",
  464   "coding-category-iso-8-2",
  465   "coding-category-iso-7-else",
  466   "coding-category-iso-8-else",
  467   "coding-category-ccl",
  468   "coding-category-big5",
  469   "coding-category-utf-8",
  470   "coding-category-utf-16-be",
  471   "coding-category-utf-16-le",
  472   "coding-category-raw-text",
  473   "coding-category-binary"
  474 };
  475
  476 /* Table of pointers to the coding system corresponding to each coding
  477    category.  */
  478 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
  479
  480 /* Table of coding category masks.  The Nth element is the mask of the
  481    coding category whose priority is N.  */
  482 static
  483 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 484
  485 /* Flag telling whether to look up a translation table on character
  486    code conversion.  */
  487 Lisp_Object Venable_character_translation;
  488 /* Standard translation table to look up on decoding (reading).  */
  489 Lisp_Object Vstandard_translation_table_for_decode;
  490 /* Standard translation table to look up on encoding (writing).  */
  491 Lisp_Object Vstandard_translation_table_for_encode;
  492
  493 Lisp_Object Qtranslation_table;
  494 Lisp_Object Qtranslation_table_id;
  495 Lisp_Object Qtranslation_table_for_decode;
  496 Lisp_Object Qtranslation_table_for_encode;
  497
  498 /* Alist of charsets vs revision number.  */
  499 Lisp_Object Vcharset_revision_alist;
  500
  501 /* Default coding systems used for process I/O.  */
  502 Lisp_Object Vdefault_process_coding_system;
  503
  504 /* Char table for translating Quail and self-inserting input.  */
  505 Lisp_Object Vtranslation_table_for_input;
  506
  507 /* Global flag telling that post-read-conversion and pre-write-conversion
  508    functions must not be called.  Usually the value is zero, but it
  509    is set to 1 temporarily while such functions are running.  This
  510    avoids infinitely recursive calls.  */
  511 static int inhibit_pre_post_conversion;
  512
  513 Lisp_Object Qchar_coding_system;
 514
 515 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 516    its validity.  */
 517
 518 Lisp_Object
 519 coding_safe_chars (coding_system)
 520      Lisp_Object coding_system;
 521 {
 522   Lisp_Object coding_spec, plist, safe_chars;
 523
 524   coding_spec = Fget (coding_system, Qcoding_system);
 525   plist = XVECTOR (coding_spec)->contents[3];
 526   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 527   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 528 }
 529
 530 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 531   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 532
 533 \f
 534 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 535
 536 /* Emacs' internal format for representation of multiple character
 537    sets is a kind of multi-byte encoding, i.e. characters are
 538    represented by variable-length sequences of one-byte codes.
 539
 540    ASCII characters and control characters (e.g. `tab', `newline') are
 541    represented by one-byte sequences which are their ASCII codes, in
 542    the range 0x00 through 0x7F.
 543
 544    8-bit characters of the range 0x80..0x9F are represented by
 545    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 546    code + 0x20).
 547
 548    8-bit characters of the range 0xA0..0xFF are represented by
 549    one-byte sequences which are their 8-bit code.
 550
 551    The other characters are represented by a sequence of `base
 552    leading-code', optional `extended leading-code', and one or two
 553    `position-code's.  The length of the sequence is determined by the
 554    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 555    whereas extended leading-code and position-code take the range 0xA0
 556    through 0xFF.  See `charset.h' for more details about leading-code
 557    and position-code.
 558
 559    --- CODE RANGE of Emacs' internal format ---
 560    character set        range
 561    -------------        -----
 562    ascii                0x00..0x7F
 563    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  564    eight-bit-graphic    0xA0..0xFF
 565    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 566    ---------------------------------------------
 567
 568    As this is the internal character representation, the format is
 569    usually not used externally (i.e. in a file or in a data sent to a
 570    process).  But, it is possible to have a text externally in this
 571    format (i.e. by encoding by the coding system `emacs-mule').
 572
 573    In that case, a sequence of one-byte codes has a slightly different
 574    form.
 575
 576    Firstly, all characters in eight-bit-control are represented by
 577    one-byte sequences which are their 8-bit code.
 578
 579    Next, character composition data are represented by the byte
 580    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 581    where,
 582         METHOD is 0xF0 plus one of composition method (enum
 583         composition_method),
 584
 585         BYTES is 0xA0 plus the byte length of these composition data,
 586
 587         CHARS is 0xA0 plus the number of characters composed by these
 588         data,
 589
  590         COMPONENTs are characters in multibyte form or composition
  591         rules encoded as two bytes of ASCII codes.
 592
 593    In addition, for backward compatibility, the following formats are
 594    also recognized as composition data on decoding.
 595
 596    0x80 MSEQ ...
 597    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 598
 599    Here,
  600         MSEQ is a multibyte form but in one of these special formats:
  601           ASCII: 0xA0 ASCII_CODE+0x80,
  602           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
  603         RULE is a one-byte code in the range 0xA0..0xF0 that
  604         represents a composition rule.
 605   */
 606
  607 enum emacs_code_class_type emacs_code_class[256];  /* 256 entries; presumably indexed by byte value -- TODO confirm.  */
 608
  609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  610    Check whether the text from SRC up to SRC_END is in Emacs' internal
  611    format: return CODING_CATEGORY_MASK_EMACS_MULE if so, else 0.  */
  612
  613 static int
  614 detect_coding_emacs_mule (src, src_end, multibytep)
  615       unsigned char *src, *src_end;
  616       int multibytep;
  617 {
  618   unsigned char c;
  619   int composing = 0;  /* Nonzero while inside an old-style (0x80 ...) composition.  */
  620   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE, which stores into coding->result.  */
  621   struct coding_system dummy_coding;
  622   struct coding_system *coding = &dummy_coding;
  623
  624   while (1)
  625     {
  626       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);  /* Jumps to label_end_of_loop at end of source.  */
  627
  628       if (composing)
  629         {
  630           if (c < 0xA0)  /* A byte below 0xA0 ends the composition sequence.  */
  631             composing = 0;
  632           else if (c == 0xA0)  /* ASCII component: 0xA0 ASCII_CODE+0x80.  */
  633             {
  634               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
  635               c &= 0x7F;
  636             }
  637           else  /* Other component: LEADING_CODE+0x20, so undo the offset.  */
  638             c -= 0x20;
  639         }
  640
  641       if (c < 0x20)
  642         {
  643           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  644             return 0;  /* ESC, SI or SO: report that this is not emacs-mule.  */
  645         }
  646       else if (c >= 0x80 && c < 0xA0)
  647         {
  648           if (c == 0x80)
  649             /* Old leading code for a composite character.  */
  650             composing = 1;
  651           else
  652             {
  653               unsigned char *src_base = src - 1;  /* Re-examine from the byte just read.  */
  654               int bytes;  /* Set by UNIBYTE_STR_AS_MULTIBYTE_P below.  */
  655
  656               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
  657                                                bytes))
  658                 return 0;  /* Presumably: no valid multibyte form starts at SRC_BASE.  */
  659               src = src_base + bytes;  /* Skip the whole BYTES-long sequence.  */
  660             }
  661         }
  662     }
  663  label_end_of_loop:  /* Source exhausted without finding a contradiction.  */
  664   return CODING_CATEGORY_MASK_EMACS_MULE;
  665 }
 666
 667
  668 /* Begin recording one composition: store its starting position START and its METHOD.  */
  669
  670 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  671   do {                                                          \
  672     struct composition_data *cmp_data = coding->cmp_data;       \
  673     int *data = cmp_data->data + cmp_data->used;                \
  674     coding->cmp_data_start = cmp_data->used;                    \
  675     data[0] = -1;  /* Overwritten by CODING_ADD_COMPOSITION_END.  */ \
  676     data[1] = cmp_data->char_offset + start;                    \
  677     data[3] = (int) method;  /* data[2] is set by CODING_ADD_COMPOSITION_END.  */ \
  678     cmp_data->used += 4;     /* Reserve the four header ints.  */ \
  679   } while (0)
 680
  681 /* Finish the current composition: record its data length and its ending position END.  */
  682
  683 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
  684   do {                                                          \
  685     struct composition_data *cmp_data = coding->cmp_data;       \
  686     int *data = cmp_data->data + coding->cmp_data_start;        \
  687     data[0] = cmp_data->used - coding->cmp_data_start;  /* Ints used since the start of this composition.  */ \
  688     data[2] = cmp_data->char_offset + end;                      \
  689   } while (0)
 690
  691 /* Record one COMPONENT (alternate character or composition rule); close the composition when its data is full.  */
  692
  693 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  694   do {                                                                  \
  695     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
  696     if (coding->cmp_data->used - coding->cmp_data_start                 \
  697         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
  698       {  /* Maximum bunch length reached: end the composition here.  */ \
  699         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
  700         coding->composing = COMPOSITION_NO;                             \
  701       }                                                                 \
  702   } while (0)
 703
 704
 705 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 706    is not less than SRC_END, return -1 without incrementing Src.  */
 707
 708 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 709
 710
 711 /* Decode a character represented as a component of composition
 712    sequence of Emacs 20 style at SRC.  Set C to that character, store
 713    its multibyte form sequence at P, and set P to the end of that
 714    sequence.  If no valid character is found, set C to -1.
        Two representations are accepted: the byte 0xA0 followed by a
        byte not less than 0xA0 (the result is that byte minus 0xA0),
        and a leading code stored with 0x20 added, followed by the
        remaining bytes of the multibyte form.
        This macro uses `src', `src_end' and `coding' of the scope where
        it is expanded.
        NOTE(review): in the leading-code case the bytes are written
        through P before the sequence is validated, and P is not
        rewound when C ends up as -1.  */
 715
 716 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 717   do {                                                          \
 718     int bytes;                                                  \
 719                                                                 \
 720     c = SAFE_ONE_MORE_BYTE ();                                  \
 721     if (c < 0)                                                  \
 722       break;                   /* Source is exhausted.  */      \
 723     if (CHAR_HEAD_P (c))                                        \
 724       c = -1;                                                   \
 725     else if (c == 0xA0)                                         \
 726       {                                                         \
             /* A -1 from an exhausted source also fails the test  \
                below.  */                                         \
 727         c = SAFE_ONE_MORE_BYTE ();                              \
 728         if (c < 0xA0)                                           \
 729           c = -1;                                               \
 730         else                                                    \
 731           {                                                     \
 732             c -= 0xA0;                                          \
 733             *p++ = c;                                           \
 734           }                                                     \
 735       }                                                         \
 736     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 737       {                                                         \
 738         unsigned char *p0 = p;                                  \
 739                                                                 \
             /* Restore the real leading code, then copy the       \
                bytes that should follow it.  */                   \
 740         c -= 0x20;                                              \
 741         *p++ = c;                                               \
 742         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 743         while (--bytes)                                         \
 744           {                                                     \
 745             c = SAFE_ONE_MORE_BYTE ();                          \
 746             if (c < 0)                                          \
 747               break;                                            \
 748             *p++ = c;                                           \
 749           }                                                     \
             /* Accept the copied bytes only if they form a valid  \
                multibyte sequence, or in the recovery case.  */   \
 750         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 751             || (coding->flags /* We are recovering a file.  */  \
 752                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 753                 && ! CHAR_HEAD_P (p0[1])))                      \
 754           c = STRING_CHAR (p0, bytes);                          \
 755         else                                                    \
 756           c = -1;                                               \
 757       }                                                         \
 758     else                                                        \
 759       c = -1;                                                   \
 760   } while (0)
 761
 762
 763 /* Decode a composition rule represented as a component of composition
 764    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 765    valid rule is found, set C to -1.
        A valid rule is one byte in the range 0xA0..0xF0; the byte minus
        0xA0 is split into GREF (quotient by 9) and NREF (remainder by
        9), which are then combined by COMPOSITION_ENCODE_RULE.
        This macro assigns the variables `gref' and `nref', and uses
        `src' and `src_end', of the scope where it is expanded.  */
 766
 767 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 768   do {                                                  \
 769     c = SAFE_ONE_MORE_BYTE ();                          \
         /* If the source was exhausted, C is -1 here and   \
            becomes negative below, so it is rejected.  */  \
 770     c -= 0xA0;                                          \
 771     if (c < 0 || c >= 81)                               \
 772       c = -1;                                           \
 773     else                                                \
 774       {                                                 \
 775         gref = c / 9, nref = c % 9;                     \
 776         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 777       }                                                 \
 778   } while (0)
 779
 780
 781 /* Decode composition sequence encoded by `emacs-mule' at the source
 782    pointed by SRC.  SRC_END is the end of source.  Store information
 783    of the composition in CODING->cmp_data.
 784
 785    For backward compatibility, decode also a composition sequence of
 786    Emacs 20 style.  In that case, the composition sequence contains
 787    characters that should be extracted into a buffer or string.  Store
 788    those characters at *DESTINATION in multibyte form.
 789
 790    If we encounter an invalid byte sequence, return 0.
 791    If we encounter an insufficient source or destination, or
 792    insufficient space in CODING->cmp_data, return -1.  (In the
        last case CODING->result is also set to
        CODING_FINISH_INSUFFICIENT_CMP.)
 793    Otherwise, return consumed bytes in the source.
 794
        When characters are emitted, *DESTINATION is advanced past them
        and CODING->produced_char is increased by the number of
        composed characters.  The destination limit is DST_END if
        DST_BYTES is nonzero, otherwise the current source position.
 795 */
 796 static INLINE int
 797 decode_composition_emacs_mule (coding, src, src_end,
 798                                destination, dst_end, dst_bytes)
 799      struct coding_system *coding;
 800      const unsigned char *src, *src_end;
 801      unsigned char **destination, *dst_end;
 802      int dst_bytes;
 803 {
 804   unsigned char *dst = *destination;
 805   int method, data_len, nchars;
       /* SRC_BASE points at the byte that starts the sequence; SRC is
          advanced past it.  */
 806   const unsigned char *src_base = src++;
 807   /* Store components of composition.  */
 808   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 809   int ncomponent;
 810   /* Store multibyte form of characters to be composed.  This is for
 811      Emacs 20 style composition sequence.  */
 812   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 813   unsigned char *bufp = buf;
 814   int c, i, gref, nref;
 815
       /* Refuse to start if CODING->cmp_data cannot hold one more
          bunch of maximum length.  */
 816   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 817       >= COMPOSITION_DATA_SIZE)
 818     {
 819       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 820       return -1;
 821     }
 822
 823   ONE_MORE_BYTE (c);
 824   if (c - 0xF0 >= COMPOSITION_RELATIVE
 825            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 826     {
 827       int with_rule;
 828
           /* The byte is 0xF0 + METHOD.  It is followed by the total
              length of the sequence and the number of composed
              characters, each stored with 0xA0 added.  */
 829       method = c - 0xF0;
 830       with_rule = (method == COMPOSITION_WITH_RULE
 831                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 832       ONE_MORE_BYTE (c);
 833       data_len = c - 0xA0;
 834       if (data_len < 4
 835           || src_base + data_len > src_end)
 836         return 0;
 837       ONE_MORE_BYTE (c);
 838       nchars = c - 0xA0;
           /* Validate the decoded count, not the raw byte: a byte
              below 0xA1 would otherwise give a zero or negative
              number of composed characters.  */
 839       if (nchars < 1)
 840         return 0;
 841       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 842         {
 843           /* If it is longer than this, it can't be valid.  */
 844           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 845             return 0;
 846
               /* With a rule-based method, every odd component is a
                  rule stored as two bytes, each offset by 32.  */
 847           if (ncomponent % 2 && with_rule)
 848             {
 849               ONE_MORE_BYTE (gref);
 850               gref -= 32;
 851               ONE_MORE_BYTE (nref);
 852               nref -= 32;
 853               c = COMPOSITION_ENCODE_RULE (gref, nref);
 854             }
 855           else
 856             {
 857               int bytes;
 858               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 859                   || (coding->flags /* We are recovering a file.  */
 860                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 861                       && ! CHAR_HEAD_P (src[1])))
 862                 c = STRING_CHAR (src, bytes);
 863               else
 864                 c = *src, bytes = 1;
 865               src += bytes;
 866             }
 867           component[ncomponent] = c;
 868         }
 869     }
 870   else
 871     {
 872       /* This may be an old Emacs 20 style format.  See the comment at
 873          the section 2 of this file.  */
           /* The sequence extends up to the next character head.  If
              none is found and this is not the last block, the source
              is insufficient.  */
 874       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 875       if (src == src_end
 876           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 877         goto label_end_of_loop;
 878
           /* Restrict the source to the sequence just delimited and
              restart right after the first byte.  */
 879       src_end = src;
 880       src = src_base + 1;
 881       if (c < 0xC0)
 882         {
 883           method = COMPOSITION_RELATIVE;
 884           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 885             {
 886               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 887               if (c < 0)
 888                 break;
 889               component[ncomponent++] = c;
 890             }
 891           if (ncomponent < 2)
 892             return 0;
 893           nchars = ncomponent;
 894         }
 895       else if (c == 0xFF)
 896         {
 897           method = COMPOSITION_WITH_RULE;
               /* Skip the 0xFF marker.  */
 898           src++;
 899           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 900           if (c < 0)
 901             return 0;
 902           component[0] = c;
 903           for (ncomponent = 1;
 904                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 905             {
 906               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 907               if (c < 0)
 908                 break;
 909               component[ncomponent++] = c;
 910               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 911               if (c < 0)
 912                 break;
 913               component[ncomponent++] = c;
 914             }
 915           if (ncomponent < 3)
 916             return 0;
               /* Characters and rules alternate, starting and ending
                  with a character.  */
 917           nchars = (ncomponent + 1) / 2;
 918         }
 919       else
 920         return 0;
 921     }
 922
       /* Record the composition only if there is nothing to emit or
          the characters to emit fit in the destination.  */
 923   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 924     {
 925       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 926       for (i = 0; i < ncomponent; i++)
 927         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 928       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 929       if (buf < bufp)
 930         {
 931           unsigned char *p = buf;
 932           EMIT_BYTES (p, bufp);
 933           *destination += bufp - buf;
 934           coding->produced_char += nchars;
 935         }
 936       return (src - src_base);
 937     }
 938  label_end_of_loop:
 939   return -1;
 940 }
 941
 942 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode SRC_BYTES bytes of emacs-mule text at SOURCE into
        DESTINATION.  End-of-line sequences are converted according to
        CODING->eol_type, composition sequences (introduced by 0x80)
        are handed to decode_composition_emacs_mule when
        CODING->cmp_data is set, valid multibyte sequences are copied
        as is, and a byte that does not start a valid sequence is
        converted by CHAR_STRING on its own.
        On return CODING->consumed, CODING->consumed_char,
        CODING->produced and CODING->produced_char are set;
        CODING->result is set here only for an inconsistent EOL or an
        insufficient destination.  */
 943
 944 static void
 945 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 946      struct coding_system *coding;
 947      const unsigned char *source;
 948      unsigned char *destination;
 949      int src_bytes, dst_bytes;
 950 {
 951   const unsigned char *src = source;
 952   const unsigned char *src_end = source + src_bytes;
 953   unsigned char *dst = destination;
 954   unsigned char *dst_end = destination + dst_bytes;
 955   /* SRC_BASE remembers the start position in source in each loop.
 956      The loop will be exited when there's not enough source code, or
 957      when there's not enough destination area to produce a
 958      character.  */
 959   const unsigned char *src_base;
 960
 961   coding->produced_char = 0;
 962   while ((src_base = src) < src_end)
 963     {
 964       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 965       const unsigned char *p;
 966       int bytes;
 967
 968       if (*src == '\r')
 969         {
 970           int c = *src++;
 971
 972           if (coding->eol_type == CODING_EOL_CR)
 973             c = '\n';
 974           else if (coding->eol_type == CODING_EOL_CRLF)
 975             {
                   /* A CR not followed by LF is kept as CR, and the
                      byte after it is read again on the next
                      iteration.  */
 976               ONE_MORE_BYTE (c);
 977               if (c != '\n')
 978                 {
 979                   src--;
 980                   c = '\r';
 981                 }
 982             }
               /* NOTE(review): no destination-space check is made
                  before this store -- confirm that callers guarantee
                  room.  */
 983           *dst++ = c;
 984           coding->produced_char++;
 985           continue;
 986         }
 987       else if (*src == '\n')
 988         {
               /* A bare LF contradicts a CR or CRLF convention; stop
                  if the caller asked us to reject that.  */
 989           if ((coding->eol_type == CODING_EOL_CR
 990                || coding->eol_type == CODING_EOL_CRLF)
 991               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 992             {
 993               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 994               goto label_end_of_loop;
 995             }
 996           *dst++ = *src++;
 997           coding->produced_char++;
 998           continue;
 999         }
1000       else if (*src == 0x80 && coding->cmp_data)
1001         {
1002           /* Start of composition data.  */
1003           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1004                                                          &dst, dst_end,
1005                                                          dst_bytes);
               /* Negative: stop decoding here.  Positive: that many
                  source bytes were handled.  Zero: invalid sequence,
                  so the 0x80 byte is converted by itself below.  */
1006           if (consumed < 0)
1007             goto label_end_of_loop;
1008           else if (consumed > 0)
1009             {
1010               src += consumed;
1011               continue;
1012             }
1013           bytes = CHAR_STRING (*src, tmp);
1014           p = tmp;
1015           src++;
1016         }
1017       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1018                || (coding->flags /* We are recovering a file.  */
1019                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1020                    && ! CHAR_HEAD_P (src[1])))
1021         {
               /* Valid multibyte form; copy it unchanged.  */
1022           p = src;
1023           src += bytes;
1024         }
1025       else
1026         {
1027           int i, c;
1028
               /* Not a valid multibyte form.  Scan the bytes that the
                  leading byte announces.  */
1029           bytes = BYTES_BY_CHAR_HEAD (*src);
1030           src++;
1031           for (i = 1; i < bytes; i++)
1032             {
1033               ONE_MORE_BYTE (c);
1034               if (CHAR_HEAD_P (c))
1035                 break;
1036             }
1037           if (i < bytes)
1038             {
                   /* A character head interrupted the sequence;
                      convert only the first byte and resume right
                      after it.  */
1039               bytes = CHAR_STRING (*src_base, tmp);
1040               p = tmp;
1041               src = src_base + 1;
1042             }
1043           else
1044             {
1045               p = src_base;
1046             }
1047         }
           /* The limit is DST_END when DST_BYTES is nonzero,
              otherwise the current source position.  */
1048       if (dst + bytes >= (dst_bytes ? dst_end : src))
1049         {
1050           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1051           break;
1052         }
1053       while (bytes--) *dst++ = *p++;
1054       coding->produced_char++;
1055     }
1056  label_end_of_loop:
       /* Only the source before SRC_BASE counts as consumed.  The same
          byte offset is stored in both fields.  */
1057   coding->consumed = coding->consumed_char = src_base - source;
1058   coding->produced = dst - destination;
1059 }
1060
1061
1062 /* Encode composition data stored at DATA into a special byte sequence
1063    starting by 0x80.  Update CODING->cmp_data_start and maybe
1064    CODING->cmp_data for the next call.
        As read here, DATA[0] is the length of the data, DATA[2] -
        DATA[1] the number of composed characters, DATA[3] the
        composition method, and DATA[4] onward the components.
        The produced bytes are 0x80, 0xF0 + METHOD, 0xA0 + total byte
        length, 0xA0 + number of composed characters, then the
        components.
        This macro uses `dst', `dst_end', `dst_bytes' and `src' of the
        scope where it is expanded, and jumps to `label_end_of_loop'
        there (after setting CODING->result) when the destination is
        too small.
        NOTE(review): components are written into a fixed 1024-byte
        buffer with no visible bounds check -- confirm that the length
        of DATA bounds the output.  */
1065
1066 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1067   do {                                                                  \
1068     unsigned char buf[1024], *p0 = buf, *p;                             \
1069     int len = data[0];                                                  \
1070     int i;                                                              \
1071                                                                         \
1072     buf[0] = 0x80;                                                      \
1073     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1074     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1075     p = buf + 4;                                                        \
1076     if (data[3] == COMPOSITION_WITH_RULE                                \
1077         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1078       {                                                                 \
             /* A character first, then (rule, character) pairs;          \
                each rule is two bytes, GREF and NREF offset by 0x20.  */  \
1079         p += CHAR_STRING (data[4], p);                                  \
1080         for (i = 5; i < len; i += 2)                                    \
1081           {                                                             \
1082             int gref, nref;                                             \
1083              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1084             *p++ = 0x20 + gref;                                         \
1085             *p++ = 0x20 + nref;                                         \
1086             p += CHAR_STRING (data[i + 1], p);                          \
1087           }                                                             \
1088       }                                                                 \
1089     else                                                                \
1090       {                                                                 \
1091         for (i = 4; i < len; i++)                                       \
1092           p += CHAR_STRING (data[i], p);                                \
1093       }                                                                 \
1094     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1095                                                                         \
         /* NOTE(review): P - BUF already counts the four header bytes,   \
            so the extra 4 looks like slack -- confirm.  */               \
1096     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1097       {                                                                 \
1098         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1099         goto label_end_of_loop;                                         \
1100       }                                                                 \
1101     while (p0 < p)                                                      \
1102       *dst++ = *p0++;                                                   \
         /* Advance to the next composition; move to the next block       \
            of composition data when this one is used up.  */             \
1103     coding->cmp_data_start += data[0];                                  \
1104     if (coding->cmp_data_start == coding->cmp_data->used                \
1105         && coding->cmp_data->next)                                      \
1106       {                                                                 \
1107         coding->cmp_data = coding->cmp_data->next;                      \
1108         coding->cmp_data_start = 0;                                     \
1109       }                                                                 \
1110   } while (0)
1111
1112
1113 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1114                             unsigned char *, int, int));
1115
     /* Encode SRC_BYTES bytes at SOURCE into emacs-mule at DESTINATION.
        When CODING has no composition data, the work is delegated to
        encode_eol.  Otherwise each composition is encoded in advance of
        its first character by ENCODE_COMPOSITION_EMACS_MULE, LF is
        converted according to CODING->eol_type, and other characters
        are copied.  On exit CODING->consumed, CODING->produced and
        CODING->produced_char are set; CODING->consumed_char is
        incremented per character encoded.  */
1116 static void
1117 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1118      struct coding_system *coding;
1119      const unsigned char *source;
1120      unsigned char *destination;
1121      int src_bytes, dst_bytes;
1122 {
1123   const unsigned char *src = source;
1124   const unsigned char *src_end = source + src_bytes;
1125   unsigned char *dst = destination;
1126   unsigned char *dst_end = destination + dst_bytes;
1127   const unsigned char *src_base;
1128   int c;
1129   int char_offset;
1130   int *data;
1131
       /* NOTE(review): not referenced by name below; presumably used
          by ONE_MORE_CHAR -- confirm.  */
1132   Lisp_Object translation_table;
1133
1134   translation_table = Qnil;
1135
1136   /* Optimization for the case that there's no composition.  */
1137   if (!coding->cmp_data || coding->cmp_data->used == 0)
1138     {
1139       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1140       return;
1141     }
1142
1143   char_offset = coding->cmp_data->char_offset;
1144   data = coding->cmp_data->data + coding->cmp_data_start;
       /* The loop has no exit test of its own.
          ENCODE_COMPOSITION_EMACS_MULE jumps to `label_end_of_loop',
          and presumably ONE_MORE_CHAR and the EMIT_* macros do too --
          confirm.  */
1145   while (1)
1146     {
1147       src_base = src;
1148
1149       /* If SRC starts a composition, encode the information about the
1150          composition in advance.  */
1151       if (coding->cmp_data_start < coding->cmp_data->used
1152           && char_offset + coding->consumed_char == data[1])
1153         {
1154           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have switched CODING->cmp_data, so
                  reload both values.  */
1155           char_offset = coding->cmp_data->char_offset;
1156           data = coding->cmp_data->data + coding->cmp_data_start;
1157         }
1158
1159       ONE_MORE_CHAR (c);
1160       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1161                         || coding->eol_type == CODING_EOL_CR))
1162         {
1163           if (coding->eol_type == CODING_EOL_CRLF)
1164             EMIT_TWO_BYTES ('\r', c);
1165           else
1166             EMIT_ONE_BYTE ('\r');
1167         }
1168       else if (SINGLE_BYTE_CHAR_P (c))
1169         {
1170           if (coding->flags && ! ASCII_BYTE_P (c))
1171             {
1172               /* As we are auto saving, retain the multibyte form for
1173                  8-bit chars.  */
1174               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1175               int bytes = CHAR_STRING (c, buf);
1176
1177               if (bytes == 1)
1178                 EMIT_ONE_BYTE (buf[0]);
1179               else
1180                 EMIT_TWO_BYTES (buf[0], buf[1]);
1181             }
1182           else
1183             EMIT_ONE_BYTE (c);
1184         }
1185       else
             /* Multibyte character: copy its source bytes as is.  */
1186         EMIT_BYTES (src_base, src);
1187       coding->consumed_char++;
1188     }
1189  label_end_of_loop:
       /* Only the source before SRC_BASE counts as consumed.  */
1190   coding->consumed = src_base - source;
1191   coding->produced = coding->produced_char = dst - destination;
1192   return;
1193 }
1194
1195 \f
1196 /*** 3. ISO2022 handlers ***/
1197
1198 /* The following note describes the coding system ISO2022 briefly.
1199    Since the intention of this note is to help understand the
1200    functions in this file, some parts are NOT ACCURATE or are OVERLY
1201    SIMPLIFIED.  For thorough understanding, please refer to the
1202    original document of ISO2022.  This is equivalent to the standard
1203    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1204
1205    ISO2022 provides many mechanisms to encode several character sets
1206    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1207    is encoded using bytes less than 128.  This may make the encoded
1208    text a little bit longer, but the text passes more easily through
1209    several types of gateway, some of which strip off the MSB (Most
1210    Significant Bit).
1211
1212    There are two kinds of character sets: control character sets and
1213    graphic character sets.  The former contain control characters such
1214    as `newline' and `escape' to provide control functions (control
1215    functions are also provided by escape sequences).  The latter
1216    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1217    two control character sets and many graphic character sets.
1218
1219    Graphic character sets are classified into one of the following
1220    four classes, according to the number of bytes (DIMENSION) and
1221    number of characters in one dimension (CHARS) of the set:
1222    - DIMENSION1_CHARS94
1223    - DIMENSION1_CHARS96
1224    - DIMENSION2_CHARS94
1225    - DIMENSION2_CHARS96
1226
1227    In addition, each character set is assigned an identification tag,
1228    unique for each set, called the "final character" (denoted as <F>
1229    hereafter).  The <F> of each character set is decided by ECMA(*)
1230    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1231    (0x30..0x3F are for private use only).
1232
1233    Note (*): ECMA = European Computer Manufacturers Association
1234
1235    Here are examples of graphic character sets [NAME(<F>)]:
1236         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1237         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1238         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1239         o DIMENSION2_CHARS96 -- none for the moment
1240
1241    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1242         C0 [0x00..0x1F] -- control character plane 0
1243         GL [0x20..0x7F] -- graphic character plane 0
1244         C1 [0x80..0x9F] -- control character plane 1
1245         GR [0xA0..0xFF] -- graphic character plane 1
1246
1247    A control character set is directly designated and invoked to C0 or
1248    C1 by an escape sequence.  The most common case is that:
1249    - ISO646's  control character set is designated/invoked to C0, and
1250    - ISO6429's control character set is designated/invoked to C1,
1251    and usually these designations/invocations are omitted in encoded
1252    text.  In a 7-bit environment, only C0 can be used, and a control
1253    character for C1 is encoded by an appropriate escape sequence to
1254    fit into the environment.  All control characters for C1 are
1255    defined to have corresponding escape sequences.
1256
1257    A graphic character set is at first designated to one of four
1258    graphic registers (G0 through G3), then these graphic registers are
1259    invoked to GL or GR.  These designations and invocations can be
1260    done independently.  The most common case is that G0 is invoked to
1261    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1262    these invocations and designations are omitted in encoded text.
1263    In a 7-bit environment, only GL can be used.
1264
1265    When a graphic character set of CHARS94 is invoked to GL, codes
1266    0x20 and 0x7F of the GL area work as control characters SPACE and
1267    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1268    be used.
1269
1270    There are two ways of invocation: locking-shift and single-shift.
1271    With locking-shift, the invocation lasts until the next different
1272    invocation, whereas with single-shift, the invocation affects the
1273    following character only and doesn't affect the locking-shift
1274    state.  Invocations are done by the following control characters or
1275    escape sequences:
1276
1277    ----------------------------------------------------------------------
1278    abbrev  function                  cntrl escape seq   description
1279    ----------------------------------------------------------------------
1280    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1281    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1282    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1283    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1284    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1285    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1286    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
1287    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1288    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1289    ----------------------------------------------------------------------
1290    (*) These are not used by any known coding system.
1291
1292    Control characters for these functions are defined by macros
1293    ISO_CODE_XXX in `coding.h'.
1294
1295    Designations are done by the following escape sequences:
1296    ----------------------------------------------------------------------
1297    escape sequence      description
1298    ----------------------------------------------------------------------
1299    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1300    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1301    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1302    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1303    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1304    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1305    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1306    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1307    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1308    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1309    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1310    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1311    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1312    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1313    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1314    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1315    ----------------------------------------------------------------------
1316
1317    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1318    of dimension 1, chars 94, and final character <F>, etc...
1319
1320    Note (*): Although these designations are not allowed in ISO2022,
1321    Emacs accepts them on decoding, and produces them on encoding
1322    CHARS96 character sets in a coding system which is characterized as
1323    7-bit environment, non-locking-shift, and non-single-shift.
1324
1325    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1326    '(' can be omitted.  We refer to this as "short-form" hereafter.
1327
1328    Now you may notice that there are a lot of ways of encoding the
1329    same multilingual text in ISO2022.  Actually, there exist many
1330    coding systems such as Compound Text (used in X11's inter-client
1331    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1332    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1333    localized platforms), and all of these are variants of ISO2022.
1334
1335    In addition to the above, Emacs handles two more kinds of escape
1336    sequences: ISO6429's direction specification and Emacs' private
1337    sequence for specifying character composition.
1338
1339    ISO6429's direction specification takes the following form:
1340         o CSI ']'      -- end of the current direction
1341         o CSI '0' ']'  -- end of the current direction
1342         o CSI '1' ']'  -- start of left-to-right text
1343         o CSI '2' ']'  -- start of right-to-left text
1344    The control character CSI (0x9B: control sequence introducer) is
1345    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1346
1347    Character composition specification takes the following form:
1348         o ESC '0' -- start relative composition
1349         o ESC '1' -- end composition
1350         o ESC '2' -- start rule-base composition (*)
1351         o ESC '3' -- start relative composition with alternate chars  (**)
1352         o ESC '4' -- start rule-base composition with alternate chars  (**)
1353   Since these are not standard escape sequences of any ISO standard,
1354   the use of them with these meanings is restricted to Emacs only.
1355
1356   (*) This form is used only in Emacs 20.5 and older versions,
1357   but the newer versions can safely decode it.
1358   (**) This form is used only in Emacs 21.1 and newer versions,
1359   and the older versions can't decode it.
1360
1361   Here's a list of example usages of these composition escape
1362   sequences (categorized by `enum composition_method').
1363
1364   COMPOSITION_RELATIVE:
1365         ESC 0 CHAR [ CHAR ] ESC 1
1366   COMPOSITION_WITH_RULE:
1367         ESC 2 CHAR [ RULE CHAR ] ESC 1
1368   COMPOSITION_WITH_ALTCHARS:
1369         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1370   COMPOSITION_WITH_RULE_ALTCHARS:
1371         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1372
1373 enum iso_code_class_type iso_code_class[256]; /* NOTE(review): presumably one class per byte value -- confirm.  */
1374
     /* Nonzero if the coding system at index IDX of coding_system_table
        exists, can handle character C (CHARSET is ASCII, or C is a safe
        character of that coding system), and has some requested
        designation for CHARSET.  As a side effect this may assign the
        variable `safe_chars' of the scope where it is expanded.  */
1375 #define CHARSET_OK(idx, charset, c)                                     \
1376   (coding_system_table[idx]                                             \
1377    && (charset == CHARSET_ASCII                                         \
1378        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1379            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1380    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1381                                               charset)                  \
1382        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1383
     /* Nonzero if the coding system at index IDX has a non-negative
        initial designation for graphic register 1.  */
1384 #define SHIFT_OUT_OK(idx) \
1385   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1386
     /* Nonzero unless composition is disabled for the coding system at
        index IDX.  Unlike CHARSET_OK, this dereferences
        coding_system_table[IDX] without checking that it is non-null.  */
1387 #define COMPOSITION_OK(idx)     \
1388   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1389
1390 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1391    Check if a text is encoded in ISO2022.  If it is, return an
1392    integer in which appropriate flag bits any of:
1393         CODING_CATEGORY_MASK_ISO_7
1394         CODING_CATEGORY_MASK_ISO_7_TIGHT
1395         CODING_CATEGORY_MASK_ISO_8_1
1396         CODING_CATEGORY_MASK_ISO_8_2
1397         CODING_CATEGORY_MASK_ISO_7_ELSE
1398         CODING_CATEGORY_MASK_ISO_8_ELSE
1399    are set.  If a code which should never appear in ISO2022 is found,
1400    returns 0.  */
1401
1402 static int
1403 detect_coding_iso2022 (src, src_end, multibytep)
1404      unsigned char *src, *src_end;
1405      int multibytep;
1406 {
1407   int mask = CODING_CATEGORY_MASK_ISO;
1408   int mask_found = 0;
1409   int reg[4], shift_out = 0, single_shifting = 0;
1410   int c, c1, charset;
1411   /* Dummy for ONE_MORE_BYTE.  */
1412   struct coding_system dummy_coding;
1413   struct coding_system *coding = &dummy_coding;
1414   Lisp_Object safe_chars;
1415
1416   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1417   while (mask && src < src_end)
1418     {
1419       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1420     retry:
1421       switch (c)
1422         {
1423         case ISO_CODE_ESC:
1424           if (inhibit_iso_escape_detection)
1425             break;
1426           single_shifting = 0;
1427           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1428           if (c >= '(' && c <= '/')
1429             {
1430               /* Designation sequence for a charset of dimension 1.  */
1431               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1432               if (c1 < ' ' || c1 >= 0x80
1433                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1434                 /* Invalid designation sequence.  Just ignore.  */
1435                 break;
1436               reg[(c - '(') % 4] = charset;
1437             }
1438           else if (c == '$')
1439             {
1440               /* Designation sequence for a charset of dimension 2.  */
1441               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1442               if (c >= '@' && c <= 'B')
1443                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1444                 reg[0] = charset = iso_charset_table[1][0][c];
1445               else if (c >= '(' && c <= '/')
1446                 {
1447                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1448                   if (c1 < ' ' || c1 >= 0x80
1449                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1450                     /* Invalid designation sequence.  Just ignore.  */
1451                     break;
1452                   reg[(c - '(') % 4] = charset;
1453                 }
1454               else
1455                 /* Invalid designation sequence.  Just ignore.  */
1456                 break;
1457             }
1458           else if (c == 'N' || c == 'O')
1459             {
1460               /* ESC <Fe> for SS2 or SS3.  */
1461               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1462               break;
1463             }
1464           else if (c >= '0' && c <= '4')
1465             {
1466               /* ESC <Fp> for start/end composition.  */
1467               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1468                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1469               else
1470                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1471               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1472                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1473               else
1474                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1475               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1476                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1477               else
1478                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1479               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1480                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1481               else
1482                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1483               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1484                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1485               else
1486                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1487               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1488                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1489               else
1490                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1491               break;
1492             }
1493           else
1494             /* Invalid escape sequence.  Just ignore.  */
1495             break;
1496
1497           /* We found a valid designation sequence for CHARSET.  */
1498           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1499           c = MAKE_CHAR (charset, 0, 0);
1500           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1501             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1502           else
1503             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1504           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1505             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1506           else
1507             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1508           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1509             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1510           else
1511             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1512           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1513             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1514           else
1515             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1516           break;
1517
1518         case ISO_CODE_SO:
1519           if (inhibit_iso_escape_detection)
1520             break;
1521           single_shifting = 0;
1522           if (shift_out == 0
1523               && (reg[1] >= 0
1524                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1525                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1526             {
1527               /* Locking shift out.  */
1528               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1529               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1530             }
1531           break;
1532
1533         case ISO_CODE_SI:
1534           if (inhibit_iso_escape_detection)
1535             break;
1536           single_shifting = 0;
1537           if (shift_out == 1)
1538             {
1539               /* Locking shift in.  */
1540               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1541               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1542             }
1543           break;
1544
1545         case ISO_CODE_CSI:
1546           single_shifting = 0;
1547         case ISO_CODE_SS2:
1548         case ISO_CODE_SS3:
1549           {
1550             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1551
1552             if (inhibit_iso_escape_detection)
1553               break;
1554             if (c != ISO_CODE_CSI)
1555               {
1556                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1557                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1558                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1559                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1560                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1561                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1562                 single_shifting = 1;
1563               }
1564             if (VECTORP (Vlatin_extra_code_table)
1565                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1566               {
1567                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1568                     & CODING_FLAG_ISO_LATIN_EXTRA)
1569                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1570                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1571                     & CODING_FLAG_ISO_LATIN_EXTRA)
1572                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1573               }
1574             mask &= newmask;
1575             mask_found |= newmask;
1576           }
1577           break;
1578
1579         default:
1580           if (c < 0x80)
1581             {
1582               single_shifting = 0;
1583               break;
1584             }
1585           else if (c < 0xA0)
1586             {
1587               single_shifting = 0;
1588               if (VECTORP (Vlatin_extra_code_table)
1589                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1590                 {
1591                   int newmask = 0;
1592
1593                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1594                       & CODING_FLAG_ISO_LATIN_EXTRA)
1595                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1596                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1597                       & CODING_FLAG_ISO_LATIN_EXTRA)
1598                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1599                   mask &= newmask;
1600                   mask_found |= newmask;
1601                 }
1602               else
1603                 return 0;
1604             }
1605           else
1606             {
1607               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1608                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1609               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1610               /* Check the length of succeeding codes of the range
1611                  0xA0..0FF.  If the byte length is odd, we exclude
1612                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1613                  when we are not single shifting.  */
1614               if (!single_shifting
1615                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1616                 {
1617                   int i = 1;
1618
1619                   c = -1;
1620                   while (src < src_end)
1621                     {
1622                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1623                       if (c < 0xA0)
1624                         break;
1625                       i++;
1626                     }
1627
1628                   if (i & 1 && src < src_end)
1629                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1630                   else
1631                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1632                   if (c >= 0)
1633                     /* This means that we have read one extra byte.  */
1634                     goto retry;
1635                 }
1636             }
1637           break;
1638         }
1639     }
1640  label_end_of_loop:
1641   return (mask & mask_found);
1642 }
1643
/* Produce the character code for the character of charset CHARSET
   whose 1st position code is C1 and whose 2nd position code is C2.
   When the variable `translation_table' is nil the code is built
   directly with MAKE_CHAR; otherwise the value returned by
   translate_char for that table is used.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (!NILP (translation_table)                                    \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1653
/* Set designation state into CODING: designate to register REG the
   charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS and
   FINAL_CHAR.

   The designation is accepted when such a charset exists and either
   CODING requests REG for it or CODING_SAFE_CHAR_P accepts its
   representative character against `safe_chars'.  Otherwise REG is
   recorded in last_invalid_designation_register and control jumps to
   `label_invalid_code'.  The jump is also taken when FINAL_CHAR is
   outside '0'..127, and for a designation of ASCII to register 0
   that directly follows an invalid designation to register 0.

   The macro uses the variables `coding' and `safe_chars' and the
   label `label_invalid_code' of the expansion site.  Arguments are
   parenthesized, and the internal variables carry a `dd_' prefix so
   that an argument mentioning the caller's `charset' or `c' is not
   captured.  REG and FINAL_CHAR are evaluated more than once.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int dd_charset, dd_char;                                               \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    dd_charset = ISO_CHARSET_TABLE (make_number (dimension),               \
                                    make_number (chars),                   \
                                    make_number (final_char));             \
    dd_char = MAKE_CHAR (dd_charset, 0, 0);                                \
    if (dd_charset >= 0                                                    \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, dd_charset)     \
            == (reg)                                                       \
            || CODING_SAFE_CHAR_P (safe_chars, dd_char)))                  \
      {                                                                    \
        if (coding->spec.iso2022.last_invalid_designation_register == 0    \
            && (reg) == 0                                                  \
            && dd_charset == CHARSET_ASCII)                                \
          {                                                                \
            /* We should insert this designation sequence as is so         \
               that it is surely written back to a file.  */               \
            coding->spec.iso2022.last_invalid_designation_register = -1;   \
            goto label_invalid_code;                                       \
          }                                                                \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        if ((coding->mode & CODING_MODE_DIRECTION)                         \
            && CHARSET_REVERSE_CHARSET (dd_charset) >= 0)                  \
          dd_charset = CHARSET_REVERSE_CHARSET (dd_charset);               \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = dd_charset;          \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
  } while (0)
1690
1691 /* Allocate a memory block for storing information about compositions.
1692    The block is chained to the already allocated blocks.  */
1693
1694 void
1695 coding_allocate_composition_data (coding, char_offset)
1696      struct coding_system *coding;
1697      int char_offset;
1698 {
1699   struct composition_data *cmp_data
1700     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1701
1702   cmp_data->char_offset = char_offset;
1703   cmp_data->used = 0;
1704   cmp_data->prev = coding->cmp_data;
1705   cmp_data->next = NULL;
1706   if (coding->cmp_data)
1707     coding->cmp_data->next = cmp_data;
1708   coding->cmp_data = cmp_data;
1709   coding->cmp_data_start = 0;
1710   coding->composing = COMPOSITION_NO;
1711 }
1712
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  The macro uses the variables
   `coding' and `dst' and the label `label_end_of_loop' of the
   expansion site.  Three cases:
     - composition is disabled for CODING: the two bytes ESC and
       (C1 & 0x7f) are written to DST as is;
     - no composition is in progress: a new one is started, or the
       decoding loop is left with CODING_FINISH_INSUFFICIENT_CMP when
       there is no room to record it;
     - a composition is in progress: an altchar method is switched to
       COMPOSITION_RELATIVE, anything else is left alone.
   C1 is parenthesized so that expression arguments bind as intended;
   it is evaluated more than once.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE         \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           the following two, the codes following the current escape       \
           sequence are actual characters stored in a buffer.  */          \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1765
/* Handle composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  If a composition is in progress, record its end at
   the current produced_char position and go back to COMPOSITION_NO.
   Otherwise the sequence does not end anything, so the two bytes ESC
   and C1 are written to DST as is.  Uses the variables `coding' and
   `dst' of the expansion site.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1782
/* Decode a composition rule that starts with the byte C1 and store
   the encoded rule as one component in coding->cmp_data.

   C1 must be a modifiable variable: 32 is subtracted from it in
   place.  A result below 81 is the old one-byte format (before
   ver.21), holding two reference points as C1 / 9 and C1 % 9, where
   a value of 4 is replaced by 10.  A result in 81..92 is the new
   format (after ver.21): one more byte is read into the variable
   `c2' of the expansion site, and the rule is made from C1 - 81 and
   C2 - 32.  For any other value the stored rule is 0.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
                                                                        \
    (c1) -= 32;                                                         \
    if ((c1) >= 81)                                                     \
      {                                                                 \
        if ((c1) < 93)          /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);          \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (c1) / 9;                                            \
        int nref = (c1) % 9;                                            \
                                                                        \
        if (gref == 4)                                                  \
          gref = 10;                                                    \
        if (nref == 4)                                                  \
          nref = 10;                                                    \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    coding->composition_rule_follows = 0;                               \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
  } while (0)
1807
1808
1809 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1810
1811 static void
1812 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1813      struct coding_system *coding;
1814      const unsigned char *source;
1815      unsigned char *destination;
1816      int src_bytes, dst_bytes;
1817 {
1818   const unsigned char *src = source;
1819   const unsigned char *src_end = source + src_bytes;
1820   unsigned char *dst = destination;
1821   unsigned char *dst_end = destination + dst_bytes;
1822   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1823   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1824   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1825   /* SRC_BASE remembers the start position in source in each loop.
1826      The loop will be exited when there's not enough source code
1827      (within macro ONE_MORE_BYTE), or when there's not enough
1828      destination area to produce a character (within macro
1829      EMIT_CHAR).  */
1830   const unsigned char *src_base;
1831   int c, charset;
1832   Lisp_Object translation_table;
1833   Lisp_Object safe_chars;
1834
1835   safe_chars = coding_safe_chars (coding->symbol);
1836
1837   if (NILP (Venable_character_translation))
1838     translation_table = Qnil;
1839   else
1840     {
1841       translation_table = coding->translation_table_for_decode;
1842       if (NILP (translation_table))
1843         translation_table = Vstandard_translation_table_for_decode;
1844     }
1845
1846   coding->result = CODING_FINISH_NORMAL;
1847
1848   while (1)
1849     {
1850       int c1, c2 = 0;
1851
1852       src_base = src;
1853       ONE_MORE_BYTE (c1);
1854
1855       /* We produce no character or one character.  */
1856       switch (iso_code_class [c1])
1857         {
1858         case ISO_0x20_or_0x7F:
1859           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1860             {
1861               DECODE_COMPOSITION_RULE (c1);
1862               continue;
1863             }
1864           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1865             {
1866               /* This is SPACE or DEL.  */
1867               charset = CHARSET_ASCII;
1868               break;
1869             }
1870           /* This is a graphic character, we fall down ...  */
1871
1872         case ISO_graphic_plane_0:
1873           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1874             {
1875               DECODE_COMPOSITION_RULE (c1);
1876               continue;
1877             }
1878           charset = charset0;
1879           break;
1880
1881         case ISO_0xA0_or_0xFF:
1882           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1883               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1884             goto label_invalid_code;
1885           /* This is a graphic character, we fall down ... */
1886
1887         case ISO_graphic_plane_1:
1888           if (charset1 < 0)
1889             goto label_invalid_code;
1890           charset = charset1;
1891           break;
1892
1893         case ISO_control_0:
1894           if (COMPOSING_P (coding))
1895             DECODE_COMPOSITION_END ('1');
1896
1897           /* All ISO2022 control characters in this class have the
1898              same representation in Emacs internal format.  */
1899           if (c1 == '\n'
1900               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1901               && (coding->eol_type == CODING_EOL_CR
1902                   || coding->eol_type == CODING_EOL_CRLF))
1903             {
1904               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1905               goto label_end_of_loop;
1906             }
1907           charset = CHARSET_ASCII;
1908           break;
1909
1910         case ISO_control_1:
1911           if (COMPOSING_P (coding))
1912             DECODE_COMPOSITION_END ('1');
1913           goto label_invalid_code;
1914
1915         case ISO_carriage_return:
1916           if (COMPOSING_P (coding))
1917             DECODE_COMPOSITION_END ('1');
1918
1919           if (coding->eol_type == CODING_EOL_CR)
1920             c1 = '\n';
1921           else if (coding->eol_type == CODING_EOL_CRLF)
1922             {
1923               ONE_MORE_BYTE (c1);
1924               if (c1 != ISO_CODE_LF)
1925                 {
1926                   src--;
1927                   c1 = '\r';
1928                 }
1929             }
1930           charset = CHARSET_ASCII;
1931           break;
1932
1933         case ISO_shift_out:
1934           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1935               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1936             goto label_invalid_code;
1937           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1938           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1939           continue;
1940
1941         case ISO_shift_in:
1942           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1943             goto label_invalid_code;
1944           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1945           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1946           continue;
1947
1948         case ISO_single_shift_2_7:
1949         case ISO_single_shift_2:
1950           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1951             goto label_invalid_code;
1952           /* SS2 is handled as an escape sequence of ESC 'N' */
1953           c1 = 'N';
1954           goto label_escape_sequence;
1955
1956         case ISO_single_shift_3:
1957           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1958             goto label_invalid_code;
1959           /* SS2 is handled as an escape sequence of ESC 'O' */
1960           c1 = 'O';
1961           goto label_escape_sequence;
1962
1963         case ISO_control_sequence_introducer:
1964           /* CSI is handled as an escape sequence of ESC '[' ...  */
1965           c1 = '[';
1966           goto label_escape_sequence;
1967
1968         case ISO_escape:
1969           ONE_MORE_BYTE (c1);
1970         label_escape_sequence:
1971           /* Escape sequences handled by Emacs are invocation,
1972              designation, direction specification, and character
1973              composition specification.  */
1974           switch (c1)
1975             {
1976             case '&':           /* revision of following character set */
1977               ONE_MORE_BYTE (c1);
1978               if (!(c1 >= '@' && c1 <= '~'))
1979                 goto label_invalid_code;
1980               ONE_MORE_BYTE (c1);
1981               if (c1 != ISO_CODE_ESC)
1982                 goto label_invalid_code;
1983               ONE_MORE_BYTE (c1);
1984               goto label_escape_sequence;
1985
1986             case '$':           /* designation of 2-byte character set */
1987               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1988                 goto label_invalid_code;
1989               ONE_MORE_BYTE (c1);
1990               if (c1 >= '@' && c1 <= 'B')
1991                 {       /* designation of JISX0208.1978, GB2312.1980,
1992                            or JISX0208.1980 */
1993                   DECODE_DESIGNATION (0, 2, 94, c1);
1994                 }
1995               else if (c1 >= 0x28 && c1 <= 0x2B)
1996                 {       /* designation of DIMENSION2_CHARS94 character set */
1997                   ONE_MORE_BYTE (c2);
1998                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1999                 }
2000               else if (c1 >= 0x2C && c1 <= 0x2F)
2001                 {       /* designation of DIMENSION2_CHARS96 character set */
2002                   ONE_MORE_BYTE (c2);
2003                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2004                 }
2005               else
2006                 goto label_invalid_code;
2007               /* We must update these variables now.  */
2008               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2009               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2010               continue;
2011
2012             case 'n':           /* invocation of locking-shift-2 */
2013               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2014                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2015                 goto label_invalid_code;
2016               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2017               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2018               continue;
2019
2020             case 'o':           /* invocation of locking-shift-3 */
2021               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2022                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2023                 goto label_invalid_code;
2024               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2025               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2026               continue;
2027
2028             case 'N':           /* invocation of single-shift-2 */
2029               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2030                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2031                 goto label_invalid_code;
2032               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2033               ONE_MORE_BYTE (c1);
2034               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2035                 goto label_invalid_code;
2036               break;
2037
2038             case 'O':           /* invocation of single-shift-3 */
2039               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2040                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2041                 goto label_invalid_code;
2042               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2043               ONE_MORE_BYTE (c1);
2044               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2045                 goto label_invalid_code;
2046               break;
2047
2048             case '0': case '2': case '3': case '4': /* start composition */
2049               DECODE_COMPOSITION_START (c1);
2050               continue;
2051
2052             case '1':           /* end composition */
2053               DECODE_COMPOSITION_END (c1);
2054               continue;
2055
2056             case '[':           /* specification of direction */
2057               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2058                 goto label_invalid_code;
2059               /* For the moment, nested direction is not supported.
2060                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2061                  left-to-right, and nonzero means right-to-left.  */
2062               ONE_MORE_BYTE (c1);
2063               switch (c1)
2064                 {
2065                 case ']':       /* end of the current direction */
2066                   coding->mode &= ~CODING_MODE_DIRECTION;
2067
2068                 case '0':       /* end of the current direction */
2069                 case '1':       /* start of left-to-right direction */
2070                   ONE_MORE_BYTE (c1);
2071                   if (c1 == ']')
2072                     coding->mode &= ~CODING_MODE_DIRECTION;
2073                   else
2074                     goto label_invalid_code;
2075                   break;
2076
2077                 case '2':       /* start of right-to-left direction */
2078                   ONE_MORE_BYTE (c1);
2079                   if (c1 == ']')
2080                     coding->mode |= CODING_MODE_DIRECTION;
2081                   else
2082                     goto label_invalid_code;
2083                   break;
2084
2085                 default:
2086                   goto label_invalid_code;
2087                 }
2088               continue;
2089
2090             case '%':
2091               if (COMPOSING_P (coding))
2092                 DECODE_COMPOSITION_END ('1');
2093               ONE_MORE_BYTE (c1);
2094               if (c1 == '/')
2095                 {
2096                   /* CTEXT extended segment:
2097                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2098                      We keep these bytes as is for the moment.
2099                      They may be decoded by post-read-conversion.  */
2100                   int dim, M, L;
2101                   int size, required;
2102                   int produced_chars;
2103
2104                   ONE_MORE_BYTE (dim);
2105                   ONE_MORE_BYTE (M);
2106                   ONE_MORE_BYTE (L);
2107                   size = ((M - 128) * 128) + (L - 128);
2108                   required = 8 + size * 2;
2109                   if (dst + required > (dst_bytes ? dst_end : src))
2110                     goto label_end_of_loop;
2111                   *dst++ = ISO_CODE_ESC;
2112                   *dst++ = '%';
2113                   *dst++ = '/';
2114                   *dst++ = dim;
2115                   produced_chars = 4;
2116                   dst += CHAR_STRING (M, dst), produced_chars++;
2117                   dst += CHAR_STRING (L, dst), produced_chars++;
2118                   while (size-- > 0)
2119                     {
2120                       ONE_MORE_BYTE (c1);
2121                       dst += CHAR_STRING (c1, dst), produced_chars++;
2122                     }
2123                   coding->produced_char += produced_chars;
2124                 }
2125               else if (c1 == 'G')
2126                 {
2127                   unsigned char *d = dst;
2128                   int produced_chars;
2129
2130                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2131                      ESC % G --UTF-8-BYTES-- ESC % @
2132                      We keep these bytes as is for the moment.
2133                      They may be decoded by post-read-conversion.  */
2134                   if (d + 6 > (dst_bytes ? dst_end : src))
2135                     goto label_end_of_loop;
2136                   *d++ = ISO_CODE_ESC;
2137                   *d++ = '%';
2138                   *d++ = 'G';
2139                   produced_chars = 3;
2140                   while (d + 1 < (dst_bytes ? dst_end : src))
2141                     {
2142                       ONE_MORE_BYTE (c1);
2143                       if (c1 == ISO_CODE_ESC
2144                           && src + 1 < src_end
2145                           && src[0] == '%'
2146                           && src[1] == '@')
2147                         {
2148                           src += 2;
2149                           break;
2150                         }
2151                       d += CHAR_STRING (c1, d), produced_chars++;
2152                     }
2153                   if (d + 3 > (dst_bytes ? dst_end : src))
2154                     goto label_end_of_loop;
2155                   *d++ = ISO_CODE_ESC;
2156                   *d++ = '%';
2157                   *d++ = '@';
2158                   dst = d;
2159                   coding->produced_char += produced_chars + 3;
2160                 }
2161               else
2162                 goto label_invalid_code;
2163               continue;
2164
2165             default:
2166               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2167                 goto label_invalid_code;
2168               if (c1 >= 0x28 && c1 <= 0x2B)
2169                 {       /* designation of DIMENSION1_CHARS94 character set */
2170                   ONE_MORE_BYTE (c2);
2171                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2172                 }
2173               else if (c1 >= 0x2C && c1 <= 0x2F)
2174                 {       /* designation of DIMENSION1_CHARS96 character set */
2175                   ONE_MORE_BYTE (c2);
2176                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2177                 }
2178               else
2179                 goto label_invalid_code;
2180               /* We must update these variables now.  */
2181               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2182               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2183               continue;
2184             }
2185         }
2186
2187       /* Now we know CHARSET and 1st position code C1 of a character.
2188          Produce a multibyte sequence for that character while getting
2189          2nd position code C2 if necessary.  */
2190       if (CHARSET_DIMENSION (charset) == 2)
2191         {
2192           ONE_MORE_BYTE (c2);
2193           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2194             /* C2 is not in a valid range.  */
2195             goto label_invalid_code;
2196         }
2197       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2198       EMIT_CHAR (c);
2199       continue;
2200
2201     label_invalid_code:
2202       coding->errors++;
2203       if (COMPOSING_P (coding))
2204         DECODE_COMPOSITION_END ('1');
2205       src = src_base;
2206       c = *src++;
2207       if (! NILP (translation_table))
2208         c = translate_char (translation_table, c, 0, 0, 0);
2209       EMIT_CHAR (c);
2210     }
2211
2212  label_end_of_loop:
2213   coding->consumed = coding->consumed_char = src_base - source;
2214   coding->produced = dst - destination;
2215   return;
2216 }
2217
2218
2219 /* ISO2022 encoding stuff.  */
2220
/*
   Saying just "ISO2022" is not enough when encoding; we have to
   specify more details.  In Emacs, each ISO2022 coding system
   variant has the following specifications:
        1. Initial designation to G0 through G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use Single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JISX0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
2239
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST past the bytes written, and record the
   new designation in CODING.  When the revision number looked up for
   CHARSET is below 255, the sequence is preceded by
   ESC `&' <'@' + revision>.  For a 94-character set whose dimension
   is not 1, the intermediate byte is omitted (short-form) when REG is
   0, CODING has CODING_FLAG_ISO_SHORT_FORM set, and the <final-char>
   of CHARSET is '@', 'A', or 'B'.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *intermediates_94 = "()*+";                              \
    const char *intermediates_96 = ",-./";                              \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    int rev_number = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset); \
    int multibyte_set = CHARSET_DIMENSION (charset) != 1;               \
    int set_of_94 = CHARSET_CHARS (charset) == 94;                      \
                                                                        \
    if (rev_number < 255)                                               \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + rev_number;                                      \
        dst += 3;                                                       \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (multibyte_set)                                                  \
      *dst++ = '$';                                                     \
    /* The intermediate byte is indexed by REG; only the multi-byte     \
       94-character case may drop it.  */                               \
    if (! set_of_94)                                                    \
      *dst++ = (unsigned char) (intermediates_96[reg]);                 \
    else if (! multibyte_set                                            \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_byte < '@' || final_byte > 'B')                   \
      *dst++ = (unsigned char) (intermediates_94[reg]);                 \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2282
/* The two macros below emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3) at DST and mark CODING as being
   in the single-shifting state.  With CODING_FLAG_ISO_SEVEN_BITS set
   the function is written as a two-byte escape sequence, otherwise as
   a single control code.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
2304
/* The four macros below emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) at DST.
   Each one also records in CODING which graphic register (0 through
   3 respectively) is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = 'n';                               \
    dst += 2;                                   \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = 'o';                               \
    dst += 2;                                   \
  } while (0)
2332
/* Emit at DST the single byte for a DIMENSION1 character of CHARSET
   whose position-code is C1.  The macro loops: as long as CHARSET is
   neither single-shifted nor invoked to graphic plane 0 or 1, it
   calls encode_invocation_designation to emit the necessary
   designation and invocation sequences and then tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift covers exactly one character, so the state    \
           is cleared as the character is produced.  */                 \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it      \
       (designating it to some graphic register first if needed),       \
       then go round the loop again to produce the character.  */       \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2365
/* Emit at DST the two bytes for a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2.  The macro loops: as long as
   CHARSET is neither single-shifted nor invoked to graphic plane 0 or
   1, it calls encode_invocation_designation to emit the necessary
   designation and invocation sequences and then tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single-shift covers exactly one character, so the state    \
           is cleared as the character is produced.  */                 \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            dst[0] = c1 & 0x7F;                                         \
            dst[1] = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            dst[0] = c1 | 0x80;                                         \
            dst[1] = c2 | 0x80;                                         \
          }                                                             \
        dst += 2;                                                       \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        dst[0] = c1 & 0x7F;                                             \
        dst[1] = c2 & 0x7F;                                             \
        dst += 2;                                                       \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        dst[0] = c1 | 0x80;                                             \
        dst[1] = c2 | 0x80;                                             \
        dst += 2;                                                       \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not yet invoked to any graphic plane.  Invoke it      \
       (designating it to some graphic register first if needed),       \
       then go round the loop again to produce the character.  */       \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
2398
/* Encode character C at DST.  C is split into a charset and one or
   two position codes.  If the charset is not defined, the position
   codes are written out as they are (the second one only when it is
   not negative).  Otherwise the character goes through the DIMENSION1
   or DIMENSION2 macro according to the dimension of its charset,
   after substituting charset_latin_jisx0201 for ASCII when
   CODING_FLAG_ISO_USE_ROMAN is set, or charset_jisx0208_1978 for
   charset_jisx0208 when CODING_FLAG_ISO_USE_OLDJIS is set.  */

#define ENCODE_ISO_CHARACTER(c)                                 \
  do {                                                          \
    int cset, code1, code2;                                     \
                                                                \
    SPLIT_CHAR (c, cset, code1, code2);                         \
    if (! CHARSET_DEFINED_P (cset))                             \
      {                                                         \
        *dst++ = code1;                                         \
        if (code2 >= 0)                                         \
          *dst++ = code2;                                       \
      }                                                         \
    else if (CHARSET_DIMENSION (cset) != 1)                     \
      {                                                         \
        if (cset == charset_jisx0208                            \
            && (coding->flags & CODING_FLAG_ISO_USE_OLDJIS))    \
          cset = charset_jisx0208_1978;                         \
        ENCODE_ISO_CHARACTER_DIMENSION2 (cset, code1, code2);   \
      }                                                         \
    else                                                        \
      {                                                         \
        if (cset == CHARSET_ASCII                               \
            && (coding->flags & CODING_FLAG_ISO_USE_ROMAN))     \
          cset = charset_latin_jisx0201;                        \
        ENCODE_ISO_CHARACTER_DIMENSION1 (cset, code1);          \
      }                                                         \
  } while (0)
2428
2429
/* Do not encode character C itself; encode
   CODING_REPLACEMENT_CHARACTER in its place, once, or twice when the
   width of C's charset is greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int n_marks = (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) ? 2 : 1;       \
                                                                        \
    while (n_marks-- > 0)                                               \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2438
2439
2440 /* Produce designation and invocation codes at a place pointed by DST
2441    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2442    Return new DST.  */
2443
2444 unsigned char *
2445 encode_invocation_designation (charset, coding, dst)
2446      int charset;
2447      struct coding_system *coding;
2448      unsigned char *dst;
2449 {
2450   int reg;                      /* graphic register number */
2451
2452   /* At first, check designations.  */
2453   for (reg = 0; reg < 4; reg++)
2454     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2455       break;
2456
2457   if (reg >= 4)
2458     {
2459       /* CHARSET is not yet designated to any graphic registers.  */
2460       /* At first check the requested designation.  */
2461       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2462       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2463         /* Since CHARSET requests no special designation, designate it
2464            to graphic register 0.  */
2465         reg = 0;
2466
2467       ENCODE_DESIGNATION (charset, reg, coding);
2468     }
2469
2470   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2471       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2472     {
2473       /* Since the graphic register REG is not invoked to any graphic
2474          planes, invoke it to graphic plane 0.  */
2475       switch (reg)
2476         {
2477         case 0:                 /* graphic register 0 */
2478           ENCODE_SHIFT_IN;
2479           break;
2480
2481         case 1:                 /* graphic register 1 */
2482           ENCODE_SHIFT_OUT;
2483           break;
2484
2485         case 2:                 /* graphic register 2 */
2486           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2487             ENCODE_SINGLE_SHIFT_2;
2488           else
2489             ENCODE_LOCKING_SHIFT_2;
2490           break;
2491
2492         case 3:                 /* graphic register 3 */
2493           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2494             ENCODE_SINGLE_SHIFT_3;
2495           else
2496             ENCODE_LOCKING_SHIFT_3;
2497           break;
2498         }
2499     }
2500
2501   return dst;
2502 }
2503
/* Emit at DST the 2-byte code for encoded composition rule RULE.
   RULE is split by COMPOSITION_DECODE_RULE into two reference values;
   the first is written offset by 32 + 81, the second offset by 32.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref, rule_nref;                                   \
                                                                \
    COMPOSITION_DECODE_RULE (rule, rule_gref, rule_nref);       \
    dst[0] = 32 + 81 + rule_gref;                               \
    dst[1] = 32 + rule_nref;                                    \
    dst += 2;                                                   \
  } while (0)
2513
/* Emit at DST the code announcing the start of a composition
   sequence: ESC 0 for COMPOSITION_RELATIVE, ESC 3 for
   COMPOSITION_WITH_ALTCHARS, and ESC 4 for any other method.  DATA
   points to an array of integers describing the composition (see the
   comment in coding.h for its format); DATA[3] is stored as the
   current composition method of CODING.  For ESC 3 and ESC 4 the
   component index of CODING is also set to four entries past the
   start of the composition data, and the rule-follows flag is
   cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    coding->composing = data[3];                                        \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2533
/* Emit at DST the code ESC 1 that ends the current composition, and
   mark CODING as no longer composing.  The composition data start of
   CODING is advanced by DATA[0]; when that reaches the used size of
   the current composition data block and a next block exists, CODING
   moves on to the beginning of that next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    if (coding->cmp_data->next                                  \
        && coding->cmp_data_start == coding->cmp_data->used)    \
      {                                                         \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
2549
/* Emit at DST the composition start sequence ESC 0 and switch CODING
   to COMPOSITION_RELATIVE.  The sequence does not open a new
   composition here: it marks that the components (alternate chars and
   composition rules) of the composition have just been produced and
   that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)   \
  do {                                          \
    coding->composing = COMPOSITION_RELATIVE;   \
    dst[0] = ISO_CODE_ESC;                      \
    dst[1] = '0';                               \
    dst += 2;                                   \
  } while (0)
2561
/* The following three macros produce codes for indicating direction
   of text.  Each expands to a single statement and uses the variables
   `coding' and `dst' of the invoking function.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC `[' when
   CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags', and the
   single byte ISO_CODE_CSI otherwise.  The flag is tested with `&',
   like everywhere else in this file, because `coding->flags' is a bit
   mask that also carries the other CODING_FLAG_ISO_XXX bits.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R produce the
   introducer followed by `2' `]' and `0' `]' respectively.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2577
/* Emit at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: a
   shift-in if graphic plane 0 does not currently invoke register 0,
   then a designation for every register whose initial charset is
   specified (not negative) and differs from the current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg = 0;                                                            \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (; reg < 4; reg++)                                                  \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0           \
            || (CODING_SPEC_ISO_DESIGNATION (coding, reg)                   \
                == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))      \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                     \
  } while (0)
2592
2593 /* Produce designation sequences of charsets in the line started from
2594    SRC to a place pointed by DST, and return updated DST.
2595
2596    If the current block ends before any end-of-line, we may fail to
2597    find all the necessary designations.  */
2598
2599 static unsigned char *
2600 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2601      struct coding_system *coding;
2602      Lisp_Object translation_table;
2603      const unsigned char *src, *src_end;
2604      unsigned char *dst;
2605 {
2606   int charset, c, found = 0, reg;
2607   /* Table of charsets to be designated to each graphic register.  */
2608   int r[4];
2609
2610   for (reg = 0; reg < 4; reg++)
2611     r[reg] = -1;
2612
2613   while (found < 4)
2614     {
2615       ONE_MORE_CHAR (c);
2616       if (c == '\n')
2617         break;
2618
2619       charset = CHAR_CHARSET (c);
2620       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2621       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2622         {
2623           found++;
2624           r[reg] = charset;
2625         }
2626     }
2627
2628  label_end_of_loop:
2629   if (found)
2630     {
2631       for (reg = 0; reg < 4; reg++)
2632         if (r[reg] >= 0
2633             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2634           ENCODE_DESIGNATION (r[reg], reg, coding);
2635     }
2636
2637   return dst;
2638 }
2639
2640 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2641
2642 static void
2643 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2644      struct coding_system *coding;
2645      const unsigned char *source;
2646      unsigned char *destination;
2647      int src_bytes, dst_bytes;
2648 {
2649   const unsigned char *src = source;
2650   const unsigned char *src_end = source + src_bytes;
2651   unsigned char *dst = destination;
2652   unsigned char *dst_end = destination + dst_bytes;
2653   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2654      from DST_END to assure overflow checking is necessary only at the
2655      head of loop.  */
2656   unsigned char *adjusted_dst_end = dst_end - 19;
2657   /* SRC_BASE remembers the start position in source in each loop.
2658      The loop will be exited when there's not enough source text to
2659      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2660      there's not enough destination area to produce encoded codes
2661      (within macro EMIT_BYTES).  */
2662   const unsigned char *src_base;
2663   int c;
2664   Lisp_Object translation_table;
2665   Lisp_Object safe_chars;
2666
2667   if (coding->flags & CODING_FLAG_ISO_SAFE)
2668     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2669
2670   safe_chars = coding_safe_chars (coding->symbol);
2671
2672   if (NILP (Venable_character_translation))
2673     translation_table = Qnil;
2674   else
2675     {
2676       translation_table = coding->translation_table_for_encode;
2677       if (NILP (translation_table))
2678         translation_table = Vstandard_translation_table_for_encode;
2679     }
2680
2681   coding->consumed_char = 0;
2682   coding->errors = 0;
2683   while (1)
2684     {
2685       src_base = src;
2686
2687       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2688         {
2689           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2690           break;
2691         }
2692
2693       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2694           && CODING_SPEC_ISO_BOL (coding))
2695         {
2696           /* We have to produce designation sequences if any now.  */
2697           dst = encode_designation_at_bol (coding, translation_table,
2698                                            src, src_end, dst);
2699           CODING_SPEC_ISO_BOL (coding) = 0;
2700         }
2701
2702       /* Check composition start and end.  */
2703       if (coding->composing != COMPOSITION_DISABLED
2704           && coding->cmp_data_start < coding->cmp_data->used)
2705         {
2706           struct composition_data *cmp_data = coding->cmp_data;
2707           int *data = cmp_data->data + coding->cmp_data_start;
2708           int this_pos = cmp_data->char_offset + coding->consumed_char;
2709
2710           if (coding->composing == COMPOSITION_RELATIVE)
2711             {
2712               if (this_pos == data[2])
2713                 {
2714                   ENCODE_COMPOSITION_END (coding, data);
2715                   cmp_data = coding->cmp_data;
2716                   data = cmp_data->data + coding->cmp_data_start;
2717                 }
2718             }
2719           else if (COMPOSING_P (coding))
2720             {
2721               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2722               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2723                 /* We have consumed components of the composition.
2724                    What follows in SRC is the composition's base
2725                    text.  */
2726                 ENCODE_COMPOSITION_FAKE_START (coding);
2727               else
2728                 {
2729                   int c = cmp_data->data[coding->cmp_data_index++];
2730                   if (coding->composition_rule_follows)
2731                     {
2732                       ENCODE_COMPOSITION_RULE (c);
2733                       coding->composition_rule_follows = 0;
2734                     }
2735                   else
2736                     {
2737                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2738                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2739                         ENCODE_UNSAFE_CHARACTER (c);
2740                       else
2741                         ENCODE_ISO_CHARACTER (c);
2742                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2743                         coding->composition_rule_follows = 1;
2744                     }
2745                   continue;
2746                 }
2747             }
2748           if (!COMPOSING_P (coding))
2749             {
2750               if (this_pos == data[1])
2751                 {
2752                   ENCODE_COMPOSITION_START (coding, data);
2753                   continue;
2754                 }
2755             }
2756         }
2757
2758       ONE_MORE_CHAR (c);
2759
2760       /* Now encode the character C.  */
2761       if (c < 0x20 || c == 0x7F)
2762         {
2763           if (c == '\r')
2764             {
2765               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2766                 {
2767                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2768                     ENCODE_RESET_PLANE_AND_REGISTER;
2769                   *dst++ = c;
2770                   continue;
2771                 }
2772               /* fall down to treat '\r' as '\n' ...  */
2773               c = '\n';
2774             }
2775           if (c == '\n')
2776             {
2777               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2778                 ENCODE_RESET_PLANE_AND_REGISTER;
2779               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2780                 bcopy (coding->spec.iso2022.initial_designation,
2781                        coding->spec.iso2022.current_designation,
2782                        sizeof coding->spec.iso2022.initial_designation);
2783               if (coding->eol_type == CODING_EOL_LF
2784                   || coding->eol_type == CODING_EOL_UNDECIDED)
2785                 *dst++ = ISO_CODE_LF;
2786               else if (coding->eol_type == CODING_EOL_CRLF)
2787                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2788               else
2789                 *dst++ = ISO_CODE_CR;
2790               CODING_SPEC_ISO_BOL (coding) = 1;
2791             }
2792           else
2793             {
2794               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2795                 ENCODE_RESET_PLANE_AND_REGISTER;
2796               *dst++ = c;
2797             }
2798         }
2799       else if (ASCII_BYTE_P (c))
2800         ENCODE_ISO_CHARACTER (c);
2801       else if (SINGLE_BYTE_CHAR_P (c))
2802         {
2803           *dst++ = c;
2804           coding->errors++;
2805         }
2806       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2807                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2808         ENCODE_UNSAFE_CHARACTER (c);
2809       else
2810         ENCODE_ISO_CHARACTER (c);
2811
2812       coding->consumed_char++;
2813     }
2814
2815  label_end_of_loop:
2816   coding->consumed = src_base - source;
2817   coding->produced = coding->produced_char = dst - destination;
2818 }
2819
2820 \f
2821 /*** 4. SJIS and BIG5 handlers ***/
2822
2823 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2824    quite widely.  So, for the moment, Emacs supports them in the bare
2825    C code.  But, in the future, they may be supported only by CCL.  */
2826
2827 /* SJIS is a coding system encoding three character sets: ASCII, right
2828    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2829    as is.  A character of charset katakana-jisx0201 is encoded by
2830    "position-code + 0x80".  A character of charset japanese-jisx0208
2831    is encoded in 2-byte but two position-codes are divided and shifted
2832    so that it fits in the range below.
2833
2834    --- CODE RANGE of SJIS ---
2835    (character set)      (range)
2836    ASCII                0x00 .. 0x7F
2837    KATAKANA-JISX0201    0xA1 .. 0xDF
2838    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2839             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2840    -------------------------------
2841
2842 */
2843
2844 /* BIG5 is a coding system encoding two character sets: ASCII and
2845    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2846    character set and is encoded in two bytes.
2847
2848    --- CODE RANGE of BIG5 ---
2849    (character set)      (range)
2850    ASCII                0x00 .. 0x7F
2851    Big5 (1st byte)      0xA1 .. 0xFE
2852         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2853    --------------------------
2854
2855    Since the number of characters in Big5 is larger than maximum
2856    characters in Emacs' charset (96x96), it can't be handled as one
2857    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2858    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2859    contains frequently used characters and the latter contains less
2860    frequently used characters.  */
2861
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Each macro argument is evaluated exactly once and every input is
   read before any output is stored.  Hence arbitrary expressions may
   be passed as inputs, and an output may name the same variable as an
   input (e.g. DECODE_BIG5 (c1, c2, charset, c1, c2)).  */

/* Number of Big5 characters which have the same code in 1st byte:
   94 codes for 2nd bytes 0xA1..0xFE plus 63 codes for 2nd bytes
   0x40..0x7E.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 to CHARSET and position-codes C1,
   C2.  The pair is first mapped to a linear index; indices of codes
   whose 1st byte is 0xC9 or above are rebased to 0 and belong to
   `charset_big5_2'.  The index is then split into 94-column rows.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    int big5_b1_ = (b1);                                                \
    int big5_b2_ = (b2);                                                \
    unsigned int big5_temp_                                             \
      = ((big5_b1_ - 0xA1) * BIG5_SAME_ROW                              \
         + big5_b2_ - (big5_b2_ < 0x7F ? 0x40 : 0x62));                 \
    if (big5_b1_ < 0xC9)                                                \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_temp_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
      }                                                                 \
    (c1) = big5_temp_ / (0xFF - 0xA1) + 0x21;                           \
    (c2) = big5_temp_ % (0xFF - 0xA1) + 0x21;                           \
  } while (0)

/* Convert CHARSET and position-codes C1, C2 to the BIG5 byte pair B1,
   B2.  This is the inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_temp_                                             \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    unsigned int big5_low_;                                             \
    if ((charset) == charset_big5_2)                                    \
      big5_temp_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    big5_low_ = big5_temp_ % BIG5_SAME_ROW;                             \
    (b1) = big5_temp_ / BIG5_SAME_ROW + 0xA1;                           \
    /* The first 63 codes of a row map to 2nd bytes 0x40..0x7E, the     \
       remaining 94 codes to 0xA1..0xFE.  */                            \
    (b2) = big5_low_ + (big5_low_ < 0x3F ? 0x40 : 0x62);                \
  } while (0)
2894
2895 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2896    Check if a text is encoded in SJIS.  If it is, return
2897    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2898
2899 static int
2900 detect_coding_sjis (src, src_end, multibytep)
2901      unsigned char *src, *src_end;
2902      int multibytep;
2903 {
2904   int c;
2905   /* Dummy for ONE_MORE_BYTE.  */
2906   struct coding_system dummy_coding;
2907   struct coding_system *coding = &dummy_coding;
2908
2909   while (1)
2910     {
2911       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2912       if (c < 0x80)
2913         continue;
2914       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2915         return 0;
2916       if (c <= 0x9F || c >= 0xE0)
2917         {
2918           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2919           if (c < 0x40 || c == 0x7F || c > 0xFC)
2920             return 0;
2921         }
2922     }
2923  label_end_of_loop:
2924   return CODING_CATEGORY_MASK_SJIS;
2925 }
2926
2927 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2928    Check if a text is encoded in BIG5.  If it is, return
2929    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2930
2931 static int
2932 detect_coding_big5 (src, src_end, multibytep)
2933      unsigned char *src, *src_end;
2934      int multibytep;
2935 {
2936   int c;
2937   /* Dummy for ONE_MORE_BYTE.  */
2938   struct coding_system dummy_coding;
2939   struct coding_system *coding = &dummy_coding;
2940
2941   while (1)
2942     {
2943       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2944       if (c < 0x80)
2945         continue;
2946       if (c < 0xA1 || c > 0xFE)
2947         return 0;
2948       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2949       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2950         return 0;
2951     }
2952  label_end_of_loop:
2953   return CODING_CATEGORY_MASK_BIG5;
2954 }
2955
2956 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2957    Check if a text is encoded in UTF-8.  If it is, return
2958    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2959
2960 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2961 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2962 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2963 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2964 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2965 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2966 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2967
2968 static int
2969 detect_coding_utf_8 (src, src_end, multibytep)
2970      unsigned char *src, *src_end;
2971      int multibytep;
2972 {
2973   unsigned char c;
2974   int seq_maybe_bytes;
2975   /* Dummy for ONE_MORE_BYTE.  */
2976   struct coding_system dummy_coding;
2977   struct coding_system *coding = &dummy_coding;
2978
2979   while (1)
2980     {
2981       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2982       if (UTF_8_1_OCTET_P (c))
2983         continue;
2984       else if (UTF_8_2_OCTET_LEADING_P (c))
2985         seq_maybe_bytes = 1;
2986       else if (UTF_8_3_OCTET_LEADING_P (c))
2987         seq_maybe_bytes = 2;
2988       else if (UTF_8_4_OCTET_LEADING_P (c))
2989         seq_maybe_bytes = 3;
2990       else if (UTF_8_5_OCTET_LEADING_P (c))
2991         seq_maybe_bytes = 4;
2992       else if (UTF_8_6_OCTET_LEADING_P (c))
2993         seq_maybe_bytes = 5;
2994       else
2995         return 0;
2996
2997       do
2998         {
2999           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3000           if (!UTF_8_EXTRA_OCTET_P (c))
3001             return 0;
3002           seq_maybe_bytes--;
3003         }
3004       while (seq_maybe_bytes > 0);
3005     }
3006
3007  label_end_of_loop:
3008   return CODING_CATEGORY_MASK_UTF_8;
3009 }
3010
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking at its first two
   bytes.  If they are the byte order mark in Big Endian order, return
   CODING_CATEGORY_MASK_UTF_16_BE; if they are the mark in Little
   Endian order, return CODING_CATEGORY_MASK_UTF_16_LE; else return 0.  */
3016
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, i.e. 0xD800..0xDBFF.
   The top six bits must be exactly 110110; masking with 0xFC00 tests
   all six of them, so low surrogates and 0xFFFF are rejected.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, i.e. 0xDC00..0xDFFF.
   The top six bits must be exactly 110111.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3026
3027 static int
3028 detect_coding_utf_16 (src, src_end, multibytep)
3029      unsigned char *src, *src_end;
3030      int multibytep;
3031 {
3032   unsigned char c1, c2;
3033   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3034   struct coding_system dummy_coding;
3035   struct coding_system *coding = &dummy_coding;
3036
3037   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3038   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3039
3040   if ((c1 == 0xFF) && (c2 == 0xFE))
3041     return CODING_CATEGORY_MASK_UTF_16_LE;
3042   else if ((c1 == 0xFE) && (c2 == 0xFF))
3043     return CODING_CATEGORY_MASK_UTF_16_BE;
3044
3045  label_end_of_loop:
3046   return 0;
3047 }
3048
3049 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3050    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3051
3052 static void
3053 decode_coding_sjis_big5 (coding, source, destination,
3054                          src_bytes, dst_bytes, sjis_p)
3055      struct coding_system *coding;
3056      const unsigned char *source;
3057      unsigned char  *destination;
3058      int src_bytes, dst_bytes;
3059      int sjis_p;
3060 {
3061   const unsigned char *src = source;
3062   const unsigned char *src_end = source + src_bytes;
3063   unsigned char *dst = destination;
3064   unsigned char *dst_end = destination + dst_bytes;
3065   /* SRC_BASE remembers the start position in source in each loop.
3066      The loop will be exited when there's not enough source code
3067      (within macro ONE_MORE_BYTE), or when there's not enough
3068      destination area to produce a character (within macro
3069      EMIT_CHAR).  */
3070   const unsigned char *src_base;
3071   Lisp_Object translation_table;
3072
3073   if (NILP (Venable_character_translation))
3074     translation_table = Qnil;
3075   else
3076     {
3077       translation_table = coding->translation_table_for_decode;
3078       if (NILP (translation_table))
3079         translation_table = Vstandard_translation_table_for_decode;
3080     }
3081
3082   coding->produced_char = 0;
3083   while (1)
3084     {
3085       int c, charset, c1, c2 = 0;
3086
3087       src_base = src;
3088       ONE_MORE_BYTE (c1);
3089
3090       if (c1 < 0x80)
3091         {
3092           charset = CHARSET_ASCII;
3093           if (c1 < 0x20)
3094             {
3095               if (c1 == '\r')
3096                 {
3097                   if (coding->eol_type == CODING_EOL_CRLF)
3098                     {
3099                       ONE_MORE_BYTE (c2);
3100                       if (c2 == '\n')
3101                         c1 = c2;
3102                       else
3103                         /* To process C2 again, SRC is subtracted by 1.  */
3104                         src--;
3105                     }
3106                   else if (coding->eol_type == CODING_EOL_CR)
3107                     c1 = '\n';
3108                 }
3109               else if (c1 == '\n'
3110                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3111                        && (coding->eol_type == CODING_EOL_CR
3112                            || coding->eol_type == CODING_EOL_CRLF))
3113                 {
3114                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3115                   goto label_end_of_loop;
3116                 }
3117             }
3118         }
3119       else
3120         {
3121           if (sjis_p)
3122             {
3123               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3124                 goto label_invalid_code;
3125               if (c1 <= 0x9F || c1 >= 0xE0)
3126                 {
3127                   /* SJIS -> JISX0208 */
3128                   ONE_MORE_BYTE (c2);
3129                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3130                     goto label_invalid_code;
3131                   DECODE_SJIS (c1, c2, c1, c2);
3132                   charset = charset_jisx0208;
3133                 }
3134               else
3135                 /* SJIS -> JISX0201-Kana */
3136                 charset = charset_katakana_jisx0201;
3137             }
3138           else
3139             {
3140               /* BIG5 -> Big5 */
3141               if (c1 < 0xA0 || c1 > 0xFE)
3142                 goto label_invalid_code;
3143               ONE_MORE_BYTE (c2);
3144               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3145                 goto label_invalid_code;
3146               DECODE_BIG5 (c1, c2, charset, c1, c2);
3147             }
3148         }
3149
3150       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3151       EMIT_CHAR (c);
3152       continue;
3153
3154     label_invalid_code:
3155       coding->errors++;
3156       src = src_base;
3157       c = *src++;
3158       EMIT_CHAR (c);
3159     }
3160
3161  label_end_of_loop:
3162   coding->consumed = coding->consumed_char = src_base - source;
3163   coding->produced = dst - destination;
3164   return;
3165 }
3166
3167 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3168    This function can encode charsets `ascii', `katakana-jisx0201',
3169    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3170    are sure that all these charsets are registered as official charset
3171    (i.e. do not have extended leading-codes).  Characters of other
3172    charsets are produced without any encoding.  If SJIS_P is 1, encode
3173    SJIS text, else encode BIG5 text.  */
3174
3175 static void
3176 encode_coding_sjis_big5 (coding, source, destination,
3177                          src_bytes, dst_bytes, sjis_p)
3178      struct coding_system *coding;
3179      unsigned char *source, *destination;
3180      int src_bytes, dst_bytes;
3181      int sjis_p;
3182 {
3183   unsigned char *src = source;
3184   unsigned char *src_end = source + src_bytes;
3185   unsigned char *dst = destination;
3186   unsigned char *dst_end = destination + dst_bytes;
3187   /* SRC_BASE remembers the start position in source in each loop.
3188      The loop will be exited when there's not enough source text to
3189      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3190      there's not enough destination area to produce encoded codes
3191      (within macro EMIT_BYTES).  */
3192   unsigned char *src_base;
3193   Lisp_Object translation_table;
3194
3195   if (NILP (Venable_character_translation))
3196     translation_table = Qnil;
3197   else
3198     {
3199       translation_table = coding->translation_table_for_encode;
3200       if (NILP (translation_table))
3201         translation_table = Vstandard_translation_table_for_encode;
3202     }
3203
3204   while (1)
3205     {
3206       int c, charset, c1, c2;
3207
3208       src_base = src;
3209       ONE_MORE_CHAR (c);
3210
3211       /* Now encode the character C.  */
3212       if (SINGLE_BYTE_CHAR_P (c))
3213         {
3214           switch (c)
3215             {
3216             case '\r':
3217               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3218                 {
3219                   EMIT_ONE_BYTE (c);
3220                   break;
3221                 }
3222               c = '\n';
3223             case '\n':
3224               if (coding->eol_type == CODING_EOL_CRLF)
3225                 {
3226                   EMIT_TWO_BYTES ('\r', c);
3227                   break;
3228                 }
3229               else if (coding->eol_type == CODING_EOL_CR)
3230                 c = '\r';
3231             default:
3232               EMIT_ONE_BYTE (c);
3233             }
3234         }
3235       else
3236         {
3237           SPLIT_CHAR (c, charset, c1, c2);
3238           if (sjis_p)
3239             {
3240               if (charset == charset_jisx0208
3241                   || charset == charset_jisx0208_1978)
3242                 {
3243                   ENCODE_SJIS (c1, c2, c1, c2);
3244                   EMIT_TWO_BYTES (c1, c2);
3245                 }
3246               else if (charset == charset_katakana_jisx0201)
3247                 EMIT_ONE_BYTE (c1 | 0x80);
3248               else if (charset == charset_latin_jisx0201)
3249                 EMIT_ONE_BYTE (c1);
3250               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3251                 {
3252                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3253                   if (CHARSET_WIDTH (charset) > 1)
3254                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3255                 }
3256               else
3257                 /* There's no way other than producing the internal
3258                    codes as is.  */
3259                 EMIT_BYTES (src_base, src);
3260             }
3261           else
3262             {
3263               if (charset == charset_big5_1 || charset == charset_big5_2)
3264                 {
3265                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3266                   EMIT_TWO_BYTES (c1, c2);
3267                 }
3268               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3269                 {
3270                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3271                   if (CHARSET_WIDTH (charset) > 1)
3272                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3273                 }
3274               else
3275                 /* There's no way other than producing the internal
3276                    codes as is.  */
3277                 EMIT_BYTES (src_base, src);
3278             }
3279         }
3280       coding->consumed_char++;
3281     }
3282
3283  label_end_of_loop:
3284   coding->consumed = src_base - source;
3285   coding->produced = coding->produced_char = dst - destination;
3286 }
3287
3288 \f
3289 /*** 5. CCL handlers ***/
3290
3291 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3292    Check if a text is encoded in a coding system of which
3293    encoder/decoder are written in CCL program.  If it is, return
3294    CODING_CATEGORY_MASK_CCL, else return 0.  */
3295
3296 static int
3297 detect_coding_ccl (src, src_end, multibytep)
3298      unsigned char *src, *src_end;
3299      int multibytep;
3300 {
3301   unsigned char *valid;
3302   int c;
3303   /* Dummy for ONE_MORE_BYTE.  */
3304   struct coding_system dummy_coding;
3305   struct coding_system *coding = &dummy_coding;
3306
3307   /* No coding system is assigned to coding-category-ccl.  */
3308   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3309     return 0;
3310
3311   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3312   while (1)
3313     {
3314       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3315       if (! valid[c])
3316         return 0;
3317     }
3318  label_end_of_loop:
3319   return CODING_CATEGORY_MASK_CCL;
3320 }
3321
3322 \f
3323 /*** 6. End-of-line handlers ***/
3324
3325 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3326
3327 static void
3328 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3329      struct coding_system *coding;
3330      const unsigned char *source;
3331      unsigned char *destination;
3332      int src_bytes, dst_bytes;
3333 {
3334   const unsigned char *src = source;
3335   unsigned char *dst = destination;
3336   const unsigned char *src_end = src + src_bytes;
3337   unsigned char *dst_end = dst + dst_bytes;
3338   Lisp_Object translation_table;
3339   /* SRC_BASE remembers the start position in source in each loop.
3340      The loop will be exited when there's not enough source code
3341      (within macro ONE_MORE_BYTE), or when there's not enough
3342      destination area to produce a character (within macro
3343      EMIT_CHAR).  */
3344   const unsigned char *src_base;
3345   int c;
3346
3347   translation_table = Qnil;
3348   switch (coding->eol_type)
3349     {
3350     case CODING_EOL_CRLF:
3351       while (1)
3352         {
3353           src_base = src;
3354           ONE_MORE_BYTE (c);
3355           if (c == '\r')
3356             {
3357               ONE_MORE_BYTE (c);
3358               if (c != '\n')
3359                 {
3360                   src--;
3361                   c = '\r';
3362                 }
3363             }
3364           else if (c == '\n'
3365                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3366             {
3367               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3368               goto label_end_of_loop;
3369             }
3370           EMIT_CHAR (c);
3371         }
3372       break;
3373
3374     case CODING_EOL_CR:
3375       while (1)
3376         {
3377           src_base = src;
3378           ONE_MORE_BYTE (c);
3379           if (c == '\n')
3380             {
3381               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3382                 {
3383                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3384                   goto label_end_of_loop;
3385                 }
3386             }
3387           else if (c == '\r')
3388             c = '\n';
3389           EMIT_CHAR (c);
3390         }
3391       break;
3392
3393     default:                    /* no need for EOL handling */
3394       while (1)
3395         {
3396           src_base = src;
3397           ONE_MORE_BYTE (c);
3398           EMIT_CHAR (c);
3399         }
3400     }
3401
3402  label_end_of_loop:
3403   coding->consumed = coding->consumed_char = src_base - source;
3404   coding->produced = dst - destination;
3405   return;
3406 }
3407
3408 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3409    format of end-of-line according to `coding->eol_type'.  It also
3410    convert multibyte form 8-bit characters to unibyte if
3411    CODING->src_multibyte is nonzero.  If `coding->mode &
3412    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3413    also means end-of-line.  */
3414
3415 static void
3416 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3417      struct coding_system *coding;
3418      const unsigned char *source;
3419      unsigned char *destination;
3420      int src_bytes, dst_bytes;
3421 {
3422   const unsigned char *src = source;
3423   unsigned char *dst = destination;
3424   const unsigned char *src_end = src + src_bytes;
3425   unsigned char *dst_end = dst + dst_bytes;
3426   Lisp_Object translation_table;
3427   /* SRC_BASE remembers the start position in source in each loop.
3428      The loop will be exited when there's not enough source text to
3429      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3430      there's not enough destination area to produce encoded codes
3431      (within macro EMIT_BYTES).  */
3432   const unsigned char *src_base;
3433   unsigned char *tmp;
3434   int c;
3435   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3436
3437   translation_table = Qnil;
3438   if (coding->src_multibyte
3439       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3440     {
3441       src_end--;
3442       src_bytes--;
3443       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3444     }
3445
3446   if (coding->eol_type == CODING_EOL_CRLF)
3447     {
3448       while (src < src_end)
3449         {
3450           src_base = src;
3451           c = *src++;
3452           if (c >= 0x20)
3453             EMIT_ONE_BYTE (c);
3454           else if (c == '\n' || (c == '\r' && selective_display))
3455             EMIT_TWO_BYTES ('\r', '\n');
3456           else
3457             EMIT_ONE_BYTE (c);
3458         }
3459       src_base = src;
3460     label_end_of_loop:
3461       ;
3462     }
3463   else
3464     {
3465       if (!dst_bytes || src_bytes <= dst_bytes)
3466         {
3467           safe_bcopy (src, dst, src_bytes);
3468           src_base = src_end;
3469           dst += src_bytes;
3470         }
3471       else
3472         {
3473           if (coding->src_multibyte
3474               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3475             dst_bytes--;
3476           safe_bcopy (src, dst, dst_bytes);
3477           src_base = src + dst_bytes;
3478           dst = destination + dst_bytes;
3479           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3480         }
3481       if (coding->eol_type == CODING_EOL_CR)
3482         {
3483           for (tmp = destination; tmp < dst; tmp++)
3484             if (*tmp == '\n') *tmp = '\r';
3485         }
3486       else if (selective_display)
3487         {
3488           for (tmp = destination; tmp < dst; tmp++)
3489             if (*tmp == '\r') *tmp = '\n';
3490         }
3491     }
3492   if (coding->src_multibyte)
3493     dst = destination + str_as_unibyte (destination, dst - destination);
3494
3495   coding->consumed = src_base - source;
3496   coding->produced = dst - destination;
3497   coding->produced_char = coding->produced;
3498 }
3499
3500 \f
3501 /*** 7. C library functions ***/
3502
3503 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3504    has a property `coding-system'.  The value of this property is a
3505    vector of length 5 (called the coding-vector).  Among elements of
3506    this vector, the first (element[0]) and the fifth (element[4])
3507    carry important information for decoding/encoding.  Before
3508    decoding/encoding, this information should be set in fields of a
3509    structure of type `coding_system'.
3510
3511    The value of the property `coding-system' can be a symbol of another
3512    subsidiary coding-system.  In that case, Emacs gets coding-vector
3513    from that symbol.
3514
3515    `element[0]' contains information to be set in `coding->type'.  The
3516    value and its meaning is as follows:
3517
3518    0 -- coding_type_emacs_mule
3519    1 -- coding_type_sjis
3520    2 -- coding_type_iso2022
3521    3 -- coding_type_big5
3522    4 -- coding_type_ccl encoder/decoder written in CCL
3523    nil -- coding_type_no_conversion
3524    t -- coding_type_undecided (automatic conversion on decoding,
3525                                no-conversion on encoding)
3526
3527    `element[4]' contains information to be set in `coding->flags' and
3528    `coding->spec'.  The meaning varies by `coding->type'.
3529
3530    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3531    of length 32 (of which the first 13 sub-elements are used now).
3532    Meanings of these sub-elements are:
3533
3534    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3535         If the value is an integer of valid charset, the charset is
3536         assumed to be designated to graphic register N initially.
3537
        If the value is negative, its absolute value is a charset which
        reserves graphic register N, which means that the charset is
        not designated initially but should be designated to graphic
        register N just before encoding a character in that charset.
3542
3543         If the value is nil, graphic register N is never used on
3544         encoding.
3545
3546    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3547         Each value takes t or nil.  See the section ISO2022 of
3548         `coding.h' for more information.
3549
3550    If `coding->type' is `coding_type_big5', element[4] is t to denote
3551    BIG5-ETen or nil to denote BIG5-HKU.
3552
3553    If `coding->type' takes the other value, element[4] is ignored.
3554
3555    Emacs Lisp's coding systems also carry information about format of
3556    end-of-line in a value of property `eol-type'.  If the value is
3557    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3558    means CODING_EOL_CR.  If it is not integer, it should be a vector
3559    of subsidiary coding systems of which property `eol-type' has one
3560    of the above values.
3561
3562 */
3563
3564 /* Extract information for decoding/encoding from the coding system
3565    symbol CODING_SYSTEM and set it in CODING.  If CODING_SYSTEM is
3566    invalid, CODING is set up so that no conversion is necessary and
3567    -1 is returned; otherwise 0 is returned.

        CODING is zero-cleared first, so every field it held on entry is
        discarded.  In particular coding->cmp_data is reset to NULL
        without being freed here.

        CODING_SYSTEM is treated as invalid when it is nil, when its
        Qcoding_system property is not a 5-element vector whose element[3]
        is a cons, or (for non-symbol coding types) when no integer
        category index can be obtained through Qcoding_category, when
        element[0] is not one of the values handled by the switch below,
        or when the type-specific element[4] is malformed.  On that path
        CODING ends up as coding_type_no_conversion /
        CODING_CATEGORY_IDX_BINARY / CODING_EOL_LF with no common flags
        and no pre/post conversion functions.  */
3568
3569 int
3570 setup_coding_system (coding_system, coding)
3571      Lisp_Object coding_system;
3572      struct coding_system *coding;
3573 {
3574   Lisp_Object coding_spec, coding_type, eol_type, plist;
3575   Lisp_Object val;
3576
3577   /* At first, zero clear all members.  */
3578   bzero (coding, sizeof (struct coding_system));
3579
3580   /* Initialize some fields required for all kinds of coding systems.  */
3581   coding->symbol = coding_system;
3582   coding->heading_ascii = -1;
3583   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3584   coding->composing = COMPOSITION_DISABLED;
3585   coding->cmp_data = NULL;
3586
3587   if (NILP (coding_system))
3588     goto label_invalid_coding_system;
3589
3590   coding_spec = Fget (coding_system, Qcoding_system);
3591
3592   if (!VECTORP (coding_spec)
3593       || XVECTOR (coding_spec)->size != 5
3594       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3595     goto label_invalid_coding_system;
3596
       /* Decide the end-of-line format.  A vector means the format is not
          yet known and must be detected; 1 and 2 select CRLF and CR and
          require both decoding and encoding; anything else selects LF and
          leaves common_flags at the zero set by bzero above.  */
3597   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
       /* NOTE(review): when eol_type is neither a vector nor an integer
          (e.g. nil because inhibit_eol_conversion is set), XFASTINT is
          still applied to it below; presumably the result is never 1 or
          2, so CODING_EOL_LF is chosen -- confirm.  */
3598   if (VECTORP (eol_type))
3599     {
3600       coding->eol_type = CODING_EOL_UNDECIDED;
3601       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3602     }
3603   else if (XFASTINT (eol_type) == 1)
3604     {
3605       coding->eol_type = CODING_EOL_CRLF;
3606       coding->common_flags
3607         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3608     }
3609   else if (XFASTINT (eol_type) == 2)
3610     {
3611       coding->eol_type = CODING_EOL_CR;
3612       coding->common_flags
3613         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3614     }
3615   else
3616     coding->eol_type = CODING_EOL_LF;
3617
3618   coding_type = XVECTOR (coding_spec)->contents[0];
3619   /* Try short cut.  */
       /* A symbol type needs none of the property-list processing below:
          t means "undecided" (detection required), any other symbol
          (including nil) means no conversion.  Returning here leaves the
          pre/post conversion functions at Qnil.  */
3620   if (SYMBOLP (coding_type))
3621     {
3622       if (EQ (coding_type, Qt))
3623         {
3624           coding->type = coding_type_undecided;
3625           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3626         }
3627       else
3628         coding->type = coding_type_no_conversion;
3629       /* Initialize this member.  Any thing other than
3630          CODING_CATEGORY_IDX_UTF_16_BE and
3631          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3632          special treatment in detect_eol.  */
3633       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3634
3635       return 0;
3636     }
3637
3638   /* Get values of coding system properties:
3639      `post-read-conversion', `pre-write-conversion',
3640      `translation-table-for-decode', `translation-table-for-encode'.  */
3641   plist = XVECTOR (coding_spec)->contents[3];
3642   /* Pre & post conversion functions should be disabled if
3643      inhibit_pre_post_conversion is nonzero.  This is the case that a code
3644      conversion function is called while those functions are running.  */
3645   if (! inhibit_pre_post_conversion)
3646     {
3647       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3648       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3649     }
       /* A translation table may be given directly or as a symbol whose
          property holds the table; anything that does not end up as a
          char-table is replaced by nil.  */
3650   val = Fplist_get (plist, Qtranslation_table_for_decode);
3651   if (SYMBOLP (val))
3652     val = Fget (val, Qtranslation_table_for_decode);
3653   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3654   val = Fplist_get (plist, Qtranslation_table_for_encode);
3655   if (SYMBOLP (val))
3656     val = Fget (val, Qtranslation_table_for_encode);
3657   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
       /* The category is mandatory for non-symbol types: it must be
          non-nil and must carry an integer Qcoding_category_index
          property, otherwise the coding system is rejected.  */
3658   val = Fplist_get (plist, Qcoding_category);
3659   if (!NILP (val))
3660     {
3661       val = Fget (val, Qcoding_category_index);
3662       if (INTEGERP (val))
3663         coding->category_idx = XINT (val);
3664       else
3665         goto label_invalid_coding_system;
3666     }
3667   else
3668     goto label_invalid_coding_system;
3669
3670   /* If the coding system has non-nil `composition' property, enable
3671      composition handling.  */
3672   val = Fplist_get (plist, Qcomposition);
3673   if (!NILP (val))
3674     coding->composing = COMPOSITION_NO;
3675
       /* Dispatch on the numeric coding type.  coding_type is not a
          symbol here.  NOTE(review): XFASTINT is applied without an
          INTEGERP check; any value other than 0..5 reaches `default' and
          is rejected.  */
3676   switch (XFASTINT (coding_type))
3677     {
3678     case 0:
3679       coding->type = coding_type_emacs_mule;
3680       coding->common_flags
3681         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
           /* Both masks were already set unconditionally just above, so
              the two conditional ORs below do not change common_flags.  */
3682       if (!NILP (coding->post_read_conversion))
3683         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3684       if (!NILP (coding->pre_write_conversion))
3685         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3686       break;
3687
3688     case 1:
3689       coding->type = coding_type_sjis;
3690       coding->common_flags
3691         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3692       break;
3693
3694     case 2:
3695       coding->type = coding_type_iso2022;
3696       coding->common_flags
3697         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3698       {
             /* This `val' shadows the function-level `val' for the rest
                of this block.  */
3699         Lisp_Object val, temp;
3700         Lisp_Object *flags;
3701         int i, charset, reg_bits = 0;
3702
3703         val = XVECTOR (coding_spec)->contents[4];
3704
3705         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3706           goto label_invalid_coding_system;
3707
             /* Sub-elements 4 through 16 each switch one ISO2022 flag on
                when non-nil.  */
3708         flags = XVECTOR (val)->contents;
3709         coding->flags
3710           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3711              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3712              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3713              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3714              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3715              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3716              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3717              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3718              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3719              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3720              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3721              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3722              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3723              );
3724
3725         /* Invoke graphic register 0 to plane 0.  */
3726         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3727         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3728         CODING_SPEC_ISO_INVOCATION (coding, 1)
3729           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3730         /* Not single shifting at first.  */
3731         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3732         /* Beginning of buffer should also be regarded as bol. */
3733         CODING_SPEC_ISO_BOL (coding) = 1;
3734
             /* Default every charset's revision number to 255, then
                override it from Vcharset_revision_alist for entries whose
                car resolves to a charset and whose cdr is an integer I
                with 0 <= I and I + '@' < 128.  Other entries are
                skipped.  */
3735         for (charset = 0; charset <= MAX_CHARSET; charset++)
3736           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3737         val = Vcharset_revision_alist;
3738         while (CONSP (val))
3739           {
3740             charset = get_charset_id (Fcar_safe (XCAR (val)));
3741             if (charset >= 0
3742                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3743                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3744               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3745             val = XCDR (val);
3746           }
3747
3748         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3749            FLAGS[REG] can be one of below:
3750                 integer CHARSET: CHARSET occupies register I,
3751                 t: designate nothing to REG initially, but can be used
3752                   by any charsets,
3753                 list of integer, nil, or t: designate the first
3754                   element (if integer) to REG initially, the remaining
3755                   elements (if integer) is designated to REG on request,
3756                   if an element is t, REG can be used by any charsets,
3757                 nil: REG is never used.  */
3758         for (charset = 0; charset <= MAX_CHARSET; charset++)
3759           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3760             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
             /* reg_bits collects, as bit I, every register I that was
                marked with t, i.e. usable by any charset.  */
3761         for (i = 0; i < 4; i++)
3762           {
3763             if ((INTEGERP (flags[i])
3764                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3765                 || (charset = get_charset_id (flags[i])) >= 0)
3766               {
3767                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3768                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3769               }
3770             else if (EQ (flags[i], Qt))
3771               {
3772                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3773                 reg_bits |= 1 << i;
3774                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3775               }
3776             else if (CONSP (flags[i]))
3777               {
3778                 Lisp_Object tail;
3779                 tail = flags[i];
3780
3781                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
                     /* Only the first list element can become the initial
                        designation.  */
3782                 if ((INTEGERP (XCAR (tail))
3783                      && (charset = XINT (XCAR (tail)),
3784                          CHARSET_VALID_P (charset)))
3785                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3786                   {
3787                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3788                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3789                   }
3790                 else
3791                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3792                 tail = XCDR (tail);
3793                 while (CONSP (tail))
3794                   {
3795                     if ((INTEGERP (XCAR (tail))
3796                          && (charset = XINT (XCAR (tail)),
3797                              CHARSET_VALID_P (charset)))
3798                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3799                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3800                         = i;
3801                     else if (EQ (XCAR (tail), Qt))
3802                       reg_bits |= 1 << i;
3803                     tail = XCDR (tail);
3804                   }
3805               }
3806             else
3807               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3808
                 /* The current designation starts out equal to the
                    initial one.  */
3809             CODING_SPEC_ISO_DESIGNATION (coding, i)
3810               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3811           }
3812
3813         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3814           {
3815             /* REG 1 can be used only by locking shift in 7-bit env.  */
3816             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3817               reg_bits &= ~2;
3818             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3819               /* Without any shifting, only REG 0 and 1 can be used.  */
3820               reg_bits &= 3;
3821           }
3822
             /* Give every defined charset that has no explicit request a
                default register taken from the still-available
                reg_bits.  */
3823         if (reg_bits)
3824           for (charset = 0; charset <= MAX_CHARSET; charset++)
3825             {
3826               if (CHARSET_DEFINED_P (charset)
3827                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3828                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3829                 {
3830                   /* There exist some default graphic registers to be
3831                      used by CHARSET.  */
3832
3833                   /* We had better avoid designating a charset of
3834                      CHARS96 to REG 0 as far as possible.  */
3835                   if (CHARSET_CHARS (charset) == 96)
3836                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3837                       = (reg_bits & 2
3838                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3839                   else
3840                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3841                       = (reg_bits & 1
3842                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3843                 }
3844             }
3845       }
3846       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3847       coding->spec.iso2022.last_invalid_designation_register = -1;
3848       break;
3849
3850     case 3:
3851       coding->type = coding_type_big5;
3852       coding->common_flags
3853         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3854       coding->flags
3855         = (NILP (XVECTOR (coding_spec)->contents[4])
3856            ? CODING_FLAG_BIG5_HKU
3857            : CODING_FLAG_BIG5_ETEN);
3858       break;
3859
3860     case 4:
3861       coding->type = coding_type_ccl;
3862       coding->common_flags
3863         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3864       {
             /* element[4] must be a cons whose car sets up the decoder
                and whose cdr sets up the encoder; a negative result from
                either setup rejects the coding system.  */
3865         val = XVECTOR (coding_spec)->contents[4];
3866         if (! CONSP (val)
3867             || setup_ccl_program (&(coding->spec.ccl.decoder),
3868                                   XCAR (val)) < 0
3869             || setup_ccl_program (&(coding->spec.ccl.encoder),
3870                                   XCDR (val)) < 0)
3871           goto label_invalid_coding_system;
3872
             /* Build the 256-entry valid_codes table from the
                Qvalid_codes property: each list element is either a
                single code or a (START . END) range.  Elements outside
                0..255, inverted ranges and other objects are silently
                ignored.  */
3873         bzero (coding->spec.ccl.valid_codes, 256);
3874         val = Fplist_get (plist, Qvalid_codes);
3875         if (CONSP (val))
3876           {
3877             Lisp_Object this;
3878
3879             for (; CONSP (val); val = XCDR (val))
3880               {
3881                 this = XCAR (val);
3882                 if (INTEGERP (this)
3883                     && XINT (this) >= 0 && XINT (this) < 256)
3884                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3885                 else if (CONSP (this)
3886                          && INTEGERP (XCAR (this))
3887                          && INTEGERP (XCDR (this)))
3888                   {
3889                     int start = XINT (XCAR (this));
3890                     int end = XINT (XCDR (this));
3891
3892                     if (start >= 0 && start <= end && end < 256)
3893                       while (start <= end)
3894                         coding->spec.ccl.valid_codes[start++] = 1;
3895                   }
3896               }
3897           }
3898       }
3899       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3900       coding->spec.ccl.cr_carryover = 0;
3901       coding->spec.ccl.eight_bit_carryover[0] = 0;
3902       break;
3903
3904     case 5:
           /* Raw text adds no flags of its own; common_flags keeps
              whatever the end-of-line handling above put there.  */
3905       coding->type = coding_type_raw_text;
3906       break;
3907
3908     default:
3909       goto label_invalid_coding_system;
3910     }
3911   return 0;
3912
       /* Common failure exit: overwrite whatever was partially set up
          with a no-conversion configuration.  */
3913  label_invalid_coding_system:
3914   coding->type = coding_type_no_conversion;
3915   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3916   coding->common_flags = 0;
3917   coding->eol_type = CODING_EOL_LF;
3918   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3919   return -1;
3920 }
3921
3922 /* Free memory blocks allocated for storing composition information.  */
3923
3924 void
3925 coding_free_composition_data (coding)
3926      struct coding_system *coding;
3927 {
3928   struct composition_data *cmp_data = coding->cmp_data, *next;
3929
3930   if (!cmp_data)
3931     return;
3932   /* Memory blocks are chained.  At first, rewind to the first, then,
3933      free blocks one by one.  */
3934   while (cmp_data->prev)
3935     cmp_data = cmp_data->prev;
3936   while (cmp_data)
3937     {
3938       next = cmp_data->next;
3939       xfree (cmp_data);
3940       cmp_data = next;
3941     }
3942   coding->cmp_data = NULL;
3943 }
3944
3945 /* Set `char_offset' member of all memory blocks pointed by
3946    coding->cmp_data to POS.  */
3947
3948 void
3949 coding_adjust_composition_offset (coding, pos)
3950      struct coding_system *coding;
3951      int pos;
3952 {
3953   struct composition_data *cmp_data;
3954
3955   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3956     cmp_data->char_offset = pos;
3957 }
3958
3959 /* Setup raw-text or one of its subsidiaries in the structure
3960    coding_system CODING according to the already setup value eol_type
3961    in CODING.  CODING should be setup for some coding system in
3962    advance.  */
3963
3964 void
3965 setup_raw_text_coding_system (coding)
3966      struct coding_system *coding;
3967 {
3968   if (coding->type != coding_type_raw_text)
3969     {
3970       coding->symbol = Qraw_text;
3971       coding->type = coding_type_raw_text;
3972       if (coding->eol_type != CODING_EOL_UNDECIDED)
3973         {
3974           Lisp_Object subsidiaries;
3975           subsidiaries = Fget (Qraw_text, Qeol_type);
3976
3977           if (VECTORP (subsidiaries)
3978               && XVECTOR (subsidiaries)->size == 3)
3979             coding->symbol
3980               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3981         }
3982       setup_coding_system (coding->symbol, coding);
3983     }
3984   return;
3985 }
3986
3987 /* Emacs has a mechanism to automatically detect a coding system if it
3988    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3989    it's impossible to distinguish some coding systems accurately
3990    because they use the same range of codes.  So, at first, coding
3991    systems are grouped into the following categories:
3992
3993    o coding-category-emacs-mule
3994
3995         The category for a coding system which has the same code range
3996         as Emacs' internal format.  Assigned the coding-system (Lisp
3997         symbol) `emacs-mule' by default.
3998
3999    o coding-category-sjis
4000
4001         The category for a coding system which has the same code range
4002         as SJIS.  Assigned the coding-system (Lisp
4003         symbol) `japanese-shift-jis' by default.
4004
4005    o coding-category-iso-7
4006
4007         The category for a coding system which has the same code range
4008         as ISO2022 of 7-bit environment.  This doesn't use any locking
4009         shift and single shift functions.  This can encode/decode all
4010         charsets.  Assigned the coding-system (Lisp symbol)
4011         `iso-2022-7bit' by default.
4012
4013    o coding-category-iso-7-tight
4014
4015         Same as coding-category-iso-7 except that this can
4016         encode/decode only the specified charsets.
4017
4018    o coding-category-iso-8-1
4019
4020         The category for a coding system which has the same code range
4021         as ISO2022 of 8-bit environment and graphic plane 1 used only
4022         for DIMENSION1 charset.  This doesn't use any locking shift
4023         and single shift functions.  Assigned the coding-system (Lisp
4024         symbol) `iso-latin-1' by default.
4025
4026    o coding-category-iso-8-2
4027
4028         The category for a coding system which has the same code range
4029         as ISO2022 of 8-bit environment and graphic plane 1 used only
4030         for DIMENSION2 charset.  This doesn't use any locking shift
4031         and single shift functions.  Assigned the coding-system (Lisp
4032         symbol) `japanese-iso-8bit' by default.
4033
4034    o coding-category-iso-7-else
4035
4036         The category for a coding system which has the same code range
4037         as ISO2022 of 7-bit environment but uses locking shift or
4038         single shift functions.  Assigned the coding-system (Lisp
4039         symbol) `iso-2022-7bit-lock' by default.
4040
4041    o coding-category-iso-8-else
4042
4043         The category for a coding system which has the same code range
4044         as ISO2022 of 8-bit environment but uses locking shift or
4045         single shift functions.  Assigned the coding-system (Lisp
4046         symbol) `iso-2022-8bit-ss2' by default.
4047
4048    o coding-category-big5
4049
4050         The category for a coding system which has the same code range
4051         as BIG5.  Assigned the coding-system (Lisp symbol)
4052         `cn-big5' by default.
4053
4054    o coding-category-utf-8
4055
4056         The category for a coding system which has the same code range
4057         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4058         symbol) `utf-8' by default.
4059
4060    o coding-category-utf-16-be
4061
4062         The category for a coding system in which a text has an
4063         Unicode signature (cf. Unicode Standard) in the order of BIG
4064         endian at the head.  Assigned the coding-system (Lisp symbol)
4065         `utf-16-be' by default.
4066
4067    o coding-category-utf-16-le
4068
4069         The category for a coding system in which a text has an
4070         Unicode signature (cf. Unicode Standard) in the order of
4071         LITTLE endian at the head.  Assigned the coding-system (Lisp
4072         symbol) `utf-16-le' by default.
4073
4074    o coding-category-ccl
4075
4076         The category for a coding system of which encoder/decoder is
4077         written in CCL programs.  The default value is nil, i.e., no
4078         coding system is assigned.
4079
4080    o coding-category-binary
4081
4082         The category for a coding system not categorized in any of the
4083         above.  Assigned the coding-system (Lisp symbol)
4084         `no-conversion' by default.
4085
4086    Each of them is a Lisp symbol and the value is an actual
4087    `coding-system' (this is also a Lisp symbol) assigned by a user.
4088    What Emacs does actually is to detect a category of coding system.
4089    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4090    decide a single possible category, it selects a category of the
4091    highest priority.  Priorities of categories are also specified by a
4092    user in a Lisp variable `coding-category-list'.
4093
4094 */
4095
4096 static
4097 int ascii_skip_code[256];
4098
4099 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4100    If it detects possible coding systems, return an integer in which
4101    appropriate flag bits are set.  Flag bits are defined by macros
4102    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4103    it should point the table `coding_priorities'.  In that case, only
4104    the flag bit for a coding system of the highest priority is set in
4105    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4106    range 0x80..0x9F are in multibyte form.
4107
4108    How many ASCII characters are at the head is returned as *SKIP.  */
4109
4110 static int
4111 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4112      unsigned char *source;
4113      int src_bytes, *priorities, *skip;
4114      int multibytep;
4115 {
4116   register unsigned char c;
4117   unsigned char *src = source, *src_end = source + src_bytes;
4118   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4119   int i;
4120
4121   /* At first, skip all ASCII characters and control characters except
4122      for three ISO2022 specific control characters.  */
4123   ascii_skip_code[ISO_CODE_SO] = 0;
4124   ascii_skip_code[ISO_CODE_SI] = 0;
4125   ascii_skip_code[ISO_CODE_ESC] = 0;
4126
4127  label_loop_detect_coding:
4128   while (src < src_end && ascii_skip_code[*src]) src++;
4129   *skip = src - source;
4130
4131   if (src >= src_end)
4132     /* We found nothing other than ASCII.  There's nothing to do.  */
4133     return 0;
4134
4135   c = *src;
4136   /* The text seems to be encoded in some multilingual coding system.
4137      Now, try to find in which coding system the text is encoded.  */
4138   if (c < 0x80)
4139     {
4140       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4141       /* C is an ISO2022 specific control code of C0.  */
4142       mask = detect_coding_iso2022 (src, src_end, multibytep);
4143       if (mask == 0)
4144         {
4145           /* No valid ISO2022 code follows C.  Try again.  */
4146           src++;
4147           if (c == ISO_CODE_ESC)
4148             ascii_skip_code[ISO_CODE_ESC] = 1;
4149           else
4150             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4151           goto label_loop_detect_coding;
4152         }
4153       if (priorities)
4154         {
4155           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4156             {
4157               if (mask & priorities[i])
4158                 return priorities[i];
4159             }
4160           return CODING_CATEGORY_MASK_RAW_TEXT;
4161         }
4162     }
4163   else
4164     {
4165       int try;
4166
4167       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4168         c = src[1] - 0x20;
4169
4170       if (c < 0xA0)
4171         {
4172           /* C is the first byte of SJIS character code,
4173              or a leading-code of Emacs' internal format (emacs-mule),
4174              or the first byte of UTF-16.  */
4175           try = (CODING_CATEGORY_MASK_SJIS
4176                   | CODING_CATEGORY_MASK_EMACS_MULE
4177                   | CODING_CATEGORY_MASK_UTF_16_BE
4178                   | CODING_CATEGORY_MASK_UTF_16_LE);
4179
4180           /* Or, if C is a special latin extra code,
4181              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4182              or is an ISO2022 control-sequence-introducer (CSI),
4183              we should also consider the possibility of ISO2022 codings.  */
4184           if ((VECTORP (Vlatin_extra_code_table)
4185                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4186               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4187               || (c == ISO_CODE_CSI
4188                   && (src < src_end
4189                       && (*src == ']'
4190                           || ((*src == '0' || *src == '1' || *src == '2')
4191                               && src + 1 < src_end
4192                               && src[1] == ']')))))
4193             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4194                      | CODING_CATEGORY_MASK_ISO_8BIT);
4195         }
4196       else
4197         /* C is a character of ISO2022 in graphic plane right,
4198            or a SJIS's 1-byte character code (i.e. JISX0201),
4199            or the first byte of BIG5's 2-byte code,
4200            or the first byte of UTF-8/16.  */
4201         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4202                 | CODING_CATEGORY_MASK_ISO_8BIT
4203                 | CODING_CATEGORY_MASK_SJIS
4204                 | CODING_CATEGORY_MASK_BIG5
4205                 | CODING_CATEGORY_MASK_UTF_8
4206                 | CODING_CATEGORY_MASK_UTF_16_BE
4207                 | CODING_CATEGORY_MASK_UTF_16_LE);
4208
4209       /* Or, we may have to consider the possibility of CCL.  */
4210       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4211           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4212               ->spec.ccl.valid_codes)[c])
4213         try |= CODING_CATEGORY_MASK_CCL;
4214
4215       mask = 0;
4216       utf16_examined_p = iso2022_examined_p = 0;
4217       if (priorities)
4218         {
4219           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4220             {
4221               if (!iso2022_examined_p
4222                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4223                 {
4224                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4225                   iso2022_examined_p = 1;
4226                 }
4227               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4228                 mask |= detect_coding_sjis (src, src_end, multibytep);
4229               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4230                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4231               else if (!utf16_examined_p
4232                        && (priorities[i] & try &
4233                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4234                 {
4235                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4236                   utf16_examined_p = 1;
4237                 }
4238               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4239                 mask |= detect_coding_big5 (src, src_end, multibytep);
4240               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4241                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4242               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4243                 mask |= detect_coding_ccl (src, src_end, multibytep);
4244               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4245                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4246               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4247                 mask |= CODING_CATEGORY_MASK_BINARY;
4248               if (mask & priorities[i])
4249                 return priorities[i];
4250             }
4251           return CODING_CATEGORY_MASK_RAW_TEXT;
4252         }
4253       if (try & CODING_CATEGORY_MASK_ISO)
4254         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4255       if (try & CODING_CATEGORY_MASK_SJIS)
4256         mask |= detect_coding_sjis (src, src_end, multibytep);
4257       if (try & CODING_CATEGORY_MASK_BIG5)
4258         mask |= detect_coding_big5 (src, src_end, multibytep);
4259       if (try & CODING_CATEGORY_MASK_UTF_8)
4260         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4261       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4262         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4263       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4264         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4265       if (try & CODING_CATEGORY_MASK_CCL)
4266         mask |= detect_coding_ccl (src, src_end, multibytep);
4267     }
4268   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4269 }
4270
4271 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4272    The information of the detected coding system is set in CODING.  */
4273
4274 void
4275 detect_coding (coding, src, src_bytes)
4276      struct coding_system *coding;
4277      const unsigned char *src;
4278      int src_bytes;
4279 {
4280   unsigned int idx;
4281   int skip, mask;
4282   Lisp_Object val;
4283
4284   val = Vcoding_category_list;
4285   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4286                              coding->src_multibyte);
4287   coding->heading_ascii = skip;
4288
4289   if (!mask) return;
4290
4291   /* We found a single coding system of the highest priority in MASK.  */
4292   idx = 0;
4293   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4294   if (! mask)
4295     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4296
4297   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4298
4299   if (coding->eol_type != CODING_EOL_UNDECIDED)
4300     {
4301       Lisp_Object tmp;
4302
4303       tmp = Fget (val, Qeol_type);
4304       if (VECTORP (tmp))
4305         val = XVECTOR (tmp)->contents[coding->eol_type];
4306     }
4307
4308   /* Setup this new coding system while preserving some slots.  */
4309   {
4310     int src_multibyte = coding->src_multibyte;
4311     int dst_multibyte = coding->dst_multibyte;
4312
4313     setup_coding_system (val, coding);
4314     coding->src_multibyte = src_multibyte;
4315     coding->dst_multibyte = dst_multibyte;
4316     coding->heading_ascii = skip;
4317   }
4318 }
4319
4320 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4321    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4322    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4323
4324    How many non-eol characters are at the head is returned as *SKIP.  */
4325
4326 #define MAX_EOL_CHECK_COUNT 3
4327
4328 static int
4329 detect_eol_type (source, src_bytes, skip)
4330      unsigned char *source;
4331      int src_bytes, *skip;
4332 {
4333   unsigned char *src = source, *src_end = src + src_bytes;
4334   unsigned char c;
4335   int total = 0;                /* How many end-of-lines are found so far.  */
4336   int eol_type = CODING_EOL_UNDECIDED;
4337   int this_eol_type;
4338
4339   *skip = 0;
4340
4341   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4342     {
4343       c = *src++;
4344       if (c == '\n' || c == '\r')
4345         {
4346           if (*skip == 0)
4347             *skip = src - 1 - source;
4348           total++;
4349           if (c == '\n')
4350             this_eol_type = CODING_EOL_LF;
4351           else if (src >= src_end || *src != '\n')
4352             this_eol_type = CODING_EOL_CR;
4353           else
4354             this_eol_type = CODING_EOL_CRLF, src++;
4355
4356           if (eol_type == CODING_EOL_UNDECIDED)
4357             /* This is the first end-of-line.  */
4358             eol_type = this_eol_type;
4359           else if (eol_type != this_eol_type)
4360             {
4361               /* The found type is different from what found before.  */
4362               eol_type = CODING_EOL_INCONSISTENT;
4363               break;
4364             }
4365         }
4366     }
4367
4368   if (*skip == 0)
4369     *skip = src_end - source;
4370   return eol_type;
4371 }
4372
4373 /* Like detect_eol_type, but detect EOL type in 2-octet
4374    big-endian/little-endian format for coding systems utf-16-be and
4375    utf-16-le.  */
4376
4377 static int
4378 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4379      unsigned char *source;
4380      int src_bytes, *skip, big_endian_p;
4381 {
4382   unsigned char *src = source, *src_end = src + src_bytes;
4383   unsigned int c1, c2;
4384   int total = 0;                /* How many end-of-lines are found so far.  */
4385   int eol_type = CODING_EOL_UNDECIDED;
4386   int this_eol_type;
4387   int msb, lsb;
4388
4389   if (big_endian_p)
4390     msb = 0, lsb = 1;
4391   else
4392     msb = 1, lsb = 0;
4393
4394   *skip = 0;
4395
4396   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4397     {
4398       c1 = (src[msb] << 8) | (src[lsb]);
4399       src += 2;
4400
4401       if (c1 == '\n' || c1 == '\r')
4402         {
4403           if (*skip == 0)
4404             *skip = src - 2 - source;
4405           total++;
4406           if (c1 == '\n')
4407             {
4408               this_eol_type = CODING_EOL_LF;
4409             }
4410           else
4411             {
4412               if ((src + 1) >= src_end)
4413                 {
4414                   this_eol_type = CODING_EOL_CR;
4415                 }
4416               else
4417                 {
4418                   c2 = (src[msb] << 8) | (src[lsb]);
4419                   if (c2 == '\n')
4420                     this_eol_type = CODING_EOL_CRLF, src += 2;
4421                   else
4422                     this_eol_type = CODING_EOL_CR;
4423                 }
4424             }
4425
4426           if (eol_type == CODING_EOL_UNDECIDED)
4427             /* This is the first end-of-line.  */
4428             eol_type = this_eol_type;
4429           else if (eol_type != this_eol_type)
4430             {
4431               /* The found type is different from what found before.  */
4432               eol_type = CODING_EOL_INCONSISTENT;
4433               break;
4434             }
4435         }
4436     }
4437
4438   if (*skip == 0)
4439     *skip = src_end - source;
4440   return eol_type;
4441 }
4442
4443 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4444    is encoded.  If it detects an appropriate format of end-of-line, it
4445    sets the information in *CODING.  */
4446
4447 void
4448 detect_eol (coding, src, src_bytes)
4449      struct coding_system *coding;
4450      const unsigned char *src;
4451      int src_bytes;
4452 {
4453   Lisp_Object val;
4454   int skip;
4455   int eol_type;
4456
4457   switch (coding->category_idx)
4458     {
4459     case CODING_CATEGORY_IDX_UTF_16_BE:
4460       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4461       break;
4462     case CODING_CATEGORY_IDX_UTF_16_LE:
4463       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4464       break;
4465     default:
4466       eol_type = detect_eol_type (src, src_bytes, &skip);
4467       break;
4468     }
4469
4470   if (coding->heading_ascii > skip)
4471     coding->heading_ascii = skip;
4472   else
4473     skip = coding->heading_ascii;
4474
4475   if (eol_type == CODING_EOL_UNDECIDED)
4476     return;
4477   if (eol_type == CODING_EOL_INCONSISTENT)
4478     {
4479 #if 0
4480       /* This code is suppressed until we find a better way to
4481          distinguish raw text file and binary file.  */
4482
4483       /* If we have already detected that the coding is raw-text, the
4484          coding should actually be no-conversion.  */
4485       if (coding->type == coding_type_raw_text)
4486         {
4487           setup_coding_system (Qno_conversion, coding);
4488           return;
4489         }
4490       /* Else, let's decode only text code anyway.  */
4491 #endif /* 0 */
4492       eol_type = CODING_EOL_LF;
4493     }
4494
4495   val = Fget (coding->symbol, Qeol_type);
4496   if (VECTORP (val) && XVECTOR (val)->size == 3)
4497     {
4498       int src_multibyte = coding->src_multibyte;
4499       int dst_multibyte = coding->dst_multibyte;
4500       struct composition_data *cmp_data = coding->cmp_data;
4501
4502       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4503       coding->src_multibyte = src_multibyte;
4504       coding->dst_multibyte = dst_multibyte;
4505       coding->heading_ascii = skip;
4506       coding->cmp_data = cmp_data;
4507     }
4508 }
4509
4510 #define CONVERSION_BUFFER_EXTRA_ROOM 256
4511
4512 #define DECODING_BUFFER_MAG(coding)                     \
4513   (coding->type == coding_type_iso2022                  \
4514    ? 3                                                  \
4515    : (coding->type == coding_type_ccl                   \
4516       ? coding->spec.ccl.decoder.buf_magnification      \
4517       : 2))
4518
4519 /* Return maximum size (bytes) of a buffer enough for decoding
4520    SRC_BYTES of text encoded in CODING.  */
4521
4522 int
4523 decoding_buffer_size (coding, src_bytes)
4524      struct coding_system *coding;
4525      int src_bytes;
4526 {
4527   return (src_bytes * DECODING_BUFFER_MAG (coding)
4528           + CONVERSION_BUFFER_EXTRA_ROOM);
4529 }
4530
4531 /* Return maximum size (bytes) of a buffer enough for encoding
4532    SRC_BYTES of text to CODING.  */
4533
4534 int
4535 encoding_buffer_size (coding, src_bytes)
4536      struct coding_system *coding;
4537      int src_bytes;
4538 {
4539   int magnification;
4540
4541   if (coding->type == coding_type_ccl)
4542     {
4543       magnification = coding->spec.ccl.encoder.buf_magnification;
4544       if (coding->eol_type == CODING_EOL_CRLF)
4545         magnification *= 2;
4546     }
4547   else if (CODING_REQUIRE_ENCODING (coding))
4548     magnification = 3;
4549   else
4550     magnification = 1;
4551
4552   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4553 }
4554
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* number of bytes allocated for DATA.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   The memory comes from alloca if LEN is less than MAX_ALLOCA, and
   from xmalloc otherwise; BUF.on_stack tells which.  This has to stay
   a macro because alloca is called directly in its expansion.
   BUF and LEN are parenthesized in the expansion, but each is still
   evaluated more than once, so they must be free of side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4578
4579 /* Double the allocated memory for *BUF.  */
4580 static void
4581 extend_conversion_buffer (buf)
4582      struct conversion_buffer *buf;
4583 {
4584   if (buf->on_stack)
4585     {
4586       unsigned char *save = buf->data;
4587       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4588       bcopy (save, buf->data, buf->size);
4589       buf->on_stack = 0;
4590     }
4591   else
4592     {
4593       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4594     }
4595   buf->size *= 2;
4596 }
4597
4598 /* Free the allocated memory for BUF if it is not on stack.  */
4599 static void
4600 free_conversion_buffer (buf)
4601      struct conversion_buffer *buf;
4602 {
4603   if (!buf->on_stack)
4604     xfree (buf->data);
4605 }
4606
4607 int
4608 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4609      struct coding_system *coding;
4610      unsigned char *source, *destination;
4611      int src_bytes, dst_bytes, encodep;
4612 {
4613   struct ccl_program *ccl
4614     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4615   unsigned char *dst = destination;
4616
4617   ccl->suppress_error = coding->suppress_error;
4618   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4619   if (encodep)
4620     {
4621       /* On encoding, EOL format is converted within ccl_driver.  For
4622          that, setup proper information in the structure CCL.  */
4623       ccl->eol_type = coding->eol_type;
4624       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4625         ccl->eol_type = CODING_EOL_LF;
4626       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4627       ccl->eight_bit_control = coding->dst_multibyte;
4628     }
4629   else
4630     ccl->eight_bit_control = 1;
4631   ccl->multibyte = coding->src_multibyte;
4632   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4633     {
4634       /* Move carryover bytes to DESTINATION.  */
4635       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4636       while (*p)
4637         *dst++ = *p++;
4638       coding->spec.ccl.eight_bit_carryover[0] = 0;
4639       if (dst_bytes)
4640         dst_bytes -= dst - destination;
4641     }
4642
4643   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4644                                   &(coding->consumed))
4645                       + dst - destination);
4646
4647   if (encodep)
4648     {
4649       coding->produced_char = coding->produced;
4650       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4651     }
4652   else if (!ccl->eight_bit_control)
4653     {
4654       /* The produced bytes forms a valid multibyte sequence. */
4655       coding->produced_char
4656         = multibyte_chars_in_text (destination, coding->produced);
4657       coding->spec.ccl.eight_bit_carryover[0] = 0;
4658     }
4659   else
4660     {
4661       /* On decoding, the destination should always multibyte.  But,
4662          CCL program might have been generated an invalid multibyte
4663          sequence.  Here we make such a sequence valid as
4664          multibyte.  */
4665       int bytes
4666         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4667
4668       if ((coding->consumed < src_bytes
4669            || !ccl->last_block)
4670           && coding->produced >= 1
4671           && destination[coding->produced - 1] >= 0x80)
4672         {
4673           /* We should not convert the tailing 8-bit codes to
4674              multibyte form even if they doesn't form a valid
4675              multibyte sequence.  They may form a valid sequence in
4676              the next call.  */
4677           int carryover = 0;
4678
4679           if (destination[coding->produced - 1] < 0xA0)
4680             carryover = 1;
4681           else if (coding->produced >= 2)
4682             {
4683               if (destination[coding->produced - 2] >= 0x80)
4684                 {
4685                   if (destination[coding->produced - 2] < 0xA0)
4686                     carryover = 2;
4687                   else if (coding->produced >= 3
4688                            && destination[coding->produced - 3] >= 0x80
4689                            && destination[coding->produced - 3] < 0xA0)
4690                     carryover = 3;
4691                 }
4692             }
4693           if (carryover > 0)
4694             {
4695               BCOPY_SHORT (destination + coding->produced - carryover,
4696                            coding->spec.ccl.eight_bit_carryover,
4697                            carryover);
4698               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4699               coding->produced -= carryover;
4700             }
4701         }
4702       coding->produced = str_as_multibyte (destination, bytes,
4703                                            coding->produced,
4704                                            &(coding->produced_char));
4705     }
4706
4707   switch (ccl->status)
4708     {
4709     case CCL_STAT_SUSPEND_BY_SRC:
4710       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4711       break;
4712     case CCL_STAT_SUSPEND_BY_DST:
4713       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4714       break;
4715     case CCL_STAT_QUIT:
4716     case CCL_STAT_INVALID_CMD:
4717       coding->result = CODING_FINISH_INTERRUPT;
4718       break;
4719     default:
4720       coding->result = CODING_FINISH_NORMAL;
4721       break;
4722     }
4723   return coding->result;
4724 }
4725
4726 /* Decode EOL format of the text at PTR of BYTES length destructively
4727    according to CODING->eol_type.  This is called after the CCL
4728    program produced a decoded text at PTR.  If we do CRLF->LF
4729    conversion, update CODING->produced and CODING->produced_char.  */
4730
4731 static void
4732 decode_eol_post_ccl (coding, ptr, bytes)
4733      struct coding_system *coding;
4734      unsigned char *ptr;
4735      int bytes;
4736 {
4737   Lisp_Object val, saved_coding_symbol;
4738   unsigned char *pend = ptr + bytes;
4739   int dummy;
4740
4741   /* Remember the current coding system symbol.  We set it back when
4742      an inconsistent EOL is found so that `last-coding-system-used' is
4743      set to the coding system that doesn't specify EOL conversion.  */
4744   saved_coding_symbol = coding->symbol;
4745
4746   coding->spec.ccl.cr_carryover = 0;
4747   if (coding->eol_type == CODING_EOL_UNDECIDED)
4748     {
4749       /* Here, to avoid the call of setup_coding_system, we directly
4750          call detect_eol_type.  */
4751       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4752       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4753         coding->eol_type = CODING_EOL_LF;
4754       if (coding->eol_type != CODING_EOL_UNDECIDED)
4755         {
4756           val = Fget (coding->symbol, Qeol_type);
4757           if (VECTORP (val) && XVECTOR (val)->size == 3)
4758             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4759         }
4760       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4761     }
4762
4763   if (coding->eol_type == CODING_EOL_LF
4764       || coding->eol_type == CODING_EOL_UNDECIDED)
4765     {
4766       /* We have nothing to do.  */
4767       ptr = pend;
4768     }
4769   else if (coding->eol_type == CODING_EOL_CRLF)
4770     {
4771       unsigned char *pstart = ptr, *p = ptr;
4772
4773       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4774           && *(pend - 1) == '\r')
4775         {
4776           /* If the last character is CR, we can't handle it here
4777              because LF will be in the not-yet-decoded source text.
4778              Record that the CR is not yet processed.  */
4779           coding->spec.ccl.cr_carryover = 1;
4780           coding->produced--;
4781           coding->produced_char--;
4782           pend--;
4783         }
4784       while (ptr < pend)
4785         {
4786           if (*ptr == '\r')
4787             {
4788               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4789                 {
4790                   *p++ = '\n';
4791                   ptr += 2;
4792                 }
4793               else
4794                 {
4795                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4796                     goto undo_eol_conversion;
4797                   *p++ = *ptr++;
4798                 }
4799             }
4800           else if (*ptr == '\n'
4801                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4802             goto undo_eol_conversion;
4803           else
4804             *p++ = *ptr++;
4805           continue;
4806
4807         undo_eol_conversion:
4808           /* We have faced with inconsistent EOL format at PTR.
4809              Convert all LFs before PTR back to CRLFs.  */
4810           for (p--, ptr--; p >= pstart; p--)
4811             {
4812               if (*p == '\n')
4813                 *ptr-- = '\n', *ptr-- = '\r';
4814               else
4815                 *ptr-- = *p;
4816             }
4817           /*  If carryover is recorded, cancel it because we don't
4818               convert CRLF anymore.  */
4819           if (coding->spec.ccl.cr_carryover)
4820             {
4821               coding->spec.ccl.cr_carryover = 0;
4822               coding->produced++;
4823               coding->produced_char++;
4824               pend++;
4825             }
4826           p = ptr = pend;
4827           coding->eol_type = CODING_EOL_LF;
4828           coding->symbol = saved_coding_symbol;
4829         }
4830       if (p < pend)
4831         {
4832           /* As each two-byte sequence CRLF was converted to LF, (PEND
4833              - P) is the number of deleted characters.  */
4834           coding->produced -= pend - p;
4835           coding->produced_char -= pend - p;
4836         }
4837     }
4838   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4839     {
4840       unsigned char *p = ptr;
4841
4842       for (; ptr < pend; ptr++)
4843         {
4844           if (*ptr == '\r')
4845             *ptr = '\n';
4846           else if (*ptr == '\n'
4847                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4848             {
4849               for (; p < ptr; p++)
4850                 {
4851                   if (*p == '\n')
4852                     *p = '\r';
4853                 }
4854               ptr = pend;
4855               coding->eol_type = CODING_EOL_LF;
4856               coding->symbol = saved_coding_symbol;
4857             }
4858         }
4859     }
4860 }
4861
4862 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4863    decoding, it may detect coding system and format of end-of-line if
4864    those are not yet decided.  The source should be unibyte, the
4865    result is multibyte if CODING->dst_multibyte is nonzero, else
4866    unibyte.  */
4867
4868 int
4869 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4870      struct coding_system *coding;
4871      const unsigned char *source;
4872      unsigned char *destination;
4873      int src_bytes, dst_bytes;
4874 {
4875   int extra = 0;
4876
4877   if (coding->type == coding_type_undecided)
4878     detect_coding (coding, source, src_bytes);
4879
4880   if (coding->eol_type == CODING_EOL_UNDECIDED
4881       && coding->type != coding_type_ccl)
4882     {
4883       detect_eol (coding, source, src_bytes);
4884       /* We had better recover the original eol format if we
4885          encounter an inconsistent eol format while decoding.  */
4886       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4887     }
4888
4889   coding->produced = coding->produced_char = 0;
4890   coding->consumed = coding->consumed_char = 0;
4891   coding->errors = 0;
4892   coding->result = CODING_FINISH_NORMAL;
4893
4894   switch (coding->type)
4895     {
4896     case coding_type_sjis:
4897       decode_coding_sjis_big5 (coding, source, destination,
4898                                src_bytes, dst_bytes, 1);
4899       break;
4900
4901     case coding_type_iso2022:
4902       decode_coding_iso2022 (coding, source, destination,
4903                              src_bytes, dst_bytes);
4904       break;
4905
4906     case coding_type_big5:
4907       decode_coding_sjis_big5 (coding, source, destination,
4908                                src_bytes, dst_bytes, 0);
4909       break;
4910
4911     case coding_type_emacs_mule:
4912       decode_coding_emacs_mule (coding, source, destination,
4913                                 src_bytes, dst_bytes);
4914       break;
4915
4916     case coding_type_ccl:
4917       if (coding->spec.ccl.cr_carryover)
4918         {
4919           /* Put the CR which was not processed by the previous call
4920              of decode_eol_post_ccl in DESTINATION.  It will be
4921              decoded together with the following LF by the call to
4922              decode_eol_post_ccl below.  */
4923           *destination = '\r';
4924           coding->produced++;
4925           coding->produced_char++;
4926           dst_bytes--;
4927           extra = coding->spec.ccl.cr_carryover;
4928         }
4929       ccl_coding_driver (coding, source, destination + extra,
4930                          src_bytes, dst_bytes, 0);
4931       if (coding->eol_type != CODING_EOL_LF)
4932         {
4933           coding->produced += extra;
4934           coding->produced_char += extra;
4935           decode_eol_post_ccl (coding, destination, coding->produced);
4936         }
4937       break;
4938
4939     default:
4940       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4941     }
4942
4943   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4944       && coding->mode & CODING_MODE_LAST_BLOCK
4945       && coding->consumed == src_bytes)
4946     coding->result = CODING_FINISH_NORMAL;
4947
4948   if (coding->mode & CODING_MODE_LAST_BLOCK
4949       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4950     {
4951       const unsigned char *src = source + coding->consumed;
4952       unsigned char *dst = destination + coding->produced;
4953
4954       src_bytes -= coding->consumed;
4955       coding->errors++;
4956       if (COMPOSING_P (coding))
4957         DECODE_COMPOSITION_END ('1');
4958       while (src_bytes--)
4959         {
4960           int c = *src++;
4961           dst += CHAR_STRING (c, dst);
4962           coding->produced_char++;
4963         }
4964       coding->consumed = coding->consumed_char = src - source;
4965       coding->produced = dst - destination;
4966       coding->result = CODING_FINISH_NORMAL;
4967     }
4968
4969   if (!coding->dst_multibyte)
4970     {
4971       coding->produced = str_as_unibyte (destination, coding->produced);
4972       coding->produced_char = coding->produced;
4973     }
4974
4975   return coding->result;
4976 }
4977
4978 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4979    multibyteness of the source is CODING->src_multibyte, the
4980    multibyteness of the result is always unibyte.  */
4981
4982 int
4983 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4984      struct coding_system *coding;
4985      const unsigned char *source;
4986      unsigned char *destination;
4987      int src_bytes, dst_bytes;
4988 {
4989   coding->produced = coding->produced_char = 0;
4990   coding->consumed = coding->consumed_char = 0;
4991   coding->errors = 0;
4992   coding->result = CODING_FINISH_NORMAL;
4993
4994   switch (coding->type)
4995     {
4996     case coding_type_sjis:
4997       encode_coding_sjis_big5 (coding, source, destination,
4998                                src_bytes, dst_bytes, 1);
4999       break;
5000
5001     case coding_type_iso2022:
5002       encode_coding_iso2022 (coding, source, destination,
5003                              src_bytes, dst_bytes);
5004       break;
5005
5006     case coding_type_big5:
5007       encode_coding_sjis_big5 (coding, source, destination,
5008                                src_bytes, dst_bytes, 0);
5009       break;
5010
5011     case coding_type_emacs_mule:
5012       encode_coding_emacs_mule (coding, source, destination,
5013                                 src_bytes, dst_bytes);
5014       break;
5015
5016     case coding_type_ccl:
5017       ccl_coding_driver (coding, source, destination,
5018                          src_bytes, dst_bytes, 1);
5019       break;
5020
5021     default:
5022       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5023     }
5024
5025   if (coding->mode & CODING_MODE_LAST_BLOCK
5026       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5027     {
5028       const unsigned char *src = source + coding->consumed;
5029       unsigned char *dst = destination + coding->produced;
5030
5031       if (coding->type == coding_type_iso2022)
5032         ENCODE_RESET_PLANE_AND_REGISTER;
5033       if (COMPOSING_P (coding))
5034         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5035       if (coding->consumed < src_bytes)
5036         {
5037           int len = src_bytes - coding->consumed;
5038
5039           BCOPY_SHORT (src, dst, len);
5040           if (coding->src_multibyte)
5041             len = str_as_unibyte (dst, len);
5042           dst += len;
5043           coding->consumed = src_bytes;
5044         }
5045       coding->produced = coding->produced_char = dst - destination;
5046       coding->result = CODING_FINISH_NORMAL;
5047     }
5048
5049   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5050       && coding->consumed == src_bytes)
5051     coding->result = CODING_FINISH_NORMAL;
5052
5053   return coding->result;
5054 }
5055
5056 /* Scan text in the region between *BEG and *END (byte positions),
5057    skip characters which we don't have to decode by coding system
5058    CODING at the head and tail, then set *BEG and *END to the region
5059    of the text we actually have to convert.  The caller should move
5060    the gap out of the region in advance if the region is from a
5061    buffer.
5062
5063    If STR is not NULL, *BEG and *END are indices into STR.  */
5064
5065 static void
5066 shrink_decoding_region (beg, end, coding, str)
5067      int *beg, *end;
5068      struct coding_system *coding;
5069      unsigned char *str;
5070 {
5071   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5072   int eol_conversion;
5073   Lisp_Object translation_table;
5074
5075   if (coding->type == coding_type_ccl
5076       || coding->type == coding_type_undecided
5077       || coding->eol_type != CODING_EOL_LF
5078       || !NILP (coding->post_read_conversion)
5079       || coding->composing != COMPOSITION_DISABLED)
5080     {
5081       /* We can't skip any data.  */
5082       return;
5083     }
5084   if (coding->type == coding_type_no_conversion
5085       || coding->type == coding_type_raw_text
5086       || coding->type == coding_type_emacs_mule)
5087     {
5088       /* We need no conversion, but don't have to skip any data here.
5089          Decoding routine handles them effectively anyway.  */
5090       return;
5091     }
5092
5093   translation_table = coding->translation_table_for_decode;
5094   if (NILP (translation_table) && !NILP (Venable_character_translation))
5095     translation_table = Vstandard_translation_table_for_decode;
5096   if (CHAR_TABLE_P (translation_table))
5097     {
5098       int i;
5099       for (i = 0; i < 128; i++)
5100         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5101           break;
5102       if (i < 128)
5103         /* Some ASCII character should be translated.  We give up
5104            shrinking.  */
5105         return;
5106     }
5107
5108   if (coding->heading_ascii >= 0)
5109     /* Detection routine has already found how much we can skip at the
5110        head.  */
5111     *beg += coding->heading_ascii;
5112
5113   if (str)
5114     {
5115       begp_orig = begp = str + *beg;
5116       endp_orig = endp = str + *end;
5117     }
5118   else
5119     {
5120       begp_orig = begp = BYTE_POS_ADDR (*beg);
5121       endp_orig = endp = begp + *end - *beg;
5122     }
5123
5124   eol_conversion = (coding->eol_type == CODING_EOL_CR
5125                     || coding->eol_type == CODING_EOL_CRLF);
5126
5127   switch (coding->type)
5128     {
5129     case coding_type_sjis:
5130     case coding_type_big5:
5131       /* We can skip all ASCII characters at the head.  */
5132       if (coding->heading_ascii < 0)
5133         {
5134           if (eol_conversion)
5135             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5136           else
5137             while (begp < endp && *begp < 0x80) begp++;
5138         }
5139       /* We can skip all ASCII characters at the tail except for the
5140          second byte of SJIS or BIG5 code.  */
5141       if (eol_conversion)
5142         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5143       else
5144         while (begp < endp && endp[-1] < 0x80) endp--;
5145       /* Do not consider LF as ascii if preceded by CR, since that
5146          confuses eol decoding. */
5147       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5148         endp++;
5149       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5150         endp++;
5151       break;
5152
5153     case coding_type_iso2022:
5154       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5155         /* We can't skip any data.  */
5156         break;
5157       if (coding->heading_ascii < 0)
5158         {
5159           /* We can skip all ASCII characters at the head except for a
5160              few control codes.  */
5161           while (begp < endp && (c = *begp) < 0x80
5162                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5163                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5164                  && (!eol_conversion || c != ISO_CODE_LF))
5165             begp++;
5166         }
5167       switch (coding->category_idx)
5168         {
5169         case CODING_CATEGORY_IDX_ISO_8_1:
5170         case CODING_CATEGORY_IDX_ISO_8_2:
5171           /* We can skip all ASCII characters at the tail.  */
5172           if (eol_conversion)
5173             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5174           else
5175             while (begp < endp && endp[-1] < 0x80) endp--;
5176           /* Do not consider LF as ascii if preceded by CR, since that
5177              confuses eol decoding. */
5178           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5179             endp++;
5180           break;
5181
5182         case CODING_CATEGORY_IDX_ISO_7:
5183         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5184           {
5185             /* We can skip all characters at the tail except for 8-bit
5186                codes and ESC and the following 2-byte at the tail.  */
5187             unsigned char *eight_bit = NULL;
5188
5189             if (eol_conversion)
5190               while (begp < endp
5191                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5192                 {
5193                   if (!eight_bit && c & 0x80) eight_bit = endp;
5194                   endp--;
5195                 }
5196             else
5197               while (begp < endp
5198                      && (c = endp[-1]) != ISO_CODE_ESC)
5199                 {
5200                   if (!eight_bit && c & 0x80) eight_bit = endp;
5201                   endp--;
5202                 }
5203             /* Do not consider LF as ascii if preceded by CR, since that
5204                confuses eol decoding. */
5205             if (begp < endp && endp < endp_orig
5206                 && endp[-1] == '\r' && endp[0] == '\n')
5207               endp++;
5208             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5209               {
5210                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5211                   /* This is an ASCII designation sequence.  We can
5212                      surely skip the tail.  But, if we have
5213                      encountered an 8-bit code, skip only the codes
5214                      after that.  */
5215                   endp = eight_bit ? eight_bit : endp + 2;
5216                 else
5217                   /* Hmmm, we can't skip the tail.  */
5218                   endp = endp_orig;
5219               }
5220             else if (eight_bit)
5221               endp = eight_bit;
5222           }
5223         }
5224       break;
5225
5226     default:
5227       abort ();
5228     }
5229   *beg += begp - begp_orig;
5230   *end += endp - endp_orig;
5231   return;
5232 }
5233
5234 /* Like shrink_decoding_region but for encoding.  */
5235
5236 static void
5237 shrink_encoding_region (beg, end, coding, str)
5238      int *beg, *end;
5239      struct coding_system *coding;
5240      unsigned char *str;
5241 {
5242   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5243   int eol_conversion;
5244   Lisp_Object translation_table;
5245
5246   if (coding->type == coding_type_ccl
5247       || coding->eol_type == CODING_EOL_CRLF
5248       || coding->eol_type == CODING_EOL_CR
5249       || (coding->cmp_data && coding->cmp_data->used > 0))
5250     {
5251       /* We can't skip any data.  */
5252       return;
5253     }
5254   if (coding->type == coding_type_no_conversion
5255       || coding->type == coding_type_raw_text
5256       || coding->type == coding_type_emacs_mule
5257       || coding->type == coding_type_undecided)
5258     {
5259       /* We need no conversion, but don't have to skip any data here.
5260          Encoding routine handles them effectively anyway.  */
5261       return;
5262     }
5263
5264   translation_table = coding->translation_table_for_encode;
5265   if (NILP (translation_table) && !NILP (Venable_character_translation))
5266     translation_table = Vstandard_translation_table_for_encode;
5267   if (CHAR_TABLE_P (translation_table))
5268     {
5269       int i;
5270       for (i = 0; i < 128; i++)
5271         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5272           break;
5273       if (i < 128)
5274         /* Some ASCII character should be translated.  We give up
5275            shrinking.  */
5276         return;
5277     }
5278
5279   if (str)
5280     {
5281       begp_orig = begp = str + *beg;
5282       endp_orig = endp = str + *end;
5283     }
5284   else
5285     {
5286       begp_orig = begp = BYTE_POS_ADDR (*beg);
5287       endp_orig = endp = begp + *end - *beg;
5288     }
5289
5290   eol_conversion = (coding->eol_type == CODING_EOL_CR
5291                     || coding->eol_type == CODING_EOL_CRLF);
5292
5293   /* Here, we don't have to check coding->pre_write_conversion because
5294      the caller is expected to have handled it already.  */
5295   switch (coding->type)
5296     {
5297     case coding_type_iso2022:
5298       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5299         /* We can't skip any data.  */
5300         break;
5301       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5302         {
5303           unsigned char *bol = begp;
5304           while (begp < endp && *begp < 0x80)
5305             {
5306               begp++;
5307               if (begp[-1] == '\n')
5308                 bol = begp;
5309             }
5310           begp = bol;
5311           goto label_skip_tail;
5312         }
5313       /* fall down ... */
5314
5315     case coding_type_sjis:
5316     case coding_type_big5:
5317       /* We can skip all ASCII characters at the head and tail.  */
5318       if (eol_conversion)
5319         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5320       else
5321         while (begp < endp && *begp < 0x80) begp++;
5322     label_skip_tail:
5323       if (eol_conversion)
5324         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5325       else
5326         while (begp < endp && *(endp - 1) < 0x80) endp--;
5327       break;
5328
5329     default:
5330       abort ();
5331     }
5332
5333   *beg += begp - begp_orig;
5334   *end += endp - endp_orig;
5335   return;
5336 }
5337
/* Shrinking the conversion region has some overhead of its own, so
   SHRINK_CONVERSION_REGION does nothing unless the region length
   (*END - *BEG) is greater than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG .. *END for CODING with the encoding variant
   if ENCODEP is nonzero, and with the decoding variant otherwise.
   BEG and END are pointers to int; STR is passed through to the
   shrink function.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
      break;                                                            \
    if (!(encodep))                                                     \
      shrink_decoding_region (beg, end, coding, str);                   \
    else                                                                \
      shrink_encoding_region (beg, end, coding, str);                   \
  } while (0)
5351
5352 static Lisp_Object
5353 code_convert_region_unwind (arg)
5354      Lisp_Object arg;
5355 {
5356   inhibit_pre_post_conversion = 0;
5357   Vlast_coding_system_used = arg;
5358   return Qnil;
5359 }
5360
5361 /* Store information about all compositions in the range FROM and TO
5362    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5363    buffer or a string, defaults to the current buffer.  */
5364
5365 void
5366 coding_save_composition (coding, from, to, obj)
5367      struct coding_system *coding;
5368      int from, to;
5369      Lisp_Object obj;
5370 {
5371   Lisp_Object prop;
5372   int start, end;
5373
5374   if (coding->composing == COMPOSITION_DISABLED)
5375     return;
5376   if (!coding->cmp_data)
5377     coding_allocate_composition_data (coding, from);
5378   if (!find_composition (from, to, &start, &end, &prop, obj)
5379       || end > to)
5380     return;
5381   if (start < from
5382       && (!find_composition (end, to, &start, &end, &prop, obj)
5383           || end > to))
5384     return;
5385   coding->composing = COMPOSITION_NO;
5386   do
5387     {
5388       if (COMPOSITION_VALID_P (start, end, prop))
5389         {
5390           enum composition_method method = COMPOSITION_METHOD (prop);
5391           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5392               >= COMPOSITION_DATA_SIZE)
5393             coding_allocate_composition_data (coding, from);
5394           /* For relative composition, we remember start and end
5395              positions, for the other compositions, we also remember
5396              components.  */
5397           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5398           if (method != COMPOSITION_RELATIVE)
5399             {
5400               /* We must store a*/
5401               Lisp_Object val, ch;
5402
5403               val = COMPOSITION_COMPONENTS (prop);
5404               if (CONSP (val))
5405                 while (CONSP (val))
5406                   {
5407                     ch = XCAR (val), val = XCDR (val);
5408                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5409                   }
5410               else if (VECTORP (val) || STRINGP (val))
5411                 {
5412                   int len = (VECTORP (val)
5413                              ? XVECTOR (val)->size : SCHARS (val));
5414                   int i;
5415                   for (i = 0; i < len; i++)
5416                     {
5417                       ch = (STRINGP (val)
5418                             ? Faref (val, make_number (i))
5419                             : XVECTOR (val)->contents[i]);
5420                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5421                     }
5422                 }
5423               else              /* INTEGERP (val) */
5424                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5425             }
5426           CODING_ADD_COMPOSITION_END (coding, end - from);
5427         }
5428       start = end;
5429     }
5430   while (start < to
5431          && find_composition (start, to, &start, &end, &prop, obj)
5432          && end <= to);
5433
5434   /* Make coding->cmp_data point to the first memory block.  */
5435   while (coding->cmp_data->prev)
5436     coding->cmp_data = coding->cmp_data->prev;
5437   coding->cmp_data_start = 0;
5438 }
5439
5440 /* Reflect the saved information about compositions to OBJ.
5441    CODING->cmp_data points to a memory block for the information.  OBJ
5442    is a buffer or a string, defaults to the current buffer.  */
5443
5444 void
5445 coding_restore_composition (coding, obj)
5446      struct coding_system *coding;
5447      Lisp_Object obj;
5448 {
5449   struct composition_data *cmp_data = coding->cmp_data;
5450
5451   if (!cmp_data)
5452     return;
5453
5454   while (cmp_data->prev)
5455     cmp_data = cmp_data->prev;
5456
5457   while (cmp_data)
5458     {
5459       int i;
5460
5461       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5462            i += cmp_data->data[i])
5463         {
5464           int *data = cmp_data->data + i;
5465           enum composition_method method = (enum composition_method) data[3];
5466           Lisp_Object components;
5467
5468           if (data[0] < 0 || i + data[0] > cmp_data->used)
5469             /* Invalid composition data.  */
5470             break;
5471
5472           if (method == COMPOSITION_RELATIVE)
5473             components = Qnil;
5474           else
5475             {
5476               int len = data[0] - 4, j;
5477               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5478
5479               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5480                   && len % 2 == 0)
5481                 len --;
5482               if (len < 1)
5483                 /* Invalid composition data.  */
5484                 break;
5485               for (j = 0; j < len; j++)
5486                 args[j] = make_number (data[4 + j]);
5487               components = (method == COMPOSITION_WITH_ALTCHARS
5488                             ? Fstring (len, args)
5489                             : Fvector (len, args));
5490             }
5491           compose_text (data[1], data[2], components, Qnil, obj);
5492         }
5493       cmp_data = cmp_data->next;
5494     }
5495 }
5496
5497 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5498    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5499    coding system CODING, and return the status code of code conversion
5500    (currently, this value has no meaning; every return path below
        returns 0).
5501
5502    How many characters (and bytes) are converted to how many
5503    characters (and bytes) are recorded in members of the structure
5504    CODING (produced and produced_char; on the normal path also
        consumed and consumed_char).
5505
5506    If REPLACE is nonzero, we do various things as if the original text
5507    is deleted and a new text is inserted.  See the comments in
5508    replace_range (insdel.c) to know what we are doing.
5509
5510    If REPLACE is zero, it is assumed that the source text is unibyte.
5511    Otherwise, it is assumed that the source text is multibyte.  */
5512
5513 int
5514 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5515      int from, from_byte, to, to_byte, encodep, replace;
5516      struct coding_system *coding;
5517 {
5518   int len = to - from, len_byte = to_byte - from_byte;
5519   int nchars_del = 0, nbytes_del = 0;
5520   int require, inserted, inserted_byte;
       /* HEAD_SKIP and TAIL_SKIP are only assigned when the region is
          shrunk below; they are only read when TOTAL_SKIP > 0.  */
5521   int head_skip, tail_skip, total_skip = 0;
5522   Lisp_Object saved_coding_symbol;
5523   int first = 1;
5524   unsigned char *src, *dst;
5525   Lisp_Object deletion;
5526   int orig_point = PT, orig_len = len;
5527   int prev_Z;
5528   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5529
5530   deletion = Qnil;
5531   saved_coding_symbol = coding->symbol;
5532
       /* If point is strictly inside the region, move it to FROM now and
          remember FROM as the position to restore at the end.  */
5533   if (from < PT && PT < to)
5534     {
5535       TEMP_SET_PT_BOTH (from, from_byte);
5536       orig_point = from;
5537     }
5538
5539   if (replace)
5540     {
5541       int saved_from = from;
5542       int saved_inhibit_modification_hooks;
5543
           /* FROM is passed by address and may be changed by this call;
              if so, recompute TO and the byte positions from it.  */
5544       prepare_to_modify_buffer (from, to, &from);
5545       if (saved_from != from)
5546         {
5547           to = from + len;
5548           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5549           len_byte = to_byte - from_byte;
5550         }
5551
5552       /* The code conversion routine can not preserve text properties
5553          for now.  So, we must remove all text properties in the
5554          region.  Here, we must suppress all modification hooks.  */
5555       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5556       inhibit_modification_hooks = 1;
5557       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5558       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5559     }
5560
5561   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5562     {
5563       /* We must detect encoding of text and eol format.  */
5564
           /* The detectors below are given one address and a length, so
              the gap must not lie inside the region.  */
5565       if (from < GPT && to > GPT)
5566         move_gap_both (from, from_byte);
5567       if (coding->type == coding_type_undecided)
5568         {
5569           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5570           if (coding->type == coding_type_undecided)
5571             {
5572               /* It seems that the text contains only ASCII, but we
5573                  should not leave it undecided because the deeper
5574                  decoding routine (decode_coding) tries to detect the
5575                  encodings again in vain.  */
5576               coding->type = coding_type_emacs_mule;
5577               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5578               /* As emacs-mule decoder will handle composition, we
5579                  need this setting to allocate coding->cmp_data
5580                  later.  */
5581               coding->composing = COMPOSITION_NO;
5582             }
5583         }
5584       if (coding->eol_type == CODING_EOL_UNDECIDED
5585           && coding->type != coding_type_ccl)
5586         {
5587           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5588           if (coding->eol_type == CODING_EOL_UNDECIDED)
5589             coding->eol_type = CODING_EOL_LF;
5590           /* We had better recover the original eol format if we
5591              encounter an inconsistent eol format while decoding.  */
5592           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5593         }
5594     }
5595
5596   /* Now we convert the text.  */
5597
5598   /* For encoding, we must process pre-write-conversion in advance.  */
5599   if (! inhibit_pre_post_conversion
5600       && encodep
5601       && SYMBOLP (coding->pre_write_conversion)
5602       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5603     {
5604       /* The function in pre-write-conversion may put a new text in a
5605          new buffer.  */
5606       struct buffer *prev = current_buffer;
5607       Lisp_Object new;
5608
5609       record_unwind_protect (code_convert_region_unwind,
5610                              Vlast_coding_system_used);
5611       /* We should not call any more pre-write/post-read-conversion
5612          functions while this pre-write-conversion is running.  */
5613       inhibit_pre_post_conversion = 1;
5614       call2 (coding->pre_write_conversion,
5615              make_number (from), make_number (to));
5616       inhibit_pre_post_conversion = 0;
5617       /* Discard the unwind protect.  */
5618       specpdl_ptr--;
5619
5620       if (current_buffer != prev)
5621         {
               /* The function left another buffer current.  Replace the
                  region in PREV with LEN characters taken from position
                  1 of that buffer, kill that buffer, and recompute the
                  region bounds and the saved point.  */
5622           len = ZV - BEGV;
5623           new = Fcurrent_buffer ();
5624           set_buffer_internal_1 (prev);
5625           del_range_2 (from, from_byte, to, to_byte, 0);
5626           TEMP_SET_PT_BOTH (from, from_byte);
5627           insert_from_buffer (XBUFFER (new), 1, len, 0);
5628           Fkill_buffer (new);
5629           if (orig_point >= to)
5630             orig_point += len - orig_len;
5631           else if (orig_point > from)
5632             orig_point = from;
5633           orig_len = len;
5634           to = from + len;
5635           from_byte = CHAR_TO_BYTE (from);
5636           to_byte = CHAR_TO_BYTE (to);
5637           len_byte = to_byte - from_byte;
5638           TEMP_SET_PT_BOTH (from, from_byte);
5639         }
5640     }
5641
       /* Remember what is being replaced, for the adjust_after_replace*
          call near the end: the text itself unless undo_list is Qt, in
          which case only its size in characters and bytes.  */
5642   if (replace)
5643     {
5644       if (! EQ (current_buffer->undo_list, Qt))
5645         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5646       else
5647         {
5648           nchars_del = to - from;
5649           nbytes_del = to_byte - from_byte;
5650         }
5651     }
5652
5653   if (coding->composing != COMPOSITION_DISABLED)
5654     {
5655       if (encodep)
5656         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5657       else
5658         coding_allocate_composition_data (coding, from);
5659     }
5660
5661   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5662      if we must run CCL program or there are compositions to
5663      encode.  */
5664   if (coding->type != coding_type_ccl
5665       && (! coding->cmp_data || coding->cmp_data->used == 0))
5666     {
5667       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5668
5669       if (from < GPT && GPT < to)
5670         move_gap_both (from, from_byte);
5671       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* If the whole region was skipped and nothing else remains to
              be done (no post-read-conversion to run when decoding, no
              flushing required), report the text as produced unchanged
              and return early.  */
5672       if (from_byte == to_byte
5673           && (encodep || NILP (coding->post_read_conversion))
5674           && ! CODING_REQUIRE_FLUSHING (coding))
5675         {
5676           coding->produced = len_byte;
5677           coding->produced_char = len;
5678           if (!replace)
5679             /* We must record and adjust for this new text now.  */
5680             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5681           coding_free_composition_data (coding);
5682           return 0;
5683         }
5684
           /* The skips are measured in bytes but are applied to the
              character positions as well -- presumably valid because
              only one-byte (ASCII) characters are ever skipped; confirm
              against the shrink functions.  */
5685       head_skip = from_byte - from_byte_orig;
5686       tail_skip = to_byte_orig - to_byte;
5687       total_skip = head_skip + tail_skip;
5688       from += head_skip;
5689       to -= tail_skip;
5690       len -= total_skip; len_byte -= total_skip;
5691     }
5692
5693   /* For conversion, we must put the gap before the text in addition to
5694      making the gap larger for efficient decoding.  The required gap
5695      size starts from 2000 which is the magic number used in make_gap.
5696      But, after one batch of conversion, it will be incremented if we
5697      find that it is not enough.  */
5698   require = 2000;
5699
5700   if (GAP_SIZE  < require)
5701     make_gap (require - GAP_SIZE);
5702   move_gap_both (from, from_byte);
5703
5704   inserted = inserted_byte = 0;
5705
       /* From here on the source text is accounted as part of the gap
          (it sits at the end of the gap, see the diagrams in the loop
          below) rather than as buffer contents.  */
5706   GAP_SIZE += len_byte;
5707   ZV -= len;
5708   Z -= len;
5709   ZV_BYTE -= len_byte;
5710   Z_BYTE -= len_byte;
5711
5712   if (GPT - BEG < BEG_UNCHANGED)
5713     BEG_UNCHANGED = GPT - BEG;
5714   if (Z - GPT < END_UNCHANGED)
5715     END_UNCHANGED = Z - GPT;
5716
5717   if (!encodep && coding->src_multibyte)
5718     {
5719       /* Decoding routines expects that the source text is unibyte.
5720          We must convert 8-bit characters of multibyte form to
5721          unibyte.  */
5722       int len_byte_orig = len_byte;
5723       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, slide it up so that it still ends
              at GAP_END_ADDR.  */
5724       if (len_byte < len_byte_orig)
5725         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5726                     len_byte);
5727       coding->src_multibyte = 0;
5728     }
5729
       /* Convert in batches.  Each pass converts as much of the
          remaining source as it can into the space at the start of the
          gap; the result code decides whether to stop, retry, or
          enlarge the gap and go on.  */
5730   for (;;)
5731     {
5732       int result;
5733
5734       /* The buffer memory is now:
5735          +--------+converted-text+---------+-------original-text-------+---+
5736          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5737                   |<---------------------- GAP ----------------------->|  */
5738       src = GAP_END_ADDR - len_byte;
5739       dst = GPT_ADDR + inserted_byte;
5740
5741       if (encodep)
5742         result = encode_coding (coding, src, dst, len_byte, 0);
5743       else
5744         {
5745           if (coding->composing != COMPOSITION_DISABLED)
5746             coding->cmp_data->char_offset = from + inserted;
5747           result = decode_coding (coding, src, dst, len_byte, 0);
5748         }
5749
5750       /* The buffer memory is now:
5751          +--------+-------converted-text----+--+------original-text----+---+
5752          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5753                   |<---------------------- GAP ----------------------->|  */
5754
5755       inserted += coding->produced_char;
5756       inserted_byte += coding->produced;
5757       len_byte -= coding->consumed;
5758
5759       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5760         {
               /* Presumably the composition data block filled up:
                  allocate another one and convert the rest.  */
5761           coding_allocate_composition_data (coding, from + inserted);
5762           continue;
5763         }
5764
5765       src += coding->consumed;
5766       dst += coding->produced;
5767
5768       if (result == CODING_FINISH_NORMAL)
5769         {
5770           src += len_byte;
5771           break;
5772         }
5773       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5774         {
5775           unsigned char *pend = dst, *p = pend - inserted_byte;
5776           Lisp_Object eol_type;
5777
5778           /* Encode LFs back to the original eol format (CR or CRLF).  */
5779           if (coding->eol_type == CODING_EOL_CR)
5780             {
5781               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5782             }
5783           else
5784             {
5785               int count = 0;
5786
                   /* Each LF grows by one byte (the added CR), so COUNT
                      extra bytes are needed between DST and SRC.  */
5787               while (p < pend) if (*p++ == '\n') count++;
5788               if (src - dst < count)
5789                 {
5790                   /* We don't have sufficient room for encoding LFs
5791                      back to CRLF.  We must record converted and
5792                      not-yet-converted text back to the buffer
5793                      content, enlarge the gap, then record them out of
5794                      the buffer contents again.  */
5795                   int add = len_byte + inserted_byte;
5796
5797                   GAP_SIZE -= add;
5798                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5799                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5800                   make_gap (count - GAP_SIZE);
5801                   GAP_SIZE += add;
5802                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5803                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5804                   /* Don't forget to update SRC, DST, and PEND.  */
5805                   src = GAP_END_ADDR - len_byte;
5806                   dst = GPT_ADDR + inserted_byte;
5807                   pend = dst;
5808                 }
5809               inserted += count;
5810               inserted_byte += count;
5811               coding->produced += count;
                   /* Copy the converted text backward from its old end
                      (PEND) to its new end (P), inserting a CR before
                      every LF; stop once all COUNT LFs are handled.  */
5812               p = dst = pend + count;
5813               while (count)
5814                 {
5815                   *--p = *--pend;
5816                   if (*p == '\n') count--, *--p = '\r';
5817                 }
5818             }
5819
5820           /* Suppress eol-format conversion in the further conversion.  */
5821           coding->eol_type = CODING_EOL_LF;
5822
5823           /* Set the coding system symbol to that for Unix-like EOL.  */
5824           eol_type = Fget (saved_coding_symbol, Qeol_type);
5825           if (VECTORP (eol_type)
5826               && XVECTOR (eol_type)->size == 3
5827               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5828             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5829           else
5830             coding->symbol = saved_coding_symbol;
5831
5832           continue;
5833         }
5834       if (len_byte <= 0)
5835         {
               /* All source consumed.  For a CCL coding system, run one
                  more pass with CODING_MODE_LAST_BLOCK set before
                  stopping.  */
5836           if (coding->type != coding_type_ccl
5837               || coding->mode & CODING_MODE_LAST_BLOCK)
5838             break;
5839           coding->mode |= CODING_MODE_LAST_BLOCK;
5840           continue;
5841         }
5842       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5843         {
5844           /* The source text ends in invalid codes.  Let's just
5845              make them valid buffer contents, and finish conversion.  */
5846           if (multibyte_p)
5847             {
5848               unsigned char *start = dst;
5849
                   /* Each remaining source byte becomes one character,
                      written in multibyte form by CHAR_STRING.  */
5850               inserted += len_byte;
5851               while (len_byte--)
5852                 {
5853                   int c = *src++;
5854                   dst += CHAR_STRING (c, dst);
5855                 }
5856
5857               inserted_byte += dst - start;
5858             }
5859           else
5860             {
5861               inserted += len_byte;
5862               inserted_byte += len_byte;
5863               while (len_byte--)
5864                 *dst++ = *src++;
5865             }
5866           break;
5867         }
5868       if (result == CODING_FINISH_INTERRUPT)
5869         {
5870           /* The conversion procedure was interrupted by a user.  */
5871           break;
5872         }
5873       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5874       if (coding->consumed < 1)
5875         {
5876           /* It's quite strange to require more memory without
5877              consuming any bytes.  Perhaps CCL program bug.  */
5878           break;
5879         }
5880       if (first)
5881         {
5882           /* We have just done the first batch of conversion which was
5883              stopped because of insufficient gap.  Let's reconsider the
5884              required gap size (i.e. SRC - DST) now.
5885
5886              We have converted ORIG bytes (== coding->consumed) into
5887              NEW bytes (coding->produced).  To convert the remaining
5888              LEN bytes, we may need REQUIRE bytes of gap, where:
5889                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5890                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5891              Here, we are sure that NEW >= ORIG.  */
5892
5893           if (coding->produced <= coding->consumed)
5894             {
5895               /* This happens because of CCL-based coding system with
5896                  eol-type CRLF.  */
5897               require = 0;
5898             }
5899           else
5900             {
5901               float ratio = coding->produced - coding->consumed;
5902               ratio /= coding->consumed;
5903               require = len_byte * ratio;
5904             }
5905           first = 0;
5906         }
5907       if ((src - dst) < (require + 2000))
5908         {
5909           /* See the comment above the previous call of make_gap.  */
5910           int add = len_byte + inserted_byte;
5911
5912           GAP_SIZE -= add;
5913           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5914           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5915           make_gap (require + 2000);
5916           GAP_SIZE += add;
5917           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5918           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5919         }
5920     }
5921   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5922
5923   if (encodep && coding->dst_multibyte)
5924     {
5925       /* The output is unibyte.  We must convert 8-bit characters to
5926          multibyte form.  */
5927       if (inserted_byte * 2 > GAP_SIZE)
5928         {
               /* Same dance as around the earlier make_gap calls: count
                  the converted text as buffer contents while the gap is
                  enlarged, then take it out again.  */
5929           GAP_SIZE -= inserted_byte;
5930           ZV += inserted_byte; Z += inserted_byte;
5931           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5932           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5933           make_gap (inserted_byte - GAP_SIZE);
5934           GAP_SIZE += inserted_byte;
5935           ZV -= inserted_byte; Z -= inserted_byte;
5936           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5937           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5938         }
5939       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5940     }
5941
5942   /* If we shrank the conversion area, adjust it now.  */
5943   if (total_skip > 0)
5944     {
           /* Move the skipped tail, which starts at GAP_END_ADDR, down so
              that it directly follows the converted text, then widen all
              the bookkeeping back to the unshrunk region.  */
5945       if (tail_skip > 0)
5946         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5947       inserted += total_skip; inserted_byte += total_skip;
5948       GAP_SIZE += total_skip;
5949       GPT -= head_skip; GPT_BYTE -= head_skip;
5950       ZV -= total_skip; ZV_BYTE -= total_skip;
5951       Z -= total_skip; Z_BYTE -= total_skip;
5952       from -= head_skip; from_byte -= head_skip;
5953       to += tail_skip; to_byte += tail_skip;
5954     }
5955
       /* Hand the converted text over to the buffer, then recompute
          INSERTED from the resulting change of Z.  */
5956   prev_Z = Z;
5957   if (! EQ (current_buffer->undo_list, Qt))
5958     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5959   else
5960     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5961                                  inserted, inserted_byte);
5962   inserted = Z - prev_Z;
5963
5964   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5965     coding_restore_composition (coding, Fcurrent_buffer ());
5966   coding_free_composition_data (coding);
5967
5968   if (! inhibit_pre_post_conversion
5969       && ! encodep && ! NILP (coding->post_read_conversion))
5970     {
5971       Lisp_Object val;
5972       Lisp_Object saved_coding_system;
5973
5974       if (from != PT)
5975         TEMP_SET_PT_BOTH (from, from_byte);
5976       prev_Z = Z;
5977       record_unwind_protect (code_convert_region_unwind,
5978                              Vlast_coding_system_used);
           /* Expose CODING's symbol through Vlast_coding_system_used
              while the function runs, and take back whatever value that
              variable holds afterwards as the new coding->symbol.  */
5979       saved_coding_system = Vlast_coding_system_used;
5980       Vlast_coding_system_used = coding->symbol;
5981       /* We should not call any more pre-write/post-read-conversion
5982          functions while this post-read-conversion is running.  */
5983       inhibit_pre_post_conversion = 1;
5984       val = call1 (coding->post_read_conversion, make_number (inserted));
5985       inhibit_pre_post_conversion = 0;
5986       coding->symbol = Vlast_coding_system_used;
5987       Vlast_coding_system_used = saved_coding_system;
5988       /* Discard the unwind protect.  */
5989       specpdl_ptr--;
           /* The return value must be a number, but it is not used
              otherwise; the size change is taken from Z instead.  */
5990       CHECK_NUMBER (val);
5991       inserted += Z - prev_Z;
5992     }
5993
       /* Restore point: shifted by the size change if it was at or after
          the end of the original text, else put at FROM.  */
5994   if (orig_point >= from)
5995     {
5996       if (orig_point >= from + orig_len)
5997         orig_point += inserted - orig_len;
5998       else
5999         orig_point = from;
6000       TEMP_SET_PT (orig_point);
6001     }
6002
6003   if (replace)
6004     {
6005       signal_after_change (from, to - from, inserted);
6006       update_compositions (from, from + inserted, CHECK_BORDER);
6007     }
6008
       /* Record the totals for the whole region in CODING.  */
6009   {
6010     coding->consumed = to_byte - from_byte;
6011     coding->consumed_char = to - from;
6012     coding->produced = inserted_byte;
6013     coding->produced_char = inserted;
6014   }
6015
6016   return 0;
6017 }
6018
6019 /* Name (or base name) of the work buffer for code conversion.
        set_conversion_work_buffer hands this value to
        Fget_buffer_create to obtain the buffer.  */
6020 static Lisp_Object Vcode_conversion_workbuf_name;
6021
6022 /* Set the current buffer to the working buffer prepared for
6023    code-conversion.  MULTIBYTE specifies the multibyteness of the
6024    buffer.  */
6025
6026 static struct buffer *
6027 set_conversion_work_buffer (multibyte)
6028      int multibyte;
6029 {
6030   Lisp_Object buffer;
6031   struct buffer *buf;
6032
6033   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6034   buf = XBUFFER (buffer);
6035   delete_all_overlays (buf);
6036   buf->directory = current_buffer->directory;
6037   buf->read_only = Qnil;
6038   buf->filename = Qnil;
6039   buf->undo_list = Qt;
6040   eassert (buf->overlays_before == NULL);
6041   eassert (buf->overlays_after == NULL);
6042   set_buffer_internal (buf);
6043   if (BEG != BEGV || Z != ZV)
6044     Fwiden ();
6045   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6046   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6047   return buf;
6048 }
6049
6050 Lisp_Object
6051 run_pre_post_conversion_on_str (str, coding, encodep)
6052      Lisp_Object str;
6053      struct coding_system *coding;
6054      int encodep;
6055 {
6056   int count = SPECPDL_INDEX ();
6057   struct gcpro gcpro1, gcpro2;
6058   int multibyte = STRING_MULTIBYTE (str);
6059   Lisp_Object old_deactivate_mark;
6060
6061   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6062   record_unwind_protect (code_convert_region_unwind,
6063                          Vlast_coding_system_used);
6064   /* It is not crucial to specbind this.  */
6065   old_deactivate_mark = Vdeactivate_mark;
6066   GCPRO2 (str, old_deactivate_mark);
6067
6068   /* We must insert the contents of STR as is without
6069      unibyte<->multibyte conversion.  For that, we adjust the
6070      multibyteness of the working buffer to that of STR.  */
6071   set_conversion_work_buffer (multibyte);
6072
6073   insert_from_string (str, 0, 0,
6074                       SCHARS (str), SBYTES (str), 0);
6075   UNGCPRO;
6076   inhibit_pre_post_conversion = 1;
6077   if (encodep)
6078     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6079   else
6080     {
6081       Vlast_coding_system_used = coding->symbol;
6082       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6083       call1 (coding->post_read_conversion, make_number (Z - BEG));
6084       coding->symbol = Vlast_coding_system_used;
6085     }
6086   inhibit_pre_post_conversion = 0;
6087   Vdeactivate_mark = old_deactivate_mark;
6088   str = make_buffer_string (BEG, Z, 1);
6089   return unbind_to (count, str);
6090 }
6091
6092
6093 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6094    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6095    is intended that this function is called from encode_terminal_code,
6096    the pre-write-conversion function is run by safe_call and thus
6097    "Error during redisplay: ..." is logged when an error occurs.
6098
6099    Store the resulting text in *STR and set CODING->produced_char and
6100    CODING->produced to the number of characters and bytes
6101    respectively.  If the size of *STR is too small, enlarge it by
6102    xrealloc and update *STR and *SIZE.  */
6103
6104 void
6105 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6106      unsigned char **str;
6107      int *size, nchars, nbytes;
6108      struct coding_system *coding;
6109 {
6110   struct gcpro gcpro1, gcpro2;
6111   struct buffer *cur = current_buffer;
6112   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6113   Lisp_Object args[3];
6114
6115   /* It is not crucial to specbind this.  */
6116   old_deactivate_mark = Vdeactivate_mark;
6117   old_last_coding_system_used = Vlast_coding_system_used;
6118   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6119
6120   /* We must insert the contents of STR as is without
6121      unibyte<->multibyte conversion.  For that, we adjust the
6122      multibyteness of the working buffer to that of STR.  */
6123   set_conversion_work_buffer (coding->src_multibyte);
6124   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6125   UNGCPRO;
6126   inhibit_pre_post_conversion = 1;
6127   args[0] = coding->pre_write_conversion;
6128   args[1] = make_number (BEG);
6129   args[2] = make_number (Z);
6130   safe_call (3, args);
6131   inhibit_pre_post_conversion = 0;
6132   Vdeactivate_mark = old_deactivate_mark;
6133   Vlast_coding_system_used = old_last_coding_system_used;
6134   coding->produced_char = Z - BEG;
6135   coding->produced = Z_BYTE - BEG_BYTE;
6136   if (coding->produced > *size)
6137     {
6138       *size = coding->produced;
6139       *str = xrealloc (*str, *size);
6140     }
6141   if (BEG < GPT && GPT < Z)
6142     move_gap (BEG);
6143   bcopy (BEG_ADDR, *str, coding->produced);
6144   coding->src_multibyte
6145     = ! NILP (current_buffer->enable_multibyte_characters);
6146   set_buffer_internal (cur);
6147 }
6148
6149
/* Decode the text of STR according to CODING and return the decoded
   text as a string.

   If CODING requires detection, the text encoding and/or the
   end-of-line format are first detected from the bytes of STR.  A
   multibyte STR is then converted with Fstring_as_unibyte, because
   the decoding routines expect unibyte source text.

   If no decoding turns out to be necessary and
   CODING->post_read_conversion is not a symbol with a function
   definition, the text is returned without going through a
   conversion buffer; an extra copy is made by Fcopy_sequence only
   when NOCOPY is zero and neither Fstring_as_unibyte nor
   Fstring_as_multibyte was applied.  Otherwise a new string is
   made, and the post-read-conversion function, if any, is run on it
   via run_pre_post_conversion_on_str.

   CODING->consumed, CODING->consumed_char, CODING->produced and
   CODING->produced_char are set along the way.  */
6150 Lisp_Object
6151 decode_coding_string (str, coding, nocopy)
6152      Lisp_Object str;
6153      struct coding_system *coding;
6154      int nocopy;
6155 {
6156   int len;
6157   struct conversion_buffer buf;
6158   int from, to_byte;
6159   Lisp_Object saved_coding_symbol;
6160   int result;
6161   int require_decoding;
6162   int shrinked_bytes = 0;
6163   Lisp_Object newstr;
6164   int consumed, consumed_char, produced, produced_char;
6165
       /* FROM and TO_BYTE are byte offsets into STR that delimit the
          part to be decoded.  */
6166   from = 0;
6167   to_byte = SBYTES (str);
6168
6169   saved_coding_symbol = coding->symbol;
6170   coding->src_multibyte = STRING_MULTIBYTE (str);
6171   coding->dst_multibyte = 1;
6172   if (CODING_REQUIRE_DETECTION (coding))
6173     {
6174       /* See the comments in code_convert_region.  */
6175       if (coding->type == coding_type_undecided)
6176         {
6177           detect_coding (coding, SDATA (str), to_byte);
6178           if (coding->type == coding_type_undecided)
6179             {
6180               coding->type = coding_type_emacs_mule;
6181               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6182               /* As emacs-mule decoder will handle composition, we
6183                  need this setting to allocate coding->cmp_data
6184                  later.  */
6185               coding->composing = COMPOSITION_NO;
6186             }
6187         }
6188       if (coding->eol_type == CODING_EOL_UNDECIDED
6189           && coding->type != coding_type_ccl)
6190         {
6191           saved_coding_symbol = coding->symbol;
6192           detect_eol (coding, SDATA (str), to_byte);
6193           if (coding->eol_type == CODING_EOL_UNDECIDED)
6194             coding->eol_type = CODING_EOL_LF;
6195           /* We had better recover the original eol format if we
6196              encounter an inconsistent eol format while decoding.  */
6197           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6198         }
6199     }
6200
6201   if (coding->type == coding_type_no_conversion
6202       || coding->type == coding_type_raw_text)
6203     coding->dst_multibyte = 0;
6204
6205   require_decoding = CODING_REQUIRE_DECODING (coding);
6206
6207   if (STRING_MULTIBYTE (str))
6208     {
6209       /* Decoding routines expect the source text to be unibyte.  */
6210       str = Fstring_as_unibyte (str);
6211       to_byte = SBYTES (str);
6212       nocopy = 1;
6213       coding->src_multibyte = 0;
6214     }
6215
6216   /* Try to skip the heading and tailing ASCIIs.  */
6217   if (require_decoding && coding->type != coding_type_ccl)
6218     {
6219       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6220                                 0);
6221       if (from == to_byte)
6222         require_decoding = 0;
           /* Total number of bytes skipped at the head and at the tail;
              they are copied to the result unchanged below.  */
6223       shrinked_bytes = from + (SBYTES (str) - to_byte);
6224     }
6225
       /* Nothing to decode and no post-read-conversion function to run:
          return the text without using a conversion buffer.  */
6226   if (!require_decoding
6227       && !(SYMBOLP (coding->post_read_conversion)
6228            && !NILP (Ffboundp (coding->post_read_conversion))))
6229     {
6230       coding->consumed = SBYTES (str);
6231       coding->consumed_char = SCHARS (str);
6232       if (coding->dst_multibyte)
6233         {
6234           str = Fstring_as_multibyte (str);
6235           nocopy = 1;
6236         }
6237       coding->produced = SBYTES (str);
6238       coding->produced_char = SCHARS (str);
6239       return (nocopy ? str : Fcopy_sequence (str));
6240     }
6241
6242   if (coding->composing != COMPOSITION_DISABLED)
6243     coding_allocate_composition_data (coding, from);
6244   len = decoding_buffer_size (coding, to_byte - from);
6245   allocate_conversion_buffer (buf, len);
6246
       /* Call decode_coding repeatedly, accumulating the totals of what
          was consumed and produced, until it finishes normally, is
          interrupted, or reports insufficient source without having
          consumed anything.  */
6247   consumed = consumed_char = produced = produced_char = 0;
6248   while (1)
6249     {
6250       result = decode_coding (coding, SDATA (str) + from + consumed,
6251                               buf.data + produced, to_byte - from - consumed,
6252                               buf.size - produced);
6253       consumed += coding->consumed;
6254       consumed_char += coding->consumed_char;
6255       produced += coding->produced;
6256       produced_char += coding->produced_char;
6257       if (result == CODING_FINISH_NORMAL
6258           || result == CODING_FINISH_INTERRUPT
6259           || (result == CODING_FINISH_INSUFFICIENT_SRC
6260               && coding->consumed == 0))
6261         break;
6262       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6263         coding_allocate_composition_data (coding, from + produced_char);
6264       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6265         extend_conversion_buffer (&buf);
6266       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6267         {
6268           Lisp_Object eol_type;
6269
6270           /* Recover the original EOL format.  */
6271           if (coding->eol_type == CODING_EOL_CR)
6272             {
6273               unsigned char *p;
6274               for (p = buf.data; p < buf.data + produced; p++)
6275                 if (*p == '\n') *p = '\r';
6276             }
6277           else if (coding->eol_type == CODING_EOL_CRLF)
6278             {
6279               int num_eol = 0;
6280               unsigned char *p0, *p1;
6281               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6282                 if (*p0 == '\n') num_eol++;
6283               if (produced + num_eol >= buf.size)
6284                 extend_conversion_buffer (&buf);
                   /* Copy the text NUM_EOL bytes toward the end, working
                      from back to front, and put a '\r' in front of each
                      '\n' on the way.  */
6285               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6286                 {
6287                   *--p1 = *--p0;
6288                   if (*p0 == '\n') *--p1 = '\r';
6289                 }
6290               produced += num_eol;
6291               produced_char += num_eol;
6292             }
6293           /* Suppress eol-format conversion in the further conversion.  */
6294           coding->eol_type = CODING_EOL_LF;
6295
6296           /* Set the coding system symbol to that for Unix-like EOL.  */
6297           eol_type = Fget (saved_coding_symbol, Qeol_type);
6298           if (VECTORP (eol_type)
6299               && XVECTOR (eol_type)->size == 3
6300               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6301             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6302           else
6303             coding->symbol = saved_coding_symbol;
6304
6305
6306         }
6307     }
6308
6309   coding->consumed = consumed;
6310   coding->consumed_char = consumed_char;
6311   coding->produced = produced;
6312   coding->produced_char = produced_char;
6313
       /* Build the result: the skipped head of STR, the decoded text,
          and the skipped tail of STR.  */
6314   if (coding->dst_multibyte)
6315     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6316                                            produced + shrinked_bytes);
6317   else
6318     newstr = make_uninit_string (produced + shrinked_bytes);
6319   if (from > 0)
6320     STRING_COPYIN (newstr, 0, SDATA (str), from);
6321   STRING_COPYIN (newstr, from, buf.data, produced);
6322   if (shrinked_bytes > from)
6323     STRING_COPYIN (newstr, from + produced,
6324                    SDATA (str) + to_byte,
6325                    shrinked_bytes - from);
6326   free_conversion_buffer (&buf);
6327
       /* The skipped bytes are counted as one character each.  */
6328   coding->consumed += shrinked_bytes;
6329   coding->consumed_char += shrinked_bytes;
6330   coding->produced += shrinked_bytes;
6331   coding->produced_char += shrinked_bytes;
6332
6333   if (coding->cmp_data && coding->cmp_data->used)
6334     coding_restore_composition (coding, newstr);
6335   coding_free_composition_data (coding);
6336
6337   if (SYMBOLP (coding->post_read_conversion)
6338       && !NILP (Ffboundp (coding->post_read_conversion)))
6339     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6340
6341   return newstr;
6342 }
6343
/* Encode the text of STR according to CODING and return the result
   as a string.

   If CODING->pre_write_conversion is a symbol with a function
   definition, it is first run on STR via
   run_pre_post_conversion_on_str, and the text it leaves is what
   gets encoded.

   If CODING requires no encoding, or if the whole text is skipped
   by SHRINK_CONVERSION_REGION, no conversion buffer is used: a
   multibyte STR is made unibyte (in place when NOCOPY is non-zero,
   with Fstring_as_unibyte otherwise) and returned, and a unibyte
   STR is returned as is when NOCOPY is non-zero or copied with
   Fcopy_sequence otherwise.  In all other cases a new string is
   made with make_uninit_string.

   CODING->consumed, CODING->consumed_char, CODING->produced and
   CODING->produced_char are set along the way.  */
6344 Lisp_Object
6345 encode_coding_string (str, coding, nocopy)
6346      Lisp_Object str;
6347      struct coding_system *coding;
6348      int nocopy;
6349 {
6350   int len;
6351   struct conversion_buffer buf;
6352   int from, to, to_byte;
6353   int result;
6354   int shrinked_bytes = 0;
6355   Lisp_Object newstr;
6356   int consumed, consumed_char, produced, produced_char;
6357
6358   if (SYMBOLP (coding->pre_write_conversion)
6359       && !NILP (Ffboundp (coding->pre_write_conversion)))
6360     {
6361       str = run_pre_post_conversion_on_str (str, coding, 1);
6362       /* As STR is just newly generated, we don't have to copy it
6363          anymore.  */
6364       nocopy = 1;
6365     }
6366
       /* FROM and TO_BYTE are byte offsets into STR that delimit the
          part to be encoded; TO is the length of STR in characters.  */
6367   from = 0;
6368   to = SCHARS (str);
6369   to_byte = SBYTES (str);
6370
6371   /* Encoding routines determine the multibyteness of the source text
6372      by coding->src_multibyte.  */
6373   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6374   coding->dst_multibyte = 0;
6375   if (! CODING_REQUIRE_ENCODING (coding))
6376     goto no_need_of_encoding;
6377
6378   if (coding->composing != COMPOSITION_DISABLED)
6379     coding_save_composition (coding, from, to, str);
6380
6381   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6382      if we must run CCL program or there are compositions to
6383      encode.  */
6384   if (coding->type != coding_type_ccl
6385       && (! coding->cmp_data || coding->cmp_data->used == 0))
6386     {
6387       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6388                                 1);
6389       if (from == to_byte)
6390         {
6391           coding_free_composition_data (coding);
6392           goto no_need_of_encoding;
6393         }
           /* Total number of bytes skipped at the head and at the tail;
              they are copied to the result unchanged below.  */
6394       shrinked_bytes = from + (SBYTES (str) - to_byte);
6395     }
6396
6397   len = encoding_buffer_size (coding, to_byte - from);
6398   allocate_conversion_buffer (buf, len);
6399
       /* Call encode_coding repeatedly, accumulating the totals of what
          was consumed and produced, and enlarging the conversion buffer
          whenever the loop has to go round again.  */
6400   consumed = consumed_char = produced = produced_char = 0;
6401   while (1)
6402     {
6403       result = encode_coding (coding, SDATA (str) + from + consumed,
6404                               buf.data + produced, to_byte - from - consumed,
6405                               buf.size - produced);
6406       consumed += coding->consumed;
6407       consumed_char += coding->consumed_char;
6408       produced += coding->produced;
6409       produced_char += coding->produced_char;
6410       if (result == CODING_FINISH_NORMAL
6411           || result == CODING_FINISH_INTERRUPT
6412           || (result == CODING_FINISH_INSUFFICIENT_SRC
6413               && coding->consumed == 0))
6414         break;
6415       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6416       extend_conversion_buffer (&buf);
6417     }
6418
6419   coding->consumed = consumed;
6420   coding->consumed_char = consumed_char;
6421   coding->produced = produced;
6422   coding->produced_char = produced_char;
6423
       /* Build the result: the skipped head of STR, the encoded text,
          and the skipped tail of STR.  */
6424   newstr = make_uninit_string (produced + shrinked_bytes);
6425   if (from > 0)
6426     STRING_COPYIN (newstr, 0, SDATA (str), from);
6427   STRING_COPYIN (newstr, from, buf.data, produced);
6428   if (shrinked_bytes > from)
6429     STRING_COPYIN (newstr, from + produced,
6430                    SDATA (str) + to_byte,
6431                    shrinked_bytes - from);
6432
6433   free_conversion_buffer (&buf);
6434   coding_free_composition_data (coding);
6435
6436   return newstr;
6437
       /* Reached before any conversion buffer has been allocated: hand
          back the text of STR itself as a unibyte string.  */
6438  no_need_of_encoding:
6439   coding->consumed = SBYTES (str);
6440   coding->consumed_char = SCHARS (str);
6441   if (STRING_MULTIBYTE (str))
6442     {
6443       if (nocopy)
6444         /* We are sure that STR doesn't contain a multibyte
6445            character.  */
6446         STRING_SET_UNIBYTE (str);
6447       else
6448         {
6449           str = Fstring_as_unibyte (str);
6450           nocopy = 1;
6451         }
6452     }
6453   coding->produced = SBYTES (str);
6454   coding->produced_char = SCHARS (str);
6455   return (nocopy ? str : Fcopy_sequence (str));
6456 }
6457
6458 \f
6459 #ifdef emacs
6460 /*** 8. Emacs Lisp library functions ***/
6461
6462 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6463        doc: /* Return t if OBJECT is nil or a coding-system.
6464 See the documentation of `make-coding-system' for information
6465 about coding-system objects.  */)
6466      (obj)
6467      Lisp_Object obj;
6468 {
6469   if (NILP (obj))
6470     return Qt;
6471   if (!SYMBOLP (obj))
6472     return Qnil;
6473   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6474     return Qt;
6475   /* Get coding-spec vector for OBJ.  */
6476   obj = Fget (obj, Qcoding_system);
6477   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6478           ? Qt : Qnil);
6479 }
6480
6481 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6482        Sread_non_nil_coding_system, 1, 1, 0,
6483        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6484      (prompt)
6485      Lisp_Object prompt;
6486 {
6487   Lisp_Object val;
6488   do
6489     {
6490       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6491                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6492     }
6493   while (SCHARS (val) == 0);
6494   return (Fintern (val, Qnil));
6495 }
6496
6497 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6498        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6499 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6500      (prompt, default_coding_system)
6501      Lisp_Object prompt, default_coding_system;
6502 {
6503   Lisp_Object val;
6504   if (SYMBOLP (default_coding_system))
6505     default_coding_system = SYMBOL_NAME (default_coding_system);
6506   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6507                           Qt, Qnil, Qcoding_system_history,
6508                           default_coding_system, Qnil);
6509   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6510 }
6511
6512 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6513        1, 1, 0,
6514        doc: /* Check validity of CODING-SYSTEM.
6515 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6516 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6517 The value of this property should be a vector of length 5.  */)
6518      (coding_system)
6519      Lisp_Object coding_system;
6520 {
6521   Lisp_Object define_form;
6522
6523   define_form = Fget (coding_system, Qcoding_system_define_form);
6524   if (! NILP (define_form))
6525     {
6526       Fput (coding_system, Qcoding_system_define_form, Qnil);
6527       safe_eval (define_form);
6528     }
6529   if (!NILP (Fcoding_system_p (coding_system)))
6530     return coding_system;
6531   while (1)
6532     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6533 }
6534 \f
6535 Lisp_Object
6536 detect_coding_system (src, src_bytes, highest, multibytep)
6537      const unsigned char *src;
6538      int src_bytes, highest;
6539      int multibytep;
6540 {
6541   int coding_mask, eol_type;
6542   Lisp_Object val, tmp;
6543   int dummy;
6544
6545   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6546   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6547   if (eol_type == CODING_EOL_INCONSISTENT)
6548     eol_type = CODING_EOL_UNDECIDED;
6549
6550   if (!coding_mask)
6551     {
6552       val = Qundecided;
6553       if (eol_type != CODING_EOL_UNDECIDED)
6554         {
6555           Lisp_Object val2;
6556           val2 = Fget (Qundecided, Qeol_type);
6557           if (VECTORP (val2))
6558             val = XVECTOR (val2)->contents[eol_type];
6559         }
6560       return (highest ? val : Fcons (val, Qnil));
6561     }
6562
6563   /* At first, gather possible coding systems in VAL.  */
6564   val = Qnil;
6565   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6566     {
6567       Lisp_Object category_val, category_index;
6568
6569       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6570       category_val = Fsymbol_value (XCAR (tmp));
6571       if (!NILP (category_val)
6572           && NATNUMP (category_index)
6573           && (coding_mask & (1 << XFASTINT (category_index))))
6574         {
6575           val = Fcons (category_val, val);
6576           if (highest)
6577             break;
6578         }
6579     }
6580   if (!highest)
6581     val = Fnreverse (val);
6582
6583   /* Then, replace the elements with subsidiary coding systems.  */
6584   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6585     {
6586       if (eol_type != CODING_EOL_UNDECIDED
6587           && eol_type != CODING_EOL_INCONSISTENT)
6588         {
6589           Lisp_Object eol;
6590           eol = Fget (XCAR (tmp), Qeol_type);
6591           if (VECTORP (eol))
6592             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6593         }
6594     }
6595   return (highest ? XCAR (val) : val);
6596 }
6597
6598 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6599        2, 3, 0,
6600        doc: /* Detect how the byte sequence in the region is encoded.
6601 Return a list of possible coding systems used on decoding a byte
6602 sequence containing the bytes in the region between START and END when
6603 the coding system `undecided' is specified.  The list is ordered by
6604 priority decided in the current language environment.
6605
6606 If only ASCII characters are found, it returns a list of single element
6607 `undecided' or its subsidiary coding system according to a detected
6608 end-of-line format.
6609
6610 If optional argument HIGHEST is non-nil, return the coding system of
6611 highest priority.  */)
6612      (start, end, highest)
6613      Lisp_Object start, end, highest;
6614 {
6615   int from, to;
6616   int from_byte, to_byte;
6617   int include_anchor_byte = 0;
6618
6619   CHECK_NUMBER_COERCE_MARKER (start);
6620   CHECK_NUMBER_COERCE_MARKER (end);
6621
6622   validate_region (&start, &end);
6623   from = XINT (start), to = XINT (end);
6624   from_byte = CHAR_TO_BYTE (from);
6625   to_byte = CHAR_TO_BYTE (to);
6626
6627   if (from < GPT && to >= GPT)
6628     move_gap_both (to, to_byte);
6629   /* If we an anchor byte `\0' follows the region, we include it in
6630      the detecting source.  Then code detectors can handle the tailing
6631      byte sequence more accurately.
6632
6633      Fix me: This is not a perfect solution.  It is better that we
6634      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6635   */
6636   if (to == Z || (to == GPT && GAP_SIZE > 0))
6637     include_anchor_byte = 1;
6638   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6639                                to_byte - from_byte + include_anchor_byte,
6640                                !NILP (highest),
6641                                !NILP (current_buffer
6642                                       ->enable_multibyte_characters));
6643 }
6644
6645 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6646        1, 2, 0,
6647        doc: /* Detect how the byte sequence in STRING is encoded.
6648 Return a list of possible coding systems used on decoding a byte
6649 sequence containing the bytes in STRING when the coding system
6650 `undecided' is specified.  The list is ordered by priority decided in
6651 the current language environment.
6652
6653 If only ASCII characters are found, it returns a list of single element
6654 `undecided' or its subsidiary coding system according to a detected
6655 end-of-line format.
6656
6657 If optional argument HIGHEST is non-nil, return the coding system of
6658 highest priority.  */)
6659      (string, highest)
6660      Lisp_Object string, highest;
6661 {
6662   CHECK_STRING (string);
6663
6664   return detect_coding_system (SDATA (string),
6665                                /* "+ 1" is to include the anchor byte
6666                                   `\0'.  With this, code detectors can
6667                                   handle the tailing bytes more
6668                                   accurately.  */
6669                                SBYTES (string) + 1,
6670                                !NILP (highest),
6671                                STRING_MULTIBYTE (string));
6672 }
6673
6674 /*  Subroutine for Ffind_coding_systems_region_internal.
6675
6676     Return a list of coding systems that safely encode the multibyte
6677     text between P and PEND.  SAFE_CODINGS is an alist of candidate
6678     coding systems; candidates that cannot encode some character of
6679     the text are removed from it destructively.  Nil means no candidate.
6680
6681     WORK_TABLE is a char-table whose element for a character is set
6682     to t once that character has been checked.
6683
6684     If a non-ASCII single byte char is found, set
6685     *single_byte_char_found to 1.  */
6686
6687 static Lisp_Object
6688 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6689      unsigned char *p, *pend;
6690      Lisp_Object safe_codings, work_table;
6691      int *single_byte_char_found;
6692 {
6693   int c, len;
6694   Lisp_Object val, ch;
6695   Lisp_Object prev, tail;
6696
       /* With no candidate there is nothing to filter; only the scan
          for a non-ASCII single byte char at the end is performed.  */
6697   if (NILP (safe_codings))
6698     goto done_safe_codings;
6699   while (p < pend)
6700     {
6701       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6702       p += len;
6703       if (ASCII_BYTE_P (c))
6704         /* We can ignore ASCII characters here.  */
6705         continue;
6706       if (SINGLE_BYTE_CHAR_P (c))
6707         *single_byte_char_found = 1;
6708       /* Check the safe coding systems for C.  */
6709       ch = make_number (c);
6710       val = Faref (work_table, ch);
6711       if (EQ (val, Qt))
6712         /* This element was already checked.  Ignore it.  */
6713         continue;
6714       /* Remember that we checked this element.  */
6715       Faset (work_table, ch, Qt);
6716
           /* Walk the candidates.  PREV is the last element that was
              kept; it is used to unlink TAIL when TAIL is not the head
              of the list.  */
6717       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6718         {
               /* TRANSLATION_TABLE, HASH_TABLE and ACCEPT_LATIN_EXTRA are
                  set below only when ENCODABLE is zero, which is also the
                  only case in which they are read.  */
6719           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6720           int encodable;
6721
6722           elt = XCAR (tail);
6723           if (CONSP (XCDR (elt)))
6724             {
6725               /* This entry has this format now:
6726                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6727                           ACCEPT-LATIN-EXTRA ) */
6728               val = XCDR (elt);
6729               encodable = ! NILP (Faref (XCAR (val), ch));
6730               if (! encodable)
6731                 {
6732                   val = XCDR (val);
6733                   translation_table = XCAR (val);
6734                   hash_table = XCAR (XCDR (val));
6735                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6736                 }
6737             }
6738           else
6739             {
6740               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6741               encodable = ! NILP (Faref (XCDR (elt), ch));
6742               if (! encodable)
6743                 {
6744                   /* Transform the format to:
6745                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6746                        ACCEPT-LATIN-EXTRA )  */
6747                   val = Fget (XCAR (elt), Qcoding_system);
6748                   translation_table
6749                     = Fplist_get (AREF (val, 3),
6750                                   Qtranslation_table_for_encode);
6751                   if (SYMBOLP (translation_table))
6752                     translation_table = Fget (translation_table,
6753                                               Qtranslation_table);
6754                   hash_table
6755                     = (CHAR_TABLE_P (translation_table)
6756                        ? XCHAR_TABLE (translation_table)->extras[1]
6757                        : Qnil);
6758                   accept_latin_extra
6759                     = ((EQ (AREF (val, 0), make_number (2))
6760                         && VECTORP (AREF (val, 4)))
6761                        ? AREF (AREF (val, 4), 16)
6762                        : Qnil);
6763                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6764                                         translation_table, hash_table,
6765                                         accept_latin_extra));
6766                 }
6767             }
6768
               /* A character missing from SAFE-CHARS is still accepted if
                  the translation table or the hash table has an entry for
                  it, or if it is a single byte char, ACCEPT-LATIN-EXTRA is
                  non-nil and Vlatin_extra_code_table has a non-nil entry
                  for it.  */
6769           if (! encodable
6770               && ((CHAR_TABLE_P (translation_table)
6771                    && ! NILP (Faref (translation_table, ch)))
6772                   || (HASH_TABLE_P (hash_table)
6773                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6774                   || (SINGLE_BYTE_CHAR_P (c)
6775                       && ! NILP (accept_latin_extra)
6776                       && VECTORP (Vlatin_extra_code_table)
6777                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6778             encodable = 1;
6779           if (encodable)
6780             prev = tail;
6781           else
6782             {
6783               /* Exclude this coding system from SAFE_CODINGS.  */
6784               if (EQ (tail, safe_codings))
6785                 {
6786                   safe_codings = XCDR (safe_codings);
6787                   if (NILP (safe_codings))
6788                     goto done_safe_codings;
6789                 }
6790               else
6791                 XSETCDR (prev, XCDR (tail));
6792             }
6793         }
6794     }
6795
6796  done_safe_codings:
6797   /* If the above loop was terminated before P reaches PEND, it means
6798      SAFE_CODINGS was set to nil.  If we have not yet found a
6799      non-ASCII single-byte char, check it now.  */
6800   if (! *single_byte_char_found)
6801     while (p < pend)
6802       {
6803         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6804         p += len;
6805         if (! ASCII_BYTE_P (c)
6806             && SINGLE_BYTE_CHAR_P (c))
6807           {
6808             *single_byte_char_found = 1;
6809             break;
6810           }
6811       }
6812   return safe_codings;
6813 }
6814
6815 DEFUN ("find-coding-systems-region-internal",
6816        Ffind_coding_systems_region_internal,
6817        Sfind_coding_systems_region_internal, 2, 2, 0,
6818        doc: /* Internal use only.  */)
6819      (start, end)
6820      Lisp_Object start, end;
6821 {
6822   Lisp_Object work_table, safe_codings;
6823   int non_ascii_p = 0;
6824   int single_byte_char_found = 0;
6825   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6826
6827   if (STRINGP (start))
6828     {
6829       if (!STRING_MULTIBYTE (start))
6830         return Qt;
6831       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6832       p2 = p2end = p1end;
6833       if (SCHARS (start) != SBYTES (start))
6834         non_ascii_p = 1;
6835     }
6836   else
6837     {
6838       int from, to, stop;
6839
6840       CHECK_NUMBER_COERCE_MARKER (start);
6841       CHECK_NUMBER_COERCE_MARKER (end);
6842       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6843         args_out_of_range (start, end);
6844       if (NILP (current_buffer->enable_multibyte_characters))
6845         return Qt;
6846       from = CHAR_TO_BYTE (XINT (start));
6847       to = CHAR_TO_BYTE (XINT (end));
6848       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6849       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6850       if (stop == to)
6851         p2 = p2end = p1end;
6852       else
6853         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6854       if (XINT (end) - XINT (start) != to - from)
6855         non_ascii_p = 1;
6856     }
6857
6858   if (!non_ascii_p)
6859     {
6860       /* We are sure that the text contains no multibyte character.
6861          Check if it contains eight-bit-graphic.  */
6862       p = p1;
6863       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6864       if (p == p1end)
6865         {
6866           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6867           if (p == p2end)
6868             return Qt;
6869         }
6870     }
6871
6872   /* The text contains non-ASCII characters.  */
6873
6874   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6875   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6876
6877   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6878                                     &single_byte_char_found);
6879   if (p2 < p2end)
6880     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6881                                       &single_byte_char_found);
6882   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6883     safe_codings = Qt;
6884   else
6885     {
6886       /* Turn safe_codings to a list of coding systems... */
6887       Lisp_Object val;
6888
6889       if (single_byte_char_found)
6890         /* ... and append these for eight-bit chars.  */
6891         val = Fcons (Qraw_text,
6892                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6893       else
6894         /* ... and append generic coding systems.  */
6895         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6896
6897       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6898         val = Fcons (XCAR (XCAR (safe_codings)), val);
6899       safe_codings = val;
6900     }
6901
6902   return safe_codings;
6903 }
6904
6905
6906 /* Search from position POS for such characters that are unencodable
6907    accoding to SAFE_CHARS, and return a list of their positions.  P
6908    points where in the memory the character at POS exists.  Limit the
6909    search at PEND or when Nth unencodable characters are found.
6910
6911    If SAFE_CHARS is a char table, an element for an unencodable
6912    character is nil.
6913
6914    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6915
6916    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6917    eight-bit-graphic characters are unencodable.  */
6918
6919 static Lisp_Object
6920 unencodable_char_position (safe_chars, pos, p, pend, n)
6921      Lisp_Object safe_chars;
6922      int pos;
6923      unsigned char *p, *pend;
6924      int n;
6925 {
6926   Lisp_Object pos_list;
6927
6928   pos_list = Qnil;
6929   while (p < pend)
6930     {
6931       int len;
6932       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6933
6934       if (c >= 128
6935           && (CHAR_TABLE_P (safe_chars)
6936               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6937               : (NILP (safe_chars) || c < 256)))
6938         {
6939           pos_list = Fcons (make_number (pos), pos_list);
6940           if (--n <= 0)
6941             break;
6942         }
6943       pos++;
6944       p += len;
6945     }
6946   return Fnreverse (pos_list);
6947 }
6948
6949
6950 DEFUN ("unencodable-char-position", Funencodable_char_position,
6951        Sunencodable_char_position, 3, 5, 0,
6952        doc: /*
6953 Return position of first un-encodable character in a region.
6954 START and END specfiy the region and CODING-SYSTEM specifies the
6955 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6956
6957 If optional 4th argument COUNT is non-nil, it specifies at most how
6958 many un-encodable characters to search.  In this case, the value is a
6959 list of positions.
6960
6961 If optional 5th argument STRING is non-nil, it is a string to search
6962 for un-encodable characters.  In that case, START and END are indexes
6963 to the string.  */)
6964      (start, end, coding_system, count, string)
6965      Lisp_Object start, end, coding_system, count, string;
6966 {
6967   int n;
6968   Lisp_Object safe_chars;
6969   struct coding_system coding;
6970   Lisp_Object positions;
6971   int from, to;
6972   unsigned char *p, *pend;
6973
6974   if (NILP (string))
6975     {
6976       validate_region (&start, &end);
6977       from = XINT (start);
6978       to = XINT (end);
6979       if (NILP (current_buffer->enable_multibyte_characters))
6980         return Qnil;
6981       p = CHAR_POS_ADDR (from);
6982       if (to == GPT)
6983         pend = GPT_ADDR;
6984       else
6985         pend = CHAR_POS_ADDR (to);
6986     }
6987   else
6988     {
6989       CHECK_STRING (string);
6990       CHECK_NATNUM (start);
6991       CHECK_NATNUM (end);
6992       from = XINT (start);
6993       to = XINT (end);
6994       if (from > to
6995           || to > SCHARS (string))
6996         args_out_of_range_3 (string, start, end);
6997       if (! STRING_MULTIBYTE (string))
6998         return Qnil;
6999       p = SDATA (string) + string_char_to_byte (string, from);
7000       pend = SDATA (string) + string_char_to_byte (string, to);
7001     }
7002
7003   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7004
7005   if (NILP (count))
7006     n = 1;
7007   else
7008     {
7009       CHECK_NATNUM (count);
7010       n = XINT (count);
7011     }
7012
7013   if (coding.type == coding_type_no_conversion
7014       || coding.type == coding_type_raw_text)
7015     return Qnil;
7016
7017   if (coding.type == coding_type_undecided)
7018     safe_chars = Qnil;
7019   else
7020     safe_chars = coding_safe_chars (coding_system);
7021
7022   if (STRINGP (string)
7023       || from >= GPT || to <= GPT)
7024     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7025   else
7026     {
7027       Lisp_Object args[2];
7028
7029       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7030       n -= XINT (Flength (args[0]));
7031       if (n <= 0)
7032         positions = args[0];
7033       else
7034         {
7035           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7036                                                pend, n);
7037           positions = Fappend (2, args);
7038         }
7039     }
7040
7041   return  (NILP (count) ? Fcar (positions) : positions);
7042 }
7043
7044
7045 Lisp_Object
7046 code_convert_region1 (start, end, coding_system, encodep)
7047      Lisp_Object start, end, coding_system;
7048      int encodep;
7049 {
7050   struct coding_system coding;
7051   int from, to;
7052
7053   CHECK_NUMBER_COERCE_MARKER (start);
7054   CHECK_NUMBER_COERCE_MARKER (end);
7055   CHECK_SYMBOL (coding_system);
7056
7057   validate_region (&start, &end);
7058   from = XFASTINT (start);
7059   to = XFASTINT (end);
7060
7061   if (NILP (coding_system))
7062     return make_number (to - from);
7063
7064   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7065     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7066
7067   coding.mode |= CODING_MODE_LAST_BLOCK;
7068   coding.src_multibyte = coding.dst_multibyte
7069     = !NILP (current_buffer->enable_multibyte_characters);
7070   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7071                        &coding, encodep, 1);
7072   Vlast_coding_system_used = coding.symbol;
7073   return make_number (coding.produced_char);
7074 }
7075
7076 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7077        3, 3, "r\nzCoding system: ",
7078        doc: /* Decode the current region from the specified coding system.
7079 When called from a program, takes three arguments:
7080 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7081 This function sets `last-coding-system-used' to the precise coding system
7082 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7083 not fully specified.)
7084 It returns the length of the decoded text.  */)
7085      (start, end, coding_system)
7086      Lisp_Object start, end, coding_system;
7087 {
7088   return code_convert_region1 (start, end, coding_system, 0);
7089 }
7090
7091 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7092        3, 3, "r\nzCoding system: ",
7093        doc: /* Encode the current region into the specified coding system.
7094 When called from a program, takes three arguments:
7095 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7096 This function sets `last-coding-system-used' to the precise coding system
7097 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7098 not fully specified.)
7099 It returns the length of the encoded text.  */)
7100      (start, end, coding_system)
7101      Lisp_Object start, end, coding_system;
7102 {
7103   return code_convert_region1 (start, end, coding_system, 1);
7104 }
7105
7106 Lisp_Object
7107 code_convert_string1 (string, coding_system, nocopy, encodep)
7108      Lisp_Object string, coding_system, nocopy;
7109      int encodep;
7110 {
7111   struct coding_system coding;
7112
7113   CHECK_STRING (string);
7114   CHECK_SYMBOL (coding_system);
7115
7116   if (NILP (coding_system))
7117     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7118
7119   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7120     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7121
7122   coding.mode |= CODING_MODE_LAST_BLOCK;
7123   string = (encodep
7124             ? encode_coding_string (string, &coding, !NILP (nocopy))
7125             : decode_coding_string (string, &coding, !NILP (nocopy)));
7126   Vlast_coding_system_used = coding.symbol;
7127
7128   return string;
7129 }
7130
7131 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7132        2, 3, 0,
7133        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7134 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7135 if the decoding operation is trivial.
7136 This function sets `last-coding-system-used' to the precise coding system
7137 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7138 not fully specified.)  */)
7139      (string, coding_system, nocopy)
7140      Lisp_Object string, coding_system, nocopy;
7141 {
7142   return code_convert_string1 (string, coding_system, nocopy, 0);
7143 }
7144
7145 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7146        2, 3, 0,
7147        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7148 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7149 if the encoding operation is trivial.
7150 This function sets `last-coding-system-used' to the precise coding system
7151 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7152 not fully specified.)  */)
7153      (string, coding_system, nocopy)
7154      Lisp_Object string, coding_system, nocopy;
7155 {
7156   return code_convert_string1 (string, coding_system, nocopy, 1);
7157 }
7158
7159 /* Encode or decode STRING according to CODING_SYSTEM.
7160    Do not set Vlast_coding_system_used.
7161
7162    This function is called only from macros DECODE_FILE and
7163    ENCODE_FILE, thus we ignore character composition.  */
7164
7165 Lisp_Object
7166 code_convert_string_norecord (string, coding_system, encodep)
7167      Lisp_Object string, coding_system;
7168      int encodep;
7169 {
7170   struct coding_system coding;
7171
7172   CHECK_STRING (string);
7173   CHECK_SYMBOL (coding_system);
7174
7175   if (NILP (coding_system))
7176     return string;
7177
7178   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7179     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7180
7181   coding.composing = COMPOSITION_DISABLED;
7182   coding.mode |= CODING_MODE_LAST_BLOCK;
7183   return (encodep
7184           ? encode_coding_string (string, &coding, 1)
7185           : decode_coding_string (string, &coding, 1));
7186 }
7187 \f
7188 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7189        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7190 Return the corresponding character.  */)
7191      (code)
7192      Lisp_Object code;
7193 {
7194   unsigned char c1, c2, s1, s2;
7195   Lisp_Object val;
7196
7197   CHECK_NUMBER (code);
7198   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7199   if (s1 == 0)
7200     {
7201       if (s2 < 0x80)
7202         XSETFASTINT (val, s2);
7203       else if (s2 >= 0xA0 || s2 <= 0xDF)
7204         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7205       else
7206         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7207     }
7208   else
7209     {
7210       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7211           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7212         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7213       DECODE_SJIS (s1, s2, c1, c2);
7214       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7215     }
7216   return val;
7217 }
7218
7219 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7220        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7221 Return the corresponding code in SJIS.  */)
7222      (ch)
7223      Lisp_Object ch;
7224 {
7225   int charset, c1, c2, s1, s2;
7226   Lisp_Object val;
7227
7228   CHECK_NUMBER (ch);
7229   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7230   if (charset == CHARSET_ASCII)
7231     {
7232       val = ch;
7233     }
7234   else if (charset == charset_jisx0208
7235            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7236     {
7237       ENCODE_SJIS (c1, c2, s1, s2);
7238       XSETFASTINT (val, (s1 << 8) | s2);
7239     }
7240   else if (charset == charset_katakana_jisx0201
7241            && c1 > 0x20 && c2 < 0xE0)
7242     {
7243       XSETFASTINT (val, c1 | 0x80);
7244     }
7245   else
7246     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7247   return val;
7248 }
7249
7250 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7251        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7252 Return the corresponding character.  */)
7253      (code)
7254      Lisp_Object code;
7255 {
7256   int charset;
7257   unsigned char b1, b2, c1, c2;
7258   Lisp_Object val;
7259
7260   CHECK_NUMBER (code);
7261   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7262   if (b1 == 0)
7263     {
7264       if (b2 >= 0x80)
7265         error ("Invalid BIG5 code: %x", XFASTINT (code));
7266       val = code;
7267     }
7268   else
7269     {
7270       if ((b1 < 0xA1 || b1 > 0xFE)
7271           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7272         error ("Invalid BIG5 code: %x", XFASTINT (code));
7273       DECODE_BIG5 (b1, b2, charset, c1, c2);
7274       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7275     }
7276   return val;
7277 }
7278
7279 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7280        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7281 Return the corresponding character code in Big5.  */)
7282      (ch)
7283      Lisp_Object ch;
7284 {
7285   int charset, c1, c2, b1, b2;
7286   Lisp_Object val;
7287
7288   CHECK_NUMBER (ch);
7289   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7290   if (charset == CHARSET_ASCII)
7291     {
7292       val = ch;
7293     }
7294   else if ((charset == charset_big5_1
7295             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7296            || (charset == charset_big5_2
7297                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7298     {
7299       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7300       XSETFASTINT (val, (b1 << 8) | b2);
7301     }
7302   else
7303     error ("Can't encode to Big5: %d", XFASTINT (ch));
7304   return val;
7305 }
7306 \f
7307 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7308        Sset_terminal_coding_system_internal, 1, 2, 0,
7309        doc: /* Internal use only.  */)
7310      (coding_system, display)
7311      Lisp_Object coding_system;
7312      Lisp_Object display;
7313 {
7314   struct coding_system *terminal_coding = DISPLAY_TERMINAL_CODING (get_display (display, 1));
7315   CHECK_SYMBOL (coding_system);
7316   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
7317   /* We had better not send unsafe characters to terminal.  */
7318   terminal_coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7319   /* Character composition should be disabled.  */
7320   terminal_coding->composing = COMPOSITION_DISABLED;
7321   /* Error notification should be suppressed.  */
7322   terminal_coding->suppress_error = 1;
7323   terminal_coding->src_multibyte = 1;
7324   terminal_coding->dst_multibyte = 0;
7325   return Qnil;
7326 }
7327
7328 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7329        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7330        doc: /* Internal use only.  */)
7331      (coding_system)
7332      Lisp_Object coding_system;
7333 {
7334   CHECK_SYMBOL (coding_system);
7335   setup_coding_system (Fcheck_coding_system (coding_system),
7336                        &safe_terminal_coding);
7337   /* Character composition should be disabled.  */
7338   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7339   /* Error notification should be suppressed.  */
7340   safe_terminal_coding.suppress_error = 1;
7341   safe_terminal_coding.src_multibyte = 1;
7342   safe_terminal_coding.dst_multibyte = 0;
7343   return Qnil;
7344 }
7345
7346 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7347        Sterminal_coding_system, 0, 1, 0,
7348        doc: /* Return coding system specified for terminal output on the given display.
7349 DISPLAY may be a display id, a frame, or nil for the selected frame's display.  */)
7350      (display)
7351      Lisp_Object display;
7352 {
7353   return DISPLAY_TERMINAL_CODING (get_display (display, 1))->symbol;
7354 }
7355
7356 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7357        Sset_keyboard_coding_system_internal, 1, 2, 0,
7358        doc: /* Internal use only.  */)
7359      (coding_system, display)
7360      Lisp_Object coding_system;
7361      Lisp_Object display;
7362 {
7363   struct display *d = get_display (display, 1);
7364   CHECK_SYMBOL (coding_system);
7365
7366   setup_coding_system (Fcheck_coding_system (coding_system),
7367                        DISPLAY_KEYBOARD_CODING (d));
7368   /* Character composition should be disabled.  */
7369   DISPLAY_KEYBOARD_CODING (d)->composing = COMPOSITION_DISABLED;
7370   return Qnil;
7371 }
7372
7373 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7374        Skeyboard_coding_system, 0, 1, 0,
7375        doc: /* Return coding system specified for decoding keyboard input.  */)
7376      (display)
7377      Lisp_Object display;
7378 {
7379   return DISPLAY_KEYBOARD_CODING (get_display (display, 1))->symbol;
7380 }
7381
7382 \f
7383 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7384        Sfind_operation_coding_system,  1, MANY, 0,
7385        doc: /* Choose a coding system for an operation based on the target name.
7386 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7387 DECODING-SYSTEM is the coding system to use for decoding
7388 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7389 for encoding (in case OPERATION does encoding).
7390
7391 The first argument OPERATION specifies an I/O primitive:
7392   For file I/O, `insert-file-contents' or `write-region'.
7393   For process I/O, `call-process', `call-process-region', or `start-process'.
7394   For network I/O, `open-network-stream'.
7395
7396 The remaining arguments should be the same arguments that were passed
7397 to the primitive.  Depending on which primitive, one of those arguments
7398 is selected as the TARGET.  For example, if OPERATION does file I/O,
7399 whichever argument specifies the file name is TARGET.
7400
7401 TARGET has a meaning which depends on OPERATION:
7402   For file I/O, TARGET is a file name.
7403   For process I/O, TARGET is a process name.
7404   For network I/O, TARGET is a service name or a port number
7405
7406 This function looks up what specified for TARGET in,
7407 `file-coding-system-alist', `process-coding-system-alist',
7408 or `network-coding-system-alist' depending on OPERATION.
7409 They may specify a coding system, a cons of coding systems,
7410 or a function symbol to call.
7411 In the last case, we call the function with one argument,
7412 which is a list of all the arguments given to this function.
7413
7414 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7415      (nargs, args)
7416      int nargs;
7417      Lisp_Object *args;
7418 {
7419   Lisp_Object operation, target_idx, target, val;
7420   register Lisp_Object chain;
7421
7422   if (nargs < 2)
7423     error ("Too few arguments");
7424   operation = args[0];
7425   if (!SYMBOLP (operation)
7426       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7427     error ("Invalid first argument");
7428   if (nargs < 1 + XINT (target_idx))
7429     error ("Too few arguments for operation: %s",
7430            SDATA (SYMBOL_NAME (operation)));
7431   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7432      argument to write-region) is string, it must be treated as a
7433      target file name.  */
7434   if (EQ (operation, Qwrite_region)
7435       && nargs > 5
7436       && STRINGP (args[5]))
7437     target_idx = make_number (4);
7438   target = args[XINT (target_idx) + 1];
7439   if (!(STRINGP (target)
7440         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7441     error ("Invalid argument %d", XINT (target_idx) + 1);
7442
7443   chain = ((EQ (operation, Qinsert_file_contents)
7444             || EQ (operation, Qwrite_region))
7445            ? Vfile_coding_system_alist
7446            : (EQ (operation, Qopen_network_stream)
7447               ? Vnetwork_coding_system_alist
7448               : Vprocess_coding_system_alist));
7449   if (NILP (chain))
7450     return Qnil;
7451
7452   for (; CONSP (chain); chain = XCDR (chain))
7453     {
7454       Lisp_Object elt;
7455       elt = XCAR (chain);
7456
7457       if (CONSP (elt)
7458           && ((STRINGP (target)
7459                && STRINGP (XCAR (elt))
7460                && fast_string_match (XCAR (elt), target) >= 0)
7461               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7462         {
7463           val = XCDR (elt);
7464           /* Here, if VAL is both a valid coding system and a valid
7465              function symbol, we return VAL as a coding system.  */
7466           if (CONSP (val))
7467             return val;
7468           if (! SYMBOLP (val))
7469             return Qnil;
7470           if (! NILP (Fcoding_system_p (val)))
7471             return Fcons (val, val);
7472           if (! NILP (Ffboundp (val)))
7473             {
7474               val = call1 (val, Flist (nargs, args));
7475               if (CONSP (val))
7476                 return val;
7477               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7478                 return Fcons (val, val);
7479             }
7480           return Qnil;
7481         }
7482     }
7483   return Qnil;
7484 }
7485
7486 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7487        Supdate_coding_systems_internal, 0, 0, 0,
7488        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7489 When values of any coding categories are changed, you must
7490 call this function.  */)
7491      ()
7492 {
7493   int i;
7494
7495   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7496     {
7497       Lisp_Object val;
7498
7499       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7500       if (!NILP (val))
7501         {
7502           if (! coding_system_table[i])
7503             coding_system_table[i] = ((struct coding_system *)
7504                                       xmalloc (sizeof (struct coding_system)));
7505           setup_coding_system (val, coding_system_table[i]);
7506         }
7507       else if (coding_system_table[i])
7508         {
7509           xfree (coding_system_table[i]);
7510           coding_system_table[i] = NULL;
7511         }
7512     }
7513
7514   return Qnil;
7515 }
7516
7517 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7518        Sset_coding_priority_internal, 0, 0, 0,
7519        doc: /* Update internal database for the current value of `coding-category-list'.
7520 This function is internal use only.  */)
7521      ()
7522 {
7523   int i = 0, idx;
7524   Lisp_Object val;
7525
7526   val = Vcoding_category_list;
7527
7528   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7529     {
7530       if (! SYMBOLP (XCAR (val)))
7531         break;
7532       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7533       if (idx >= CODING_CATEGORY_IDX_MAX)
7534         break;
7535       coding_priorities[i++] = (1 << idx);
7536       val = XCDR (val);
7537     }
7538   /* If coding-category-list is valid and contains all coding
7539      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7540      the following code saves Emacs from crashing.  */
7541   while (i < CODING_CATEGORY_IDX_MAX)
7542     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7543
7544   return Qnil;
7545 }
7546
7547 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7548        Sdefine_coding_system_internal, 1, 1, 0,
7549        doc: /* Register CODING-SYSTEM as a base coding system.
7550 This function is internal use only.  */)
7551      (coding_system)
7552      Lisp_Object coding_system;
7553 {
7554   Lisp_Object safe_chars, slot;
7555
7556   if (NILP (Fcheck_coding_system (coding_system)))
7557     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7558   safe_chars = coding_safe_chars (coding_system);
7559   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7560     error ("No valid safe-chars property for %s",
7561            SDATA (SYMBOL_NAME (coding_system)));
7562   if (EQ (safe_chars, Qt))
7563     {
7564       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7565         XSETCAR (Vcoding_system_safe_chars,
7566                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7567     }
7568   else
7569     {
7570       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7571       if (NILP (slot))
7572         XSETCDR (Vcoding_system_safe_chars,
7573                  nconc2 (XCDR (Vcoding_system_safe_chars),
7574                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7575       else
7576         XSETCDR (slot, safe_chars);
7577     }
7578   return Qnil;
7579 }
7580
7581 #endif /* emacs */
7582
7583 \f
7584 /*** 9. Post-amble ***/
7585
7586 void
7587 init_coding_once ()
7588 {
7589   int i;
7590
7591   /* Emacs' internal format specific initialize routine.  */
7592   for (i = 0; i <= 0x20; i++)
7593     emacs_code_class[i] = EMACS_control_code;
7594   emacs_code_class[0x0A] = EMACS_linefeed_code;
7595   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7596   for (i = 0x21 ; i < 0x7F; i++)
7597     emacs_code_class[i] = EMACS_ascii_code;
7598   emacs_code_class[0x7F] = EMACS_control_code;
7599   for (i = 0x80; i < 0xFF; i++)
7600     emacs_code_class[i] = EMACS_invalid_code;
7601   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7602   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7603   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7604   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7605
7606   /* ISO2022 specific initialize routine.  */
7607   for (i = 0; i < 0x20; i++)
7608     iso_code_class[i] = ISO_control_0;
7609   for (i = 0x21; i < 0x7F; i++)
7610     iso_code_class[i] = ISO_graphic_plane_0;
7611   for (i = 0x80; i < 0xA0; i++)
7612     iso_code_class[i] = ISO_control_1;
7613   for (i = 0xA1; i < 0xFF; i++)
7614     iso_code_class[i] = ISO_graphic_plane_1;
7615   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7616   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7617   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7618   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7619   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7620   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7621   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7622   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7623   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7624   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7625
7626   setup_coding_system (Qnil, &safe_terminal_coding);
7627   setup_coding_system (Qnil, &default_buffer_file_coding);
7628
7629   bzero (coding_system_table, sizeof coding_system_table);
7630
7631   bzero (ascii_skip_code, sizeof ascii_skip_code);
7632   for (i = 0; i < 128; i++)
7633     ascii_skip_code[i] = 1;
7634
7635 #if defined (MSDOS) || defined (WINDOWSNT)
7636   system_eol_type = CODING_EOL_CRLF;
7637 #else
7638   system_eol_type = CODING_EOL_LF;
7639 #endif
7640
7641   inhibit_pre_post_conversion = 0;
7642 }
7643
7644 #ifdef emacs
7645
/* Set up the Lisp side of this file: intern and staticpro the symbols
   used here, put `target-idx' properties on the symbols of the file,
   process and network operations, build Vcoding_category_table,
   register the subrs with defsubr, and declare the Lisp variables
   (DEFVAR_LISP / DEFVAR_BOOL) together with their initial values.  */
7646 void
7647 syms_of_coding ()
7648 {
7649   staticpro (&Vcode_conversion_workbuf_name);
7650   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7651
7652   Qtarget_idx = intern ("target-idx");
7653   staticpro (&Qtarget_idx);
7654
7655   Qcoding_system_history = intern ("coding-system-history");
7656   staticpro (&Qcoding_system_history);
7657   Fset (Qcoding_system_history, Qnil);
7658
       /* Each `target-idx' property set below is the zero-based index
          of the argument named in the accompanying comment.
          Qinsert_file_contents and Qwrite_region are not interned in
          this function; presumably they are set up elsewhere.  */
7659   /* Target FILENAME is the first argument.  */
7660   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7661   /* Target FILENAME is the third argument.  */
7662   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7663
7664   Qcall_process = intern ("call-process");
7665   staticpro (&Qcall_process);
7666   /* Target PROGRAM is the first argument.  */
7667   Fput (Qcall_process, Qtarget_idx, make_number (0));
7668
7669   Qcall_process_region = intern ("call-process-region");
7670   staticpro (&Qcall_process_region);
7671   /* Target PROGRAM is the third argument.  */
7672   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7673
7674   Qstart_process = intern ("start-process");
7675   staticpro (&Qstart_process);
7676   /* Target PROGRAM is the third argument.  */
7677   Fput (Qstart_process, Qtarget_idx, make_number (2));
7678
7679   Qopen_network_stream = intern ("open-network-stream");
7680   staticpro (&Qopen_network_stream);
7681   /* Target SERVICE is the fourth argument.  */
7682   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7683
7684   Qcoding_system = intern ("coding-system");
7685   staticpro (&Qcoding_system);
7686
7687   Qeol_type = intern ("eol-type");
7688   staticpro (&Qeol_type);
7689
7690   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7691   staticpro (&Qbuffer_file_coding_system);
7692
7693   Qpost_read_conversion = intern ("post-read-conversion");
7694   staticpro (&Qpost_read_conversion);
7695
7696   Qpre_write_conversion = intern ("pre-write-conversion");
7697   staticpro (&Qpre_write_conversion);
7698
7699   Qno_conversion = intern ("no-conversion");
7700   staticpro (&Qno_conversion);
7701
7702   Qundecided = intern ("undecided");
7703   staticpro (&Qundecided);
7704
7705   Qcoding_system_p = intern ("coding-system-p");
7706   staticpro (&Qcoding_system_p);
7707
7708   Qcoding_system_error = intern ("coding-system-error");
7709   staticpro (&Qcoding_system_error);
7710
       /* Give `coding-system-error' the condition list
          (coding-system-error error) and its message text.  */
7711   Fput (Qcoding_system_error, Qerror_conditions,
7712         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7713   Fput (Qcoding_system_error, Qerror_message,
7714         build_string ("Invalid coding system"));
7715
7716   Qcoding_category = intern ("coding-category");
7717   staticpro (&Qcoding_category);
7718   Qcoding_category_index = intern ("coding-category-index");
7719   staticpro (&Qcoding_category_index);
7720
       /* Element I of the table is the symbol named
          coding_category_name[I]; that symbol's
          `coding-category-index' property is set to I.  */
7721   Vcoding_category_table
7722     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7723   staticpro (&Vcoding_category_table);
7724   {
7725     int i;
7726     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7727       {
7728         XVECTOR (Vcoding_category_table)->contents[i]
7729           = intern (coding_category_name[i]);
7730         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7731               Qcoding_category_index, make_number (i));
7732       }
7733   }
7734
7735   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7736   staticpro (&Vcoding_system_safe_chars);
7737
7738   Qtranslation_table = intern ("translation-table");
7739   staticpro (&Qtranslation_table);
       /* NOTE(review): Qchar_table_extra_slots is used here, before the
          "intern this now in case it isn't already done" assignment
          further down, so this line presumably relies on the symbol
          having been interned elsewhere already -- confirm.  */
7740   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7741
7742   Qtranslation_table_id = intern ("translation-table-id");
7743   staticpro (&Qtranslation_table_id);
7744
7745   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7746   staticpro (&Qtranslation_table_for_decode);
7747
7748   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7749   staticpro (&Qtranslation_table_for_encode);
7750
7751   Qsafe_chars = intern ("safe-chars");
7752   staticpro (&Qsafe_chars);
7753
7754   Qchar_coding_system = intern ("char-coding-system");
7755   staticpro (&Qchar_coding_system);
7756
7757   /* Intern this now in case it isn't already done.
7758      Setting this variable twice is harmless.
7759      But don't staticpro it here--that is done in alloc.c.  */
7760   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7761   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7762   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7763
7764   Qvalid_codes = intern ("valid-codes");
7765   staticpro (&Qvalid_codes);
7766
7767   Qemacs_mule = intern ("emacs-mule");
7768   staticpro (&Qemacs_mule);
7769
7770   Qraw_text = intern ("raw-text");
7771   staticpro (&Qraw_text);
7772
7773   Qutf_8 = intern ("utf-8");
7774   staticpro (&Qutf_8);
7775
7776   Qcoding_system_define_form = intern ("coding-system-define-form");
7777   staticpro (&Qcoding_system_define_form);
7778
       /* Register the subr objects (the S... variables) with defsubr.  */
7779   defsubr (&Scoding_system_p);
7780   defsubr (&Sread_coding_system);
7781   defsubr (&Sread_non_nil_coding_system);
7782   defsubr (&Scheck_coding_system);
7783   defsubr (&Sdetect_coding_region);
7784   defsubr (&Sdetect_coding_string);
7785   defsubr (&Sfind_coding_systems_region_internal);
7786   defsubr (&Sunencodable_char_position);
7787   defsubr (&Sdecode_coding_region);
7788   defsubr (&Sencode_coding_region);
7789   defsubr (&Sdecode_coding_string);
7790   defsubr (&Sencode_coding_string);
7791   defsubr (&Sdecode_sjis_char);
7792   defsubr (&Sencode_sjis_char);
7793   defsubr (&Sdecode_big5_char);
7794   defsubr (&Sencode_big5_char);
7795   defsubr (&Sset_terminal_coding_system_internal);
7796   defsubr (&Sset_safe_terminal_coding_system_internal);
7797   defsubr (&Sterminal_coding_system);
7798   defsubr (&Sset_keyboard_coding_system_internal);
7799   defsubr (&Skeyboard_coding_system);
7800   defsubr (&Sfind_operation_coding_system);
7801   defsubr (&Supdate_coding_systems_internal);
7802   defsubr (&Sset_coding_priority_internal);
7803   defsubr (&Sdefine_coding_system_internal);
7804
7805   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7806                doc: /* List of coding systems.
7807
7808 Do not alter the value of this variable manually.  This variable should be
7809 updated by the functions `make-coding-system' and
7810 `define-coding-system-alias'.  */);
7811   Vcoding_system_list = Qnil;
7812
7813   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7814                doc: /* Alist of coding system names.
7815 Each element is a one-element list holding a coding system name.
7816 This variable is given to `completing-read' as TABLE argument.
7817
7818 Do not alter the value of this variable manually.  This variable should be
7819 updated by the functions `make-coding-system' and
7820 `define-coding-system-alias'.  */);
7821   Vcoding_system_alist = Qnil;
7822
7823   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7824                doc: /* List of coding-categories (symbols) ordered by priority.
7825
7826 On detecting a coding system, Emacs tries code detection algorithms
7827 associated with each coding-category one by one in this order.  When
7828 one algorithm agrees with a byte sequence of source text, the coding
7829 system bound to the corresponding coding-category is selected.
7830
7831 Don't modify this variable directly, but use `set-coding-priority'.  */);
7832   {
7833     int i;
7834
         /* Cons from the highest index down, so the resulting list
            holds the category symbols in ascending index order.  */
7835     Vcoding_category_list = Qnil;
7836     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7837       Vcoding_category_list
7838         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7839                  Vcoding_category_list);
7840   }
7841
7842   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7843                doc: /* Specify the coding system for read operations.
7844 It is useful to bind this variable with `let', but do not set it globally.
7845 If the value is a coding system, it is used for decoding on read operation.
7846 If not, an appropriate element is used from one of the coding system alists:
7847 There are three such tables, `file-coding-system-alist',
7848 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7849   Vcoding_system_for_read = Qnil;
7850
7851   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7852                doc: /* Specify the coding system for write operations.
7853 Programs bind this variable with `let', but you should not set it globally.
7854 If the value is a coding system, it is used for encoding of output,
7855 when writing it to a file and when sending it to a file or subprocess.
7856
7857 If this does not specify a coding system, an appropriate element
7858 is used from one of the coding system alists:
7859 There are three such tables, `file-coding-system-alist',
7860 `process-coding-system-alist', and `network-coding-system-alist'.
7861 For output to files, if the above procedure does not specify a coding system,
7862 the value of `buffer-file-coding-system' is used.  */);
7863   Vcoding_system_for_write = Qnil;
7864
7865   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7866                doc: /* Coding system used in the latest file or process I/O.
7867 Also set by `encode-coding-region', `decode-coding-region',
7868 `encode-coding-string' and `decode-coding-string'.  */);
7869   Vlast_coding_system_used = Qnil;
7870
7871   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7872                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7873 See info node `Coding Systems' and info node `Text and Binary' concerning
7874 such conversion.  */);
7875   inhibit_eol_conversion = 0;
7876
7877   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7878                doc: /* Non-nil means process buffer inherits coding system of process output.
7879 Bind it to t if the process output is to be treated as if it were a file
7880 read from some filesystem.  */);
7881   inherit_process_coding_system = 0;
7882
7883   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7884                doc: /* Alist to decide a coding system to use for a file I/O operation.
7885 The format is ((PATTERN . VAL) ...),
7886 where PATTERN is a regular expression matching a file name,
7887 VAL is a coding system, a cons of coding systems, or a function symbol.
7888 If VAL is a coding system, it is used for both decoding and encoding
7889 the file contents.
7890 If VAL is a cons of coding systems, the car part is used for decoding,
7891 and the cdr part is used for encoding.
7892 If VAL is a function symbol, the function must return a coding system
7893 or a cons of coding systems which are used as above.  The function gets
7894 the arguments with which `find-operation-coding-system' was called.
7895
7896 See also the function `find-operation-coding-system'
7897 and the variable `auto-coding-alist'.  */);
7898   Vfile_coding_system_alist = Qnil;
7899
7900   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7901     doc: /* Alist to decide a coding system to use for a process I/O operation.
7902 The format is ((PATTERN . VAL) ...),
7903 where PATTERN is a regular expression matching a program name,
7904 VAL is a coding system, a cons of coding systems, or a function symbol.
7905 If VAL is a coding system, it is used for both decoding what is received
7906 from the program and encoding what is sent to the program.
7907 If VAL is a cons of coding systems, the car part is used for decoding,
7908 and the cdr part is used for encoding.
7909 If VAL is a function symbol, the function must return a coding system
7910 or a cons of coding systems which are used as above.
7911
7912 See also the function `find-operation-coding-system'.  */);
7913   Vprocess_coding_system_alist = Qnil;
7914
7915   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7916     doc: /* Alist to decide a coding system to use for a network I/O operation.
7917 The format is ((PATTERN . VAL) ...),
7918 where PATTERN is a regular expression matching a network service name
7919 or is a port number to connect to,
7920 VAL is a coding system, a cons of coding systems, or a function symbol.
7921 If VAL is a coding system, it is used for both decoding what is received
7922 from the network stream and encoding what is sent to the network stream.
7923 If VAL is a cons of coding systems, the car part is used for decoding,
7924 and the cdr part is used for encoding.
7925 If VAL is a function symbol, the function must return a coding system
7926 or a cons of coding systems which are used as above.
7927
7928 See also the function `find-operation-coding-system'.  */);
7929   Vnetwork_coding_system_alist = Qnil;
7930
7931   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7932                doc: /* Coding system to use with system messages.
7933 Also used for decoding keyboard input on X Window system.  */);
7934   Vlocale_coding_system = Qnil;
7935
7936   /* The eol mnemonics are reset in startup.el system-dependently.  */
7937   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7938                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7939   eol_mnemonic_unix = build_string (":");
7940
7941   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7942                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7943   eol_mnemonic_dos = build_string ("\\");
7944
7945   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7946                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7947   eol_mnemonic_mac = build_string ("/");
7948
7949   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7950                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7951   eol_mnemonic_undecided = build_string (":");
7952
7953   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7954                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7955   Venable_character_translation = Qt;
7956
7957   DEFVAR_LISP ("standard-translation-table-for-decode",
7958                &Vstandard_translation_table_for_decode,
7959                doc: /* Table for translating characters while decoding.  */);
7960   Vstandard_translation_table_for_decode = Qnil;
7961
7962   DEFVAR_LISP ("standard-translation-table-for-encode",
7963                &Vstandard_translation_table_for_encode,
7964                doc: /* Table for translating characters while encoding.  */);
7965   Vstandard_translation_table_for_encode = Qnil;
7966
7967   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7968                doc: /* Alist of charsets vs revision numbers.
7969 While encoding, if a charset (car part of an element) is found,
7970 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7971   Vcharset_revision_alist = Qnil;
7972
7973   DEFVAR_LISP ("default-process-coding-system",
7974                &Vdefault_process_coding_system,
7975                doc: /* Cons of coding systems used for process I/O by default.
7976 The car part is used for decoding a process output,
7977 the cdr part is used for encoding a text to be sent to a process.  */);
7978   Vdefault_process_coding_system = Qnil;
7979
7980   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7981                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7982 This is a vector of length 256.
7983 If Nth element is non-nil, the existence of code N in a file
7984 \(or output of subprocess) doesn't prevent it from being detected as
7985 a coding system of ISO 2022 variant which has a flag
7986 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7987 or reading output of a subprocess.
7988 Only the 128th through 159th elements have a meaning.  */);
7989   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7990
7991   DEFVAR_LISP ("select-safe-coding-system-function",
7992                &Vselect_safe_coding_system_function,
7993                doc: /* Function to call to select safe coding system for encoding a text.
7994
7995 If set, this function is called to force a user to select a proper
7996 coding system which can encode the text in the case that a default
7997 coding system used in each operation can't encode the text.
7998
7999 The default value is `select-safe-coding-system' (which see).  */);
8000   Vselect_safe_coding_system_function = Qnil;
8001
8002   DEFVAR_BOOL ("coding-system-require-warning",
8003                &coding_system_require_warning,
8004                doc: /* Internal use only.
8005 If non-nil, on writing a file, `select-safe-coding-system-function' is
8006 called even if `coding-system-for-write' is non-nil.  The command
8007 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8008   coding_system_require_warning = 0;
8009
8010
8011   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8012                &inhibit_iso_escape_detection,
8013                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8014
8015 By default, on reading a file, Emacs tries to detect how the text is
8016 encoded.  This code detection is sensitive to escape sequences.  If
8017 the sequence is valid as ISO2022, the code is determined as one of
8018 the ISO2022 encodings, and the file is decoded by the corresponding
8019 coding system (e.g. `iso-2022-7bit').
8020
8021 However, there may be a case that you want to read escape sequences in
8022 a file as is.  In such a case, you can set this variable to non-nil.
8023 Then, as the code detection ignores any escape sequences, no file is
8024 detected as encoded in some ISO2022 encoding.  The result is that all
8025 escape sequences become visible in a buffer.
8026
8027 The default value is nil, and it is strongly recommended not to change
8028 it.  That is because many Emacs Lisp source files that contain
8029 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8030 in Emacs's distribution, and they won't be decoded correctly on
8031 reading if you suppress escape sequence detection.
8032
8033 The other way to read escape sequences in a file without decoding is
8034 to explicitly specify some coding system that doesn't use ISO2022's
8035 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
8036   inhibit_iso_escape_detection = 0;
8037
8038   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8039                doc: /* Char table for translating self-inserting characters.
8040 This is applied to the result of input methods, not their input.  See also
8041 `keyboard-translate-table'.  */);
8042     Vtranslation_table_for_input = Qnil;
8043 }
8044
/* Return the message text for the system error code ERROR_NUMBER.

   synchronize_system_messages_locale is called first and the text is
   then obtained from strerror.  If Vlocale_coding_system is non-nil,
   that text is passed through code_convert_string_norecord with the
   locale coding system, and the value returned is the data of the
   resulting Lisp string; otherwise it is the pointer strerror gave.

   NOTE(review): in the converted case the result points into a Lisp
   string that nothing in this function keeps a reference to, so
   callers presumably have to use or copy it right away -- confirm.  */
8045 char *
8046 emacs_strerror (error_number)
8047      int error_number;
8048 {
8049   char *str;
8050
8051   synchronize_system_messages_locale ();
8052   str = strerror (error_number);
8053
8054   if (! NILP (Vlocale_coding_system))
8055     {
           /* The final 0 presumably selects decoding rather than
              encoding (the result is named `dec') -- TODO confirm
              against code_convert_string_norecord.  */
8056       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8057                                                       Vlocale_coding_system,
8058                                                       0);
8059       str = (char *) SDATA (dec);
8060     }
8061
8062   return str;
8063 }
8064
8065 #endif /* emacs */
8066
8067 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8068    (do not change this comment) */