src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   Coding system is an encoding mechanism of one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
  45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
 113   which appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  */
 116 #if 0
 117 int
 118 detect_coding_emacs_mule (src, src_end)
 119      unsigned char *src, *src_end;
 120 {
 121   ...
 122 }
 123 #endif
 124
 125 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 126
 127   These functions decode SRC_BYTES length of unibyte text at SOURCE
 128   encoded in CODING to Emacs' internal format.  The resulting
 129   multibyte text goes to a place pointed to by DESTINATION, the length
 130   of which should not exceed DST_BYTES.
 131
 132   These functions set the information of original and decoded texts in
 133   the members produced, produced_char, consumed, and consumed_char of
 134   the structure *CODING.  They also set the member result to one of
 135   CODING_FINISH_XXX indicating how the decoding finished.
 136
 137   DST_BYTES zero means that source area and destination area are
 138   overlapped, which means that we can produce a decoded text until it
 139   reaches the head of not-yet-decoded source text.
 140
 141   Below is a template of these functions.  */
 142 #if 0
 143 static void
 144 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148 {
 149   ...
 150 }
 151 #endif
 152
 153 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 154
 155   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 156   internal multibyte format to CODING.  The resulting unibyte text
 157   goes to a place pointed to by DESTINATION, the length of which
 158   should not exceed DST_BYTES.
 159
 160   These functions set the information of original and encoded texts in
 161   the members produced, produced_char, consumed, and consumed_char of
 162   the structure *CODING.  They also set the member result to one of
 163   CODING_FINISH_XXX indicating how the encoding finished.
 164
 165   DST_BYTES zero means that source area and destination area are
 166   overlapped, which means that we can produce an encoded text until it
 167   reaches the head of not-yet-encoded source text.
 168
 169   Below is a template of these functions.  */
 170 #if 0
 171 static void
 172 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 173      struct coding_system *coding;
 174      unsigned char *source, *destination;
 175      int src_bytes, dst_bytes;
 176 {
 177   ...
 178 }
 179 #endif
 180
 181 /*** COMMONLY USED MACROS ***/
 182
 183 /* The two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely get one
 184    and two bytes from the source text respectively.  If there are
 185    not enough bytes, they consume nothing, set coding->result to
 186    CODING_FINISH_INSUFFICIENT_SRC, and jump to `label_end_of_loop'.
 187    The caller should set variables `coding', `src' and `src_end' in
 188    advance.  These macros are called from decoding routines
 189    `decode_coding_XXX', thus the source text is assumed unibyte.  */
 190
 191 #define ONE_MORE_BYTE(c1)                                       \
 192   do {                                                          \
 193     if (src >= src_end)                                         \
 194       {                                                         \
 195         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 196         goto label_end_of_loop;                                 \
 197       }                                                         \
 198     c1 = *src++;                                                \
 199   } while (0)
 200
 201 #define TWO_MORE_BYTES(c1, c2)                                  \
 202   do {                                                          \
 203     if (src + 1 >= src_end)                                     \
 204       {                                                         \
 205         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 206         goto label_end_of_loop;                                 \
 207       }                                                         \
 208     c1 = *src++;                                                \
 209     c2 = *src++;                                                \
 210   } while (0)
 211
 212
 213 /* Set C to the next character at the source text pointed by `src',
 214    translated by `translation_table' unless that is nil.  If there
 215    are no more bytes in the source, jump to `label_end_of_loop'.
 216    The caller should set variables `coding', `src', `src_end', and
 217    `translation_table' in advance.  This macro is used in encoding
 218    routines `encode_coding_XXX', thus it assumes that the source text
 219    is in multibyte form except for 8-bit characters.  8-bit characters
 220    are in multibyte form if coding->src_multibyte is nonzero, else
 221    they are represented by a single byte.  */
 222
 223 #define ONE_MORE_CHAR(c)                                        \
 224   do {                                                          \
 225     int len = src_end - src;                                    \
 226     int bytes;                                                  \
 227     if (len <= 0)                                               \
 228       {                                                         \
 229         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 230         goto label_end_of_loop;                                 \
 231       }                                                         \
 232     if (coding->src_multibyte                                   \
 233         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 234       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 235     else                                                        \
 236       c = *src, bytes = 1;                                      \
 237     if (!NILP (translation_table))                              \
 238       c = translate_char (translation_table, c, 0, 0, 0);       \
 239     src += bytes;                                               \
 240   } while (0)
 241
 242
 243 /* Produce a multibyte form of character C to `dst'.  Jump to
 244    `label_end_of_loop' if there's not enough space at `dst'.
 245
 246    If we are now in the middle of a composition sequence, the decoded
 247    character may be ALTCHAR (for the current composition).  In that
 248    case, the character goes to coding->cmp_data->data instead of
 249    `dst'.
 250
 251    This macro is used in decoding routines.  */
 252
 253 #define EMIT_CHAR(c)                                                    \
 254   do {                                                                  \
 255     if (! COMPOSING_P (coding)                                          \
 256         || coding->composing == COMPOSITION_RELATIVE                    \
 257         || coding->composing == COMPOSITION_WITH_RULE)                  \
 258       {                                                                 \
 259         int bytes = CHAR_BYTES (c);                                     \
 260         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 261           {                                                             \
 262             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 263             goto label_end_of_loop;                                     \
 264           }                                                             \
 265         dst += CHAR_STRING (c, dst);                                    \
 266         coding->produced_char++;                                        \
 267       }                                                                 \
 268                                                                         \
 269     if (COMPOSING_P (coding)                                            \
 270         && coding->composing != COMPOSITION_RELATIVE)                   \
 271       {                                                                 \
 272         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 273         coding->composition_rule_follows                                \
 274           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 275       }                                                                 \
 276   } while (0)
 277
 278 /* Store byte C at `dst'; jump to `label_end_of_loop' if there's no room.  */
 279 #define EMIT_ONE_BYTE(c)                                        \
 280   do {                                                          \
 281     if (dst >= (dst_bytes ? dst_end : src))                     \
 282       {                                                         \
 283         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 284         goto label_end_of_loop;                                 \
 285       }                                                         \
 286     *dst++ = c;                                                 \
 287   } while (0)
 288 /* Store bytes C1 and C2 at `dst', or neither if both don't fit.  */
 289 #define EMIT_TWO_BYTES(c1, c2)                                  \
 290   do {                                                          \
 291     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 292       {                                                         \
 293         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 294         goto label_end_of_loop;                                 \
 295       }                                                         \
 296     *dst++ = c1, *dst++ = c2;                                   \
 297   } while (0)
 298 /* Copy bytes FROM up to (not including) TO to `dst', advancing FROM.  */
 299 #define EMIT_BYTES(from, to)                                    \
 300   do {                                                          \
 301     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     while (from < to)                                           \
 307       *dst++ = *from++;                                         \
 308   } while (0)
 309
 310 \f
 311 /*** 1. Preamble ***/
 312
 313 #ifdef emacs
 314 #include <config.h>
 315 #endif
 316
 317 #include <stdio.h>
 318
 319 #ifdef emacs
 320
 321 #include "lisp.h"
 322 #include "buffer.h"
 323 #include "charset.h"
 324 #include "composite.h"
 325 #include "ccl.h"
 326 #include "coding.h"
 327 #include "window.h"
 328
 329 #else  /* not emacs */
 330
 331 #include "mulelib.h"
 332
 333 #endif /* not emacs */
 334
 335 Lisp_Object Qcoding_system, Qeol_type;
 336 Lisp_Object Qbuffer_file_coding_system;
 337 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 338 Lisp_Object Qno_conversion, Qundecided;
 339 Lisp_Object Qcoding_system_history;
 340 Lisp_Object Qsafe_charsets;
 341 Lisp_Object Qvalid_codes;
 342
 343 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 344 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 345 Lisp_Object Qstart_process, Qopen_network_stream;
 346 Lisp_Object Qtarget_idx;
 347
 348 Lisp_Object Vselect_safe_coding_system_function;
 349
 350 /* Mnemonic strings for the Unix, DOS, and Mac end-of-line formats.  */
 351 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 352 /* Mnemonic string to indicate that the format of end-of-line is not
 353    yet decided.  */
 354 Lisp_Object eol_mnemonic_undecided;
 355
 356 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 357    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 358 int system_eol_type;
 359
 360 #ifdef emacs
 361
 362 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 363
 364 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 365
 366 /* Coding systems emacs-mule and raw-text are for converting only
 367    end-of-line format.  */
 368 Lisp_Object Qemacs_mule, Qraw_text;
 369
 370 /* Coding-systems are handed between Emacs Lisp programs and C internal
 371    routines by the following three variables.  */
 372 /* Coding-system for reading files and receiving data from process.  */
 373 Lisp_Object Vcoding_system_for_read;
 374 /* Coding-system for writing files and sending data to process.  */
 375 Lisp_Object Vcoding_system_for_write;
 376 /* Coding-system actually used in the latest I/O.  */
 377 Lisp_Object Vlast_coding_system_used;
 378
 379 /* A vector of length 256 which contains information about special
 380    Latin codes (especially for dealing with Microsoft codes).  */
 381 Lisp_Object Vlatin_extra_code_table;
 382
 383 /* Flag to inhibit code conversion of end-of-line format.  */
 384 int inhibit_eol_conversion;
 385
 386 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 387 int inherit_process_coding_system;
 388
 389 /* Coding system to be used to encode text for terminal display.  */
 390 struct coding_system terminal_coding;
 391
 392 /* Coding system to be used to encode text for terminal display when
 393    terminal coding system is nil.  */
 394 struct coding_system safe_terminal_coding;
 395
 396 /* Coding system of what is sent from terminal keyboard.  */
 397 struct coding_system keyboard_coding;
 398
 399 /* Default coding system to be used to write a file.  */
 400 struct coding_system default_buffer_file_coding;
 401
 402 Lisp_Object Vfile_coding_system_alist;
 403 Lisp_Object Vprocess_coding_system_alist;
 404 Lisp_Object Vnetwork_coding_system_alist;
 405
 406 Lisp_Object Vlocale_coding_system;
 407
 408 #endif /* emacs */
 409
 410 Lisp_Object Qcoding_category, Qcoding_category_index;
 411
 412 /* List of symbols `coding-category-xxx' ordered by priority.  */
 413 Lisp_Object Vcoding_category_list;
 414
 415 /* Table of coding categories (Lisp symbols).  */
 416 Lisp_Object Vcoding_category_table;
 417
 418 /* Table of the symbol name for each coding category.  */
 419 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 420   "coding-category-emacs-mule",
 421   "coding-category-sjis",
 422   "coding-category-iso-7",
 423   "coding-category-iso-7-tight",
 424   "coding-category-iso-8-1",
 425   "coding-category-iso-8-2",
 426   "coding-category-iso-7-else",
 427   "coding-category-iso-8-else",
 428   "coding-category-ccl",
 429   "coding-category-big5",
 430   "coding-category-utf-8",
 431   "coding-category-utf-16-be",
 432   "coding-category-utf-16-le",
 433   "coding-category-raw-text",
 434   "coding-category-binary"
 435 };
 436
 437 /* Table of pointers to coding systems corresponding to each coding
 438    category.  */
 439 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 440
 441 /* Table of coding category masks.  Nth element is a mask for a coding
 442    category whose priority is Nth.  */
 443 static
 444 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 445
 446 /* Flag to tell if we look up translation table on character code
 447    conversion.  */
 448 Lisp_Object Venable_character_translation;
 449 /* Standard translation table to look up on decoding (reading).  */
 450 Lisp_Object Vstandard_translation_table_for_decode;
 451 /* Standard translation table to look up on encoding (writing).  */
 452 Lisp_Object Vstandard_translation_table_for_encode;
 453
 454 Lisp_Object Qtranslation_table;
 455 Lisp_Object Qtranslation_table_id;
 456 Lisp_Object Qtranslation_table_for_decode;
 457 Lisp_Object Qtranslation_table_for_encode;
 458
 459 /* Alist of charsets vs revision number.  */
 460 Lisp_Object Vcharset_revision_alist;
 461
 462 /* Default coding systems used for process I/O.  */
 463 Lisp_Object Vdefault_process_coding_system;
 464
 465 /* Global flag to tell that we can't call post-read-conversion and
 466    pre-write-conversion functions.  Usually the value is zero, but it
 467    is set to 1 temporarily while such functions are running.  This is
 468    to avoid infinite recursion.  */
 469 static int inhibit_pre_post_conversion;
 470
 471 \f
 472 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 473
 474 /* Emacs' internal format for encoding multiple character sets is a
 475    kind of multi-byte encoding, i.e. characters are encoded by
 476    variable-length sequences of one-byte codes.
 477
 478    ASCII characters and control characters (e.g. `tab', `newline') are
 479    represented by one-byte sequences which are their ASCII codes, in
 480    the range 0x00 through 0x7F.
 481
 482    8-bit characters of the range 0x80..0x9F are represented by
 483    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 484    code + 0x20).
 485
 486    8-bit characters of the range 0xA0..0xFF are represented by
 487    one-byte sequences which are their 8-bit code.
 488
 489    The other characters are represented by a sequence of `base
 490    leading-code', optional `extended leading-code', and one or two
 491    `position-code's.  The length of the sequence is determined by the
 492    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 493    whereas extended leading-code and position-code take the range 0xA0
 494    through 0xFF.  See `charset.h' for more details about leading-code
 495    and position-code.
 496
 497    --- CODE RANGE of Emacs' internal format ---
 498    character set        range
 499    -------------        -----
 500    ascii                0x00..0x7F
 501    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 502    eight-bit-graphic    0xA0..0xFF
 503    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 504    ---------------------------------------------
 505
 506   */
 507
 508 enum emacs_code_class_type emacs_code_class[256];
 509
 510 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 511    Check if a text is encoded in Emacs' internal format.  If it is,
 512    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 513
 514 int
 515 detect_coding_emacs_mule (src, src_end)
 516       unsigned char *src, *src_end;
 517 {
 518   unsigned char c;
 519   int composing = 0;    /* Nonzero after the old leading code 0x80.  */
 520   /* Dummy for ONE_MORE_BYTE, which stores into coding->result.  */
 521   struct coding_system dummy_coding;
 522   struct coding_system *coding = &dummy_coding;
 523
 524   while (1)
 525     {
 526       ONE_MORE_BYTE (c);        /* Leaves the loop at end of text.  */
 527
 528       if (composing)
 529         {
 530           if (c < 0xA0)
 531             composing = 0;      /* A byte below 0xA0 ends this state.  */
 532           else if (c == 0xA0)
 533             {
 534               ONE_MORE_BYTE (c);
 535               c &= 0x7F;
 536             }
 537           else
 538             c -= 0x20;
 539         }
 540
 541       if (c < 0x20)
 542         {
 543           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 544             return 0;           /* These codes rule out emacs-mule.  */
 545         }
 546       else if (c >= 0x80 && c < 0xA0)
 547         {
 548           if (c == 0x80)
 549             /* Old leading code for a composite character.  */
 550             composing = 1;
 551           else
 552             {
 553               unsigned char *src_base = src - 1;
 554               int bytes;
 555
 556               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 557                                                bytes))
 558                 return 0;       /* Presumably an invalid sequence.  */
 559               src = src_base + bytes;   /* Skip the accepted bytes.  */
 560             }
 561         }
 562     }
 563  label_end_of_loop:      /* Reached only via ONE_MORE_BYTE.  */
 564   return CODING_CATEGORY_MASK_EMACS_MULE;
 565 }
 566
 567
 568 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 569
 570 static void
 571 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 572      struct coding_system *coding;
 573      unsigned char *source, *destination;
 574      int src_bytes, dst_bytes;
 575 {
 576   unsigned char *src = source;
 577   unsigned char *src_end = source + src_bytes;
 578   unsigned char *dst = destination;
 579   unsigned char *dst_end = destination + dst_bytes;
 580   /* SRC_BASE remembers the start position in source in each loop.
 581      The loop will be exited when the source is used up, or when
 582      there's not enough destination area to produce a character.  In
 583      either case SRC_BASE - SOURCE is the number of bytes consumed.  */
 584   unsigned char *src_base;
 585
 586   coding->produced_char = 0;
 587   while ((src_base = src) < src_end)
 588     {
 589       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 590       int bytes;
 591
 592       if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 593         {
 594           p = src;              /* Copy the sequence as is.  */
 595           src += bytes;
 596         }
 597       else
 598         {
 599           bytes = CHAR_STRING (*src, tmp);
 600           p = tmp;              /* Copy the form built in TMP instead.  */
 601           src++;
 602         }
 603       if (dst + bytes >= (dst_bytes ? dst_end : src))
 604         {       /* NOTE(review): `>=' here, but `>' in EMIT_CHAR.  */
 605           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 606           break;
 607         }
 608       while (bytes--) *dst++ = *p++;
 609       coding->produced_char++;
 610     }
 611   coding->consumed = coding->consumed_char = src_base - source;
 612   coding->produced = dst - destination;
 613 }
 614 /* Encoding to emacs-mule is done by encode_eol with the same arguments.  */
 615 #define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
 616   encode_eol (coding, source, destination, src_bytes, dst_bytes)
 617
 618
 619 \f
 620 /*** 3. ISO2022 handlers ***/
 621
 622 /* The following note describes the coding system ISO2022 briefly.
 623    Since the intention of this note is to help understand the
 624    functions in this file, some parts are NOT ACCURATE or OVERLY
 625    SIMPLIFIED.  For thorough understanding, please refer to the
 626    original document of ISO2022.
 627
 628    ISO2022 provides many mechanisms to encode several character sets
 629    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 630    is encoded using bytes less than 128.  This may make the encoded
 631    text a little bit longer, but the text passes more easily through
 632    several gateways, some of which strip off MSB (Most Significant Bit).
 633
 634    There are two kinds of character sets: control character set and
 635    graphic character set.  The former contains control characters such
 636    as `newline' and `escape' to provide control functions (control
 637    functions are also provided by escape sequences).  The latter
 638    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 639    two control character sets and many graphic character sets.
 640
 641    Graphic character sets are classified into one of the following
 642    four classes, according to the number of bytes (DIMENSION) and
 643    number of characters in one dimension (CHARS) of the set:
 644    - DIMENSION1_CHARS94
 645    - DIMENSION1_CHARS96
 646    - DIMENSION2_CHARS94
 647    - DIMENSION2_CHARS96
 648
 649    In addition, each character set is assigned an identification tag,
 650    unique for each set, called "final character" (denoted as <F>
 651    hereafter).  The <F> of each character set is decided by ECMA(*)
 652    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 653    (0x30..0x3F are for private use only).
 654
 655    Note (*): ECMA = European Computer Manufacturers Association
 656
 657    Here are examples of graphic character set [NAME(<F>)]:
 658         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 659         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 660         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 661         o DIMENSION2_CHARS96 -- none for the moment
 662
 663    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 664         C0 [0x00..0x1F] -- control character plane 0
 665         GL [0x20..0x7F] -- graphic character plane 0
 666         C1 [0x80..0x9F] -- control character plane 1
 667         GR [0xA0..0xFF] -- graphic character plane 1
 668
 669    A control character set is directly designated and invoked to C0 or
 670    C1 by an escape sequence.  The most common case is that:
 671    - ISO646's  control character set is designated/invoked to C0, and
 672    - ISO6429's control character set is designated/invoked to C1,
 673    and usually these designations/invocations are omitted in encoded
 674    text.  In a 7-bit environment, only C0 can be used, and a control
 675    character for C1 is encoded by an appropriate escape sequence to
 676    fit into the environment.  All control characters for C1 are
 677    defined to have corresponding escape sequences.
 678
 679    A graphic character set is at first designated to one of four
 680    graphic registers (G0 through G3), then these graphic registers are
 681    invoked to GL or GR.  These designations and invocations can be
 682    done independently.  The most common case is that G0 is invoked to
 683    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 684    these invocations and designations are omitted in encoded text.
 685    In a 7-bit environment, only GL can be used.
 686
 687    When a graphic character set of CHARS94 is invoked to GL, codes
 688    0x20 and 0x7F of the GL area work as control characters SPACE and
 689    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 690    be used.
 691
 692    There are two ways of invocation: locking-shift and single-shift.
 693    With locking-shift, the invocation lasts until the next different
 694    invocation, whereas with single-shift, the invocation affects the
 695    following character only and doesn't affect the locking-shift
 696    state.  Invocations are done by the following control characters or
 697    escape sequences:
 698
 699    ----------------------------------------------------------------------
 700    abbrev  function                  cntrl escape seq   description
 701    ----------------------------------------------------------------------
 702    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 703    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 704    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 705    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 706    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 707    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 708    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 709    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 710    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 711    ----------------------------------------------------------------------
 712    (*) These are not used by any known coding system.
 713
 714    Control characters for these functions are defined by macros
 715    ISO_CODE_XXX in `coding.h'.
 716
 717    Designations are done by the following escape sequences:
 718    ----------------------------------------------------------------------
 719    escape sequence      description
 720    ----------------------------------------------------------------------
 721    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 722    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 723    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 724    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 725    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 726    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 727    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 728    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 729    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 730    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 731    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 732    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 733    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 734    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 735    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 736    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 737    ----------------------------------------------------------------------
 738
 739    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 740    of dimension 1, chars 94, and final character <F>, etc...
 741
 742    Note (*): Although these designations are not allowed in ISO2022,
 743    Emacs accepts them on decoding, and produces them on encoding
 744    CHARS96 character sets in a coding system which is characterized as
 745    7-bit environment, non-locking-shift, and non-single-shift.
 746
 747    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 748    '(' can be omitted.  We refer to this as "short-form" hereafter.
 749
 750    Now you may notice that there are a lot of ways for encoding the
 751    same multilingual text in ISO2022.  Actually, there exist many
 752    coding systems such as Compound Text (used in X11's inter-client
 753    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 754    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 755    localized platforms), and all of these are variants of ISO2022.
 756
 757    In addition to the above, Emacs handles two more kinds of escape
 758    sequences: ISO6429's direction specification and Emacs' private
 759    sequence for specifying character composition.
 760
 761    ISO6429's direction specification takes the following form:
 762         o CSI ']'      -- end of the current direction
 763         o CSI '0' ']'  -- end of the current direction
 764         o CSI '1' ']'  -- start of left-to-right text
 765         o CSI '2' ']'  -- start of right-to-left text
 766    The control character CSI (0x9B: control sequence introducer) is
 767    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 768
 769    Character composition specification takes the following form:
 770         o ESC '0' -- start relative composition
 771         o ESC '1' -- end composition
 772         o ESC '2' -- start rule-base composition (*)
 773         o ESC '3' -- start relative composition with alternate chars  (**)
 774         o ESC '4' -- start rule-base composition with alternate chars  (**)
 775   Since these are not standard escape sequences of any ISO standard,
 776   their use with these meanings is restricted to Emacs only.
 777
 778   (*) This form is used only in Emacs 20.5 and the older versions,
 779   but the newer versions can safely decode it.
 780   (**) This form is used only in Emacs 21.1 and the newer versions,
 781   and the older versions can't decode it.
 782
 783   Here's a list of example usages of these composition escape
 784   sequences (categorized by `enum composition_method').
 785
 786   COMPOSITION_RELATIVE:
 787         ESC 0 CHAR [ CHAR ] ESC 1
 788   COMPOSITION_WITH_RULE:
 789         ESC 2 CHAR [ RULE CHAR ] ESC 1
 790   COMPOSITION_WITH_ALTCHARS:
 791         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 792   COMPOSITION_WITH_RULE_ALTCHARS:
 793         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 794
 795 enum iso_code_class_type iso_code_class[256]; /* Indexed by byte value.  */
 796
     /* Non-zero if coding_system_table[IDX] is set and that coding system
        either marks CHARSET in its safe_charsets array or has a requested
        designation for CHARSET (i.e. CODING_SPEC_ISO_REQUESTED_DESIGNATION
        is not CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION).  */
 797 #define CHARSET_OK(idx, charset)                                \
 798   (coding_system_table[idx]                                     \
 799    && (coding_system_table[idx]->safe_charsets[charset]         \
 800        || (CODING_SPEC_ISO_REQUESTED_DESIGNATION                \
 801             (coding_system_table[idx], charset)                 \
 802            != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)))
 803
     /* Non-zero if CODING_SPEC_ISO_INITIAL_DESIGNATION for argument 1
        (presumably graphic register G1, the one SO invokes -- confirm) of
        coding_system_table[IDX] is >= 0.  Unlike CHARSET_OK, this does not
        test coding_system_table[IDX] itself before using it.  */
 804 #define SHIFT_OUT_OK(idx) \
 805   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 806
 807 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 808    Check if the text between SRC and SRC_END is encoded in ISO2022.
 809    If it is, return an integer in which the appropriate flag bits, any of:
 810         CODING_CATEGORY_MASK_ISO_7
 811         CODING_CATEGORY_MASK_ISO_7_TIGHT
 812         CODING_CATEGORY_MASK_ISO_8_1
 813         CODING_CATEGORY_MASK_ISO_8_2
 814         CODING_CATEGORY_MASK_ISO_7_ELSE
 815         CODING_CATEGORY_MASK_ISO_8_ELSE
 816    are set.  If a code which should never appear in ISO2022 is found,
 817    return 0.

        While scanning, MASK holds the categories not yet ruled out and
        MASK_FOUND the categories for which some evidence was seen; the
        value returned is MASK & MASK_FOUND.  Scanning also stops as soon
        as MASK becomes 0.  */
 818
 819 int
 820 detect_coding_iso2022 (src, src_end)
 821      unsigned char *src, *src_end;
 822 {
 823   int mask = CODING_CATEGORY_MASK_ISO;
 824   int mask_found = 0;
       /* reg[N] is the charset last designated to graphic register N (-1 if
          none).  NOTE(review): shift_out is never assigned after being
          initialized to 0, so the `shift_out == 1' test under ISO_CODE_SI
          below cannot succeed in the code as written.  */
 825   int reg[4], shift_out = 0, single_shifting = 0;
       /* NOTE(review): this `i' is unused; the only `i' referenced below is
          the one declared in the inner block that counts 0xA0..0xFF bytes.  */
 826   int c, c1, i, charset;
 827   /* Dummy for ONE_MORE_BYTE.  */
 828   struct coding_system dummy_coding;
 829   struct coding_system *coding = &dummy_coding;
 830
 831   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 832   while (mask && src < src_end)
 833     {
 834       ONE_MORE_BYTE (c);
 835       switch (c)
 836         {
 837         case ISO_CODE_ESC:
 838           single_shifting = 0;
 839           ONE_MORE_BYTE (c);
 840           if (c >= '(' && c <= '/')
 841             {
 842               /* Designation sequence for a charset of dimension 1.  */
 843               ONE_MORE_BYTE (c1);
 844               if (c1 < ' ' || c1 >= 0x80
 845                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 846                 /* Invalid designation sequence.  Just ignore.  */
 847                 break;
 848               reg[(c - '(') % 4] = charset;
 849             }
 850           else if (c == '$')
 851             {
 852               /* Designation sequence for a charset of dimension 2.  */
 853               ONE_MORE_BYTE (c);
 854               if (c >= '@' && c <= 'B')
 855                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 856                 reg[0] = charset = iso_charset_table[1][0][c];
 857               else if (c >= '(' && c <= '/')
 858                 {
 859                   ONE_MORE_BYTE (c1);
 860                   if (c1 < ' ' || c1 >= 0x80
 861                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 862                     /* Invalid designation sequence.  Just ignore.  */
 863                     break;
 864                   reg[(c - '(') % 4] = charset;
 865                 }
 866               else
 867                 /* Invalid designation sequence.  Just ignore.  */
 868                 break;
 869             }
 870           else if (c == 'N' || c == 'O')
 871             {
 872               /* ESC <Fe> for SS2 or SS3.  */
 873               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 874               break;
 875             }
 876           else if (c >= '0' && c <= '4')
 877             {
 878               /* ESC <Fp> for start/end composition.  */
 879               mask_found |= CODING_CATEGORY_MASK_ISO;
 880               break;
 881             }
 882           else
 883             /* Invalid escape sequence.  Just ignore.  */
 884             break;
 885
 886           /* We found a valid designation sequence for CHARSET.  */
               /* Drop the CODING_CATEGORY_MASK_ISO_8BIT categories, then for
                  each remaining category either record it as found or rule
                  it out, depending on whether it accepts CHARSET.  */
 887           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 888           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset))
 889             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 890           else
 891             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 892           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset))
 893             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 894           else
 895             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 896           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset))
 897             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 898           else
 899             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 900           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset))
 901             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 902           else
 903             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 904           break;
 905
 906         case ISO_CODE_SO:
 907           single_shifting = 0;
 908           if (shift_out == 0
 909               && (reg[1] >= 0
 910                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
 911                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
 912             {
 913               /* Locking shift out.  */
 914               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 915               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 916             }
 917           break;
 918
 919         case ISO_CODE_SI:
 920           single_shifting = 0;
 921           if (shift_out == 1)
 922             {
 923               /* Locking shift in.  */
 924               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
 925               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
 926             }
 927           break;
 928
 929         case ISO_CODE_CSI:
 930           single_shifting = 0;
               /* Fall through: CSI shares the mask handling below, but the
                  `c != ISO_CODE_CSI' test keeps it out of the single-shift
                  part.  */
 931         case ISO_CODE_SS2:
 932         case ISO_CODE_SS3:
 933           {
 934             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 935
 936             if (c != ISO_CODE_CSI)
 937               {
 938                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 939                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 940                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 941                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 942                     & CODING_FLAG_ISO_SINGLE_SHIFT)
 943                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 944                 single_shifting = 1;
 945               }
 946             if (VECTORP (Vlatin_extra_code_table)
 947                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 948               {
 949                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 950                     & CODING_FLAG_ISO_LATIN_EXTRA)
 951                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 952                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 953                     & CODING_FLAG_ISO_LATIN_EXTRA)
 954                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 955               }
 956             mask &= newmask;
 957             mask_found |= newmask;
 958           }
 959           break;
 960
 961         default:
 962           if (c < 0x80)
 963             {
 964               single_shifting = 0;
 965               break;
 966             }
 967           else if (c < 0xA0)
 968             {
 969               single_shifting = 0;
                   /* A byte in 0x80..0x9F is acceptable only if
                      Vlatin_extra_code_table lists it; otherwise the text
                      is judged not to be ISO2022 at all.  */
 970               if (VECTORP (Vlatin_extra_code_table)
 971                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 972                 {
 973                   int newmask = 0;
 974
 975                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
 976                       & CODING_FLAG_ISO_LATIN_EXTRA)
 977                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 978                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
 979                       & CODING_FLAG_ISO_LATIN_EXTRA)
 980                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 981                   mask &= newmask;
 982                   mask_found |= newmask;
 983                 }
 984               else
 985                 return 0;
 986             }
 987           else
 988             {
 989               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
 990                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 991               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
 992               /* Check the length of succeeding codes of the range
 993                  0xA0..0xFF.  If the byte length is odd, we exclude
 994                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
 995                  when we are not single shifting.  */
 996               if (!single_shifting
 997                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
 998                 {
                       /* Counts the current byte plus the following bytes
                          >= 0xA0.  NOTE(review): the byte that ends the run
                          appears to be consumed by ONE_MORE_BYTE and is not
                          examined by the outer switch -- confirm against the
                          macro's definition.  */
 999                   int i = 1;
1000                   while (src < src_end)
1001                     {
1002                       ONE_MORE_BYTE (c);
1003                       if (c < 0xA0)
1004                         break;
1005                       i++;
1006                     }
1007
                       /* An odd run that reaches the end of the text is not
                          counted against ISO_8_2 (`src < src_end' fails).  */
1008                   if (i & 1 && src < src_end)
1009                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1010                   else
1011                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1012                 }
1013             }
1014           break;
1015         }
1016     }
1017  label_end_of_loop:
1018   return (mask & mask_found);
1019 }
1020
1021 /* Decode a character whose charset is CHARSET, whose 1st position
1022    code is C1 and whose 2nd position code is C2, and return the decoded
1023    character code.  If the variable `translation_table' (taken from the
1024    expansion site) is non-nil, return the code translated through it.  */
1025
1026 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1027   (NILP (translation_table)                     \
1028    ? MAKE_CHAR (charset, c1, c2)                \
1029    : translate_char (translation_table, -1, charset, c1, c2))
1030
1031 /* Set designation state into CODING: designate to graphic register REG
        the charset that ISO_CHARSET_TABLE yields for DIMENSION, CHARS and
        FINAL_CHAR.  The macro uses the variable `coding' and the label
        `label_invalid_code' of the expansion site.

        Control goes to label_invalid_code, without changing the
        designation, in these cases:
          - FINAL_CHAR is below '0' or is 128 or above;
          - the charset is unknown (negative), or it is neither the one
            requested for REG nor marked in coding->safe_charsets; REG is
            then remembered in last_invalid_designation_register;
          - ASCII is designated to register 0 while
            last_invalid_designation_register is 0, so that the sequence
            is kept as is.

        If CODING_MODE_DIRECTION is set in coding->mode and the charset has
        a reverse charset, the reverse charset is designated instead.  */
1032 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1033   do {                                                                     \
1034     int charset;                                                           \
1035                                                                            \
1036     if (final_char < '0' || final_char >= 128)                             \
1037       goto label_invalid_code;                                             \
1038     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1039                                  make_number (chars),                      \
1040                                  make_number (final_char));                \
1041     if (charset >= 0                                                       \
1042         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1043             || coding->safe_charsets[charset]))                            \
1044       {                                                                    \
1045         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1046             && reg == 0                                                    \
1047             && charset == CHARSET_ASCII)                                   \
1048           {                                                                \
1049             /* We should insert this designation sequence as is so         \
1050                that it is surely written back to a file.  */               \
1051             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1052             goto label_invalid_code;                                       \
1053           }                                                                \
1054         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1055         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1056             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1057           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1058         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1059       }                                                                    \
1060     else                                                                   \
1061       {                                                                    \
1062         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1063         goto label_invalid_code;                                           \
1064       }                                                                    \
1065   } while (0)
1066
1067 /* Allocate a fresh composition_data block for CODING, recording
1068    CHAR_OFFSET in it, and append it to the chain CODING already holds.  */
1069
1070 void
1071 coding_allocate_composition_data (coding, char_offset)
1072      struct coding_system *coding;
1073      int char_offset;
1074 {
1075   struct composition_data *tail = coding->cmp_data, *block;
1076
1077   block = (struct composition_data *) xmalloc (sizeof *block);
1078   block->char_offset = char_offset;
1079   block->used = 0;
1080   block->next = NULL;
1081   block->prev = tail;
1082   if (tail != NULL)
1083     tail->next = block;
1084   /* The new block becomes the current one; nothing is recorded in it yet.  */
1085   coding->cmp_data = block;
1086   coding->cmp_data_start = 0;
1087 }
1087
1088 /* Record the starting position START and METHOD of one composition.
        A four-element record is opened at the current end of
        CODING->cmp_data: its index is saved in CODING->cmp_data_start,
        element 0 is set to -1 (CODING_ADD_COMPOSITION_END later replaces
        it with the record length), element 1 to the block's char_offset
        plus START, and element 3 to METHOD.  Element 2 is left for the end
        position.  No check is made here that the block has room.  */
1089
1090 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
1091   do {                                                          \
1092     struct composition_data *cmp_data = coding->cmp_data;       \
1093     int *data = cmp_data->data + cmp_data->used;                \
1094     coding->cmp_data_start = cmp_data->used;                    \
1095     data[0] = -1;                                               \
1096     data[1] = cmp_data->char_offset + start;                    \
1097     data[3] = (int) method;                                     \
1098     cmp_data->used += 4;                                        \
1099   } while (0)
1100
1101 /* Record the ending position END of the current composition.
        In the record that starts at CODING->cmp_data_start, element 0
        becomes the number of elements used since that start and element 2
        becomes the block's char_offset plus END.  CODING->cmp_data must be
        non-null; it is dereferenced unconditionally.  */
1102
1103 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
1104   do {                                                          \
1105     struct composition_data *cmp_data = coding->cmp_data;       \
1106     int *data = cmp_data->data + coding->cmp_data_start;        \
1107     data[0] = cmp_data->used - coding->cmp_data_start;          \
1108     data[2] = cmp_data->char_offset + end;                      \
1109   } while (0)
1110
1111 /* Record one COMPONENT (alternate character or composition rule).
        COMPONENT is stored at index `used' of CODING->cmp_data->data and
        `used' is then incremented.  No bounds check is made here.  */
1112
1113 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)     \
1114   (coding->cmp_data->data[coding->cmp_data->used++] = component)
1115
1116 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
        C1 is the byte that followed ESC.  The macro uses the variables
        `coding' and `dst' and the label `label_end_of_loop' of the
        expansion site.

        - If composition is disabled, ESC and C1 (masked with 0x7f) are
          copied to DST and counted as two produced characters.
        - If no composition is in progress, a new one is started: the
          method is chosen from C1 and the start is recorded in
          coding->cmp_data.  When coding->cmp_data is missing or too full,
          coding->result is set to CODING_FINISH_INSUFFICIENT_CMP and
          control goes to label_end_of_loop instead.
        - If a composition with alternate chars is already in progress,
          the method is switched to COMPOSITION_RELATIVE; any other
          in-progress method is left untouched.  */
1117
1118 #define DECODE_COMPOSITION_START(c1)                                       \
1119   do {                                                                     \
1120     if (coding->composing == COMPOSITION_DISABLED)                         \
1121       {                                                                    \
1122         *dst++ = ISO_CODE_ESC;                                             \
1123         *dst++ = c1 & 0x7f;                                                \
1124         coding->produced_char += 2;                                        \
1125       }                                                                    \
1126     else if (!COMPOSING_P (coding))                                        \
1127       {                                                                    \
1128         /* This is surely the start of a composition.  We must be sure     \
1129            that coding->cmp_data has enough space to store the             \
1130            information about the composition.  If not, terminate the       \
1131            current decoding loop, allocate one more memory block for       \
1132            coding->cmp_data in the caller, then start the decoding         \
1133            loop again.  We can't allocate memory here directly because     \
1134            it may cause buffer/string relocation.  */                      \
1135         if (!coding->cmp_data                                              \
1136             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1137                 >= COMPOSITION_DATA_SIZE))                                 \
1138           {                                                                \
1139             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1140             goto label_end_of_loop;                                        \
1141           }                                                                \
1142         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1143                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1144                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1145                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1146         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1147                                       coding->composing);                  \
1148         coding->composition_rule_follows = 0;                              \
1149       }                                                                    \
1150     else                                                                   \
1151       {                                                                    \
1152         /* We are already handling a composition.  If the method is        \
1153            the following two, the codes following the current escape       \
1154            sequence are actual characters stored in a buffer.  */          \
1155         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1156             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1157           {                                                                \
1158             coding->composing = COMPOSITION_RELATIVE;                      \
1159             coding->composition_rule_follows = 0;                          \
1160           }                                                                \
1161       }                                                                    \
1162   } while (0)
1163
1164 /* Handle composition end sequence ESC 1.  C1 is the byte that followed
        ESC.  The macro uses the variables `coding' and `dst' of the
        expansion site.

        If composition is disabled, or if no composition is in progress
        (a stray ESC 1), ESC and C1 are copied to DST and counted as two
        produced characters.  The second condition matters because
        CODING_ADD_COMPOSITION_END dereferences coding->cmp_data and writes
        into the record at coding->cmp_data_start: with no open composition
        that pointer may be null, or the write would clobber the previous
        record.

        Otherwise the end position of the current composition is recorded
        and coding->composing is reset to COMPOSITION_NO.  */
1165
1166 #define DECODE_COMPOSITION_END(c1)                                      \
1167   do {                                                                  \
1168     if (coding->composing == COMPOSITION_DISABLED || !COMPOSING_P (coding)) \
1169       {                                                                 \
1170         *dst++ = ISO_CODE_ESC;                                          \
1171         *dst++ = c1;                                                    \
1172         coding->produced_char += 2;                                     \
1173       }                                                                 \
1174     else                                                                \
1175       {                                                                 \
1176         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1177         coding->composing = COMPOSITION_NO;                             \
1178       }                                                                 \
1179   } while (0)
1180
1181 /* Decode a composition rule from the byte C1 (and maybe one more byte
1182    from SRC) and store one encoded composition rule in
1183    coding->cmp_data.

        C1 must be a modifiable variable: 32 is subtracted from it in
        place.  After that, a value below 81 is the old one-byte format
        (global and new reference points are C1 / 9 and C1 % 9, with the
        value 4 mapped to 10); a value from 81 to 92 is the new two-byte
        format, whose second byte is read into the variable `c2' of the
        expansion site with ONE_MORE_BYTE.  Any other value stores the
        rule 0.  In every case coding->composition_rule_follows is reset
        to 0.  */
1184
1185 #define DECODE_COMPOSITION_RULE(c1)                                     \
1186   do {                                                                  \
1187     int rule = 0;                                                       \
1188     (c1) -= 32;                                                         \
1189     if (c1 < 81)                /* old format (before ver.21) */        \
1190       {                                                                 \
1191         int gref = (c1) / 9;                                            \
1192         int nref = (c1) % 9;                                            \
1193         if (gref == 4) gref = 10;                                       \
1194         if (nref == 4) nref = 10;                                       \
1195         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1196       }                                                                 \
1197     else if (c1 < 93)           /* new format (after ver.21) */         \
1198       {                                                                 \
1199         ONE_MORE_BYTE (c2);                                             \
1200         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1201       }                                                                 \
1202     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1203     coding->composition_rule_follows = 0;                               \
1204   } while (0)
1205
1206
1207 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1208
1209 static void
1210 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1211      struct coding_system *coding;
1212      unsigned char *source, *destination;
1213      int src_bytes, dst_bytes;
1214 {
1215   unsigned char *src = source;
1216   unsigned char *src_end = source + src_bytes;
1217   unsigned char *dst = destination;
1218   unsigned char *dst_end = destination + dst_bytes;
1219   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1220   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1221   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1222   /* SRC_BASE remembers the start position in source in each loop.
1223      The loop will be exited when there's not enough source code
1224      (within macro ONE_MORE_BYTE), or when there's not enough
1225      destination area to produce a character (within macro
1226      EMIT_CHAR).  */
1227   unsigned char *src_base;
1228   int c, charset;
1229   Lisp_Object translation_table;
1230
1231   if (NILP (Venable_character_translation))
1232     translation_table = Qnil;
1233   else
1234     {
1235       translation_table = coding->translation_table_for_decode;
1236       if (NILP (translation_table))
1237         translation_table = Vstandard_translation_table_for_decode;
1238     }
1239
1240   coding->result = CODING_FINISH_NORMAL;
1241
1242   while (1)
1243     {
1244       int c1, c2;
1245
1246       src_base = src;
1247       ONE_MORE_BYTE (c1);
1248
1249       /* We produce no character or one character.  */
1250       switch (iso_code_class [c1])
1251         {
1252         case ISO_0x20_or_0x7F:
1253           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1254             {
1255               DECODE_COMPOSITION_RULE (c1);
1256               continue;
1257             }
1258           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1259             {
1260               /* This is SPACE or DEL.  */
1261               charset = CHARSET_ASCII;
1262               break;
1263             }
1264           /* This is a graphic character, we fall down ...  */
1265
1266         case ISO_graphic_plane_0:
1267           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1268             {
1269               DECODE_COMPOSITION_RULE (c1);
1270               continue;
1271             }
1272           charset = charset0;
1273           break;
1274
1275         case ISO_0xA0_or_0xFF:
1276           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1277               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1278             goto label_invalid_code;
1279           /* This is a graphic character, we fall down ... */
1280
1281         case ISO_graphic_plane_1:
1282           if (charset1 < 0)
1283             goto label_invalid_code;
1284           charset = charset1;
1285           break;
1286
1287         case ISO_control_0:
1288           if (COMPOSING_P (coding))
1289             DECODE_COMPOSITION_END ('1');
1290
1291           /* All ISO2022 control characters in this class have the
1292              same representation in Emacs internal format.  */
1293           if (c1 == '\n'
1294               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1295               && (coding->eol_type == CODING_EOL_CR
1296                   || coding->eol_type == CODING_EOL_CRLF))
1297             {
1298               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1299               goto label_end_of_loop;
1300             }
1301           charset = CHARSET_ASCII;
1302           break;
1303
1304         case ISO_control_1:
1305           if (COMPOSING_P (coding))
1306             DECODE_COMPOSITION_END ('1');
1307           goto label_invalid_code;
1308
1309         case ISO_carriage_return:
1310           if (COMPOSING_P (coding))
1311             DECODE_COMPOSITION_END ('1');
1312
1313           if (coding->eol_type == CODING_EOL_CR)
1314             c1 = '\n';
1315           else if (coding->eol_type == CODING_EOL_CRLF)
1316             {
1317               ONE_MORE_BYTE (c1);
1318               if (c1 != ISO_CODE_LF)
1319                 {
1320                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1321                     {
1322                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1323                       goto label_end_of_loop;
1324                     }
1325                   src--;
1326                   c1 = '\r';
1327                 }
1328             }
1329           charset = CHARSET_ASCII;
1330           break;
1331
1332         case ISO_shift_out:
1333           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1334               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1335             goto label_invalid_code;
1336           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1337           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1338           continue;
1339
1340         case ISO_shift_in:
1341           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1342             goto label_invalid_code;
1343           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1344           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1345           continue;
1346
1347         case ISO_single_shift_2_7:
1348         case ISO_single_shift_2:
1349           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1350             goto label_invalid_code;
1351           /* SS2 is handled as an escape sequence of ESC 'N' */
1352           c1 = 'N';
1353           goto label_escape_sequence;
1354
1355         case ISO_single_shift_3:
1356           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1357             goto label_invalid_code;
1358           /* SS3 is handled as an escape sequence of ESC 'O' */
1359           c1 = 'O';
1360           goto label_escape_sequence;
1361
1362         case ISO_control_sequence_introducer:
1363           /* CSI is handled as an escape sequence of ESC '[' ...  */
1364           c1 = '[';
1365           goto label_escape_sequence;
1366
1367         case ISO_escape:
1368           ONE_MORE_BYTE (c1);
1369         label_escape_sequence:
1370           /* Escape sequences handled by Emacs are invocation,
1371              designation, direction specification, and character
1372              composition specification.  */
1373           switch (c1)
1374             {
1375             case '&':           /* revision of following character set */
1376               ONE_MORE_BYTE (c1);
1377               if (!(c1 >= '@' && c1 <= '~'))
1378                 goto label_invalid_code;
1379               ONE_MORE_BYTE (c1);
1380               if (c1 != ISO_CODE_ESC)
1381                 goto label_invalid_code;
1382               ONE_MORE_BYTE (c1);
1383               goto label_escape_sequence;
1384
1385             case '$':           /* designation of 2-byte character set */
1386               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1387                 goto label_invalid_code;
1388               ONE_MORE_BYTE (c1);
1389               if (c1 >= '@' && c1 <= 'B')
1390                 {       /* designation of JISX0208.1978, GB2312.1980,
1391                            or JISX0208.1980 */
1392                   DECODE_DESIGNATION (0, 2, 94, c1);
1393                 }
1394               else if (c1 >= 0x28 && c1 <= 0x2B)
1395                 {       /* designation of DIMENSION2_CHARS94 character set */
1396                   ONE_MORE_BYTE (c2);
1397                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1398                 }
1399               else if (c1 >= 0x2C && c1 <= 0x2F)
1400                 {       /* designation of DIMENSION2_CHARS96 character set */
1401                   ONE_MORE_BYTE (c2);
1402                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1403                 }
1404               else
1405                 goto label_invalid_code;
1406               /* We must update these variables now.  */
1407               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1408               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1409               continue;
1410
1411             case 'n':           /* invocation of locking-shift-2 */
1412               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1413                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1414                 goto label_invalid_code;
1415               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1416               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1417               continue;
1418
1419             case 'o':           /* invocation of locking-shift-3 */
1420               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1421                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1422                 goto label_invalid_code;
1423               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1424               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1425               continue;
1426
1427             case 'N':           /* invocation of single-shift-2 */
1428               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1429                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1430                 goto label_invalid_code;
1431               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1432               ONE_MORE_BYTE (c1);
1433               break;
1434
1435             case 'O':           /* invocation of single-shift-3 */
1436               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1437                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1438                 goto label_invalid_code;
1439               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1440               ONE_MORE_BYTE (c1);
1441               break;
1442
1443             case '0': case '2': case '3': case '4': /* start composition */
1444               DECODE_COMPOSITION_START (c1);
1445               continue;
1446
1447             case '1':           /* end composition */
1448               DECODE_COMPOSITION_END (c1);
1449               continue;
1450
1451             case '[':           /* specification of direction */
1452               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1453                 goto label_invalid_code;
1454               /* For the moment, nested direction is not supported.
1455                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1456                  left-to-right, and nonzero means right-to-left.  */
1457               ONE_MORE_BYTE (c1);
1458               switch (c1)
1459                 {
1460                 case ']':       /* end of the current direction */
1461                   coding->mode &= ~CODING_MODE_DIRECTION;
1462
1463                 case '0':       /* end of the current direction */
1464                 case '1':       /* start of left-to-right direction */
1465                   ONE_MORE_BYTE (c1);
1466                   if (c1 == ']')
1467                     coding->mode &= ~CODING_MODE_DIRECTION;
1468                   else
1469                     goto label_invalid_code;
1470                   break;
1471
1472                 case '2':       /* start of right-to-left direction */
1473                   ONE_MORE_BYTE (c1);
1474                   if (c1 == ']')
1475                     coding->mode |= CODING_MODE_DIRECTION;
1476                   else
1477                     goto label_invalid_code;
1478                   break;
1479
1480                 default:
1481                   goto label_invalid_code;
1482                 }
1483               continue;
1484
1485             default:
1486               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1487                 goto label_invalid_code;
1488               if (c1 >= 0x28 && c1 <= 0x2B)
1489                 {       /* designation of DIMENSION1_CHARS94 character set */
1490                   ONE_MORE_BYTE (c2);
1491                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1492                 }
1493               else if (c1 >= 0x2C && c1 <= 0x2F)
1494                 {       /* designation of DIMENSION1_CHARS96 character set */
1495                   ONE_MORE_BYTE (c2);
1496                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1497                 }
1498               else
1499                 goto label_invalid_code;
1500               /* We must update these variables now.  */
1501               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1502               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1503               continue;
1504             }
1505         }
1506
1507       /* Now we know CHARSET and 1st position code C1 of a character.
1508          Produce a multibyte sequence for that character while getting
1509          2nd position code C2 if necessary.  */
1510       if (CHARSET_DIMENSION (charset) == 2)
1511         {
1512           ONE_MORE_BYTE (c2);
1513           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1514             /* C2 is not in a valid range.  */
1515             goto label_invalid_code;
1516         }
1517       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1518       EMIT_CHAR (c);
1519       continue;
1520
1521     label_invalid_code:
1522       coding->errors++;
1523       if (COMPOSING_P (coding))
1524         DECODE_COMPOSITION_END ('1');
1525       src = src_base;
1526       c = *src++;
1527       EMIT_CHAR (c);
1528     }
1529
1530  label_end_of_loop:
1531   coding->consumed = coding->consumed_char = src_base - source;
1532   coding->produced = dst - destination;
1533   return;
1534 }
1535
1536
1537 /* ISO2022 encoding stuff.  */
1538
1539 /*
1540    It is not enough to say just "ISO2022" on encoding, we have to
1541    specify more details.  In Emacs, each coding system of ISO2022
1542    variant has the following specifications:
1543         1. Initial designation to G0 thru G3.
1544         2. Allows short-form designation?
1545         3. ASCII should be designated to G0 before control characters?
1546         4. ASCII should be designated to G0 at end of line?
1547         5. 7-bit environment or 8-bit environment?
1548         6. Use locking-shift?
1549         7. Use Single-shift?
1550    And the following two are only for Japanese:
1551         8. Use JISX0201-1976-Roman in place of ASCII?
1552         9. Use JISX0208-1978 in place of JISX0208-1983?
1553    These specifications are encoded in `coding->flags' as flag bits
1554    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1555    details.
1556 */
1557
1558 /* Emit at DST the escape sequence designating CHARSET to graphic
1559    register REG, advance DST past it, and record the designation in
1560    CODING.  If the revision number of CHARSET is below 255, the
1561    sequence is preceded by ESC '&' <'@' + revision>.  The intermediate
1562    character is left out (short form) only when CODING has
1563    CODING_FLAG_ISO_SHORT_FORM, REG is 0, and CHARSET is a 94-char set
1564    of dimension other than 1 whose <final-char> is '@', 'A', or 'B'.  */
1565
1566 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1567   do {                                                                  \
1568     char *intermediate_char_94 = "()*+";                                \
1569     char *intermediate_char_96 = ",-./";                                \
1570     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1571     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
1572                                                                         \
1573     /* Optional announcement of the revision.  */                       \
1574     if (revision < 255)                                                 \
1575       *dst++ = ISO_CODE_ESC, *dst++ = '&', *dst++ = '@' + revision;     \
1576                                                                         \
1577     *dst++ = ISO_CODE_ESC;                                              \
1578     /* '$' marks a charset whose dimension is not 1.  */                \
1579     if (CHARSET_DIMENSION (charset) != 1)                               \
1580       *dst++ = '$';                                                     \
1581                                                                         \
1582     /* Intermediate character, which names the register.  */            \
1583     if (CHARSET_CHARS (charset) != 94)                                  \
1584       *dst++ = (unsigned char) (intermediate_char_96[reg]);             \
1585     else if (CHARSET_DIMENSION (charset) == 1                           \
1586              || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1587              || reg != 0                                                \
1588              || final_char < '@' || final_char > 'B')                   \
1589       *dst++ = (unsigned char) (intermediate_char_94[reg]);             \
1590                                                                         \
1591     *dst++ = final_char;                                                \
1592     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1593   } while (0)
1600
1601 /* Emit single-shift-2 / single-shift-3 at DST: ESC 'N' / ESC 'O' if
1602    CODING_FLAG_ISO_SEVEN_BITS is set, else the one-byte ISO_CODE_SS2 /
1603    ISO_CODE_SS3.  Both then flag CODING as single-shifting.  */
1604
1605 #define ENCODE_SINGLE_SHIFT_2                           \
1606   do {                                                  \
1607     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
1608       *dst++ = ISO_CODE_SS2;                            \
1609     else                                                \
1610       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1611     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1612   } while (0)
1613
1614 #define ENCODE_SINGLE_SHIFT_3                           \
1615   do {                                                  \
1616     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
1617       *dst++ = ISO_CODE_SS3;                            \
1618     else                                                \
1619       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1620     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1621   } while (0)
1622
1623 /* Emit the ISO2022 locking-shift functions at DST -- SI, SO, ESC 'n'
1624    (locking-shift-2) and ESC 'o' (locking-shift-3) -- and record in
1625    CODING which graphic register is now invoked to graphic plane 0.  */
1626
1627 #define ENCODE_SHIFT_IN                         \
1628   do {                                          \
1629     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1630     *dst++ = ISO_CODE_SI;                       \
1631   } while (0)
1632
1633 #define ENCODE_SHIFT_OUT                        \
1634   do {                                          \
1635     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1636     *dst++ = ISO_CODE_SO;                       \
1637   } while (0)
1638
1639 #define ENCODE_LOCKING_SHIFT_2                  \
1640   do {                                          \
1641     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1642     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1643   } while (0)
1644
1645 #define ENCODE_LOCKING_SHIFT_3                  \
1646   do {                                          \
1647     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1648     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1649   } while (0)
1650
1651 /* Produce codes for a DIMENSION1 character whose character set is
1652    CHARSET and whose position-code is C1.  Designation and invocation
1653    sequences are also produced in advance if necessary.
     
        `dst' and `coding' are taken from the place of expansion.  The
        body is a retry loop: each pass either emits the character and
        leaves the loop, or calls encode_invocation_designation and
        tries again.  The cases are tried in this order:
          - a single shift is pending: emit C1 with the 8th bit cleared
            if CODING_FLAG_ISO_SEVEN_BITS is set, with the 8th bit set
            otherwise, and clear the pending single shift;
          - CHARSET is on graphic plane 0: emit C1 with the 8th bit
            cleared;
          - CHARSET is on graphic plane 1: emit C1 with the 8th bit set;
          - CODING_FLAG_ISO_SAFE is set and CHARSET is not one of the
            safe charsets of CODING: emit a substitution instead.  */
1654
1655 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1656   do {                                                                  \
1657     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1658       {                                                                 \
1659         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1660           *dst++ = c1 & 0x7F;                                           \
1661         else                                                            \
1662           *dst++ = c1 | 0x80;                                           \
1663         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1664         break;                                                          \
1665       }                                                                 \
1666     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1667       {                                                                 \
1668         *dst++ = c1 & 0x7F;                                             \
1669         break;                                                          \
1670       }                                                                 \
1671     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1672       {                                                                 \
1673         *dst++ = c1 | 0x80;                                             \
1674         break;                                                          \
1675       }                                                                 \
1676     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1677              && !coding->safe_charsets[charset])                        \
1678       {                                                                 \
1679         /* Do not encode this character.  Emit the substitution code    \
1680            instead, twice when CHARSET_WIDTH (charset) is 2.  */        \
1681         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1682         if (CHARSET_WIDTH (charset) == 2)                               \
1683           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1684         break;                                                          \
1685       }                                                                 \
1686     else                                                                \
1687       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1688          must invoke it, or, at first, designate it to some graphic     \
1689          register.  No `break' here: the loop repeats to actually       \
1690          produce the character.  */                                     \
1691       dst = encode_invocation_designation (charset, coding, dst);       \
1692   } while (1)
1693
1694 /* Produce codes for a DIMENSION2 character whose character set is
1695    CHARSET and whose position-codes are C1 and C2.  Designation and
1696    invocation codes are also produced in advance if necessary.
     
        `dst' and `coding' are taken from the place of expansion.  The
        body is a retry loop: each pass either emits the character and
        leaves the loop, or calls encode_invocation_designation and
        tries again.  The cases are tried in this order:
          - a single shift is pending: emit C1 and C2 with the 8th bit
            cleared if CODING_FLAG_ISO_SEVEN_BITS is set, with the 8th
            bit set otherwise, and clear the pending single shift;
          - CHARSET is on graphic plane 0: emit C1 and C2 with the 8th
            bit cleared;
          - CHARSET is on graphic plane 1: emit C1 and C2 with the 8th
            bit set;
          - CODING_FLAG_ISO_SAFE is set and CHARSET is not one of the
            safe charsets of CODING: emit a substitution instead.  */
1697
1698 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1699   do {                                                                  \
1700     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1701       {                                                                 \
1702         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1703           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1704         else                                                            \
1705           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1706         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1707         break;                                                          \
1708       }                                                                 \
1709     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1710       {                                                                 \
1711         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1712         break;                                                          \
1713       }                                                                 \
1714     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1715       {                                                                 \
1716         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1717         break;                                                          \
1718       }                                                                 \
1719     else if (coding->flags & CODING_FLAG_ISO_SAFE                       \
1720              && !coding->safe_charsets[charset])                        \
1721       {                                                                 \
1722         /* Do not encode this character.  Emit the substitution code    \
1723            instead, twice when CHARSET_WIDTH (charset) is 2.  */        \
1724         *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
1725         if (CHARSET_WIDTH (charset) == 2)                               \
1726           *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
1727         break;                                                          \
1728       }                                                                 \
1729     else                                                                \
1730       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1731          must invoke it, or, at first, designate it to some graphic     \
1732          register.  No `break' here: the loop repeats to actually       \
1733          produce the character.  */                                     \
1734       dst = encode_invocation_designation (charset, coding, dst);       \
1735   } while (1)
1736
1737 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                           \
1738   do {                                                                  \
1739     int alt_charset = charset;                                          \
1740                                                                         \
1741     /* Encode at DST a character of CHARSET whose position codes are    \
1742        C1 and C2.  An undefined CHARSET is copied out as is (C2 only    \
1743        when it is not negative).  Otherwise the DIMENSION1 or           \
1744        DIMENSION2 encoder is used, after the Japanese substitutions     \
1745        requested by the flags of `coding'.  */                          \
1746     if (! CHARSET_DEFINED_P (charset))                                  \
1747       {                                                                 \
1748         *dst++ = c1;                                                    \
1749         if (c2 >= 0)                                                    \
1750           *dst++ = c2;                                                  \
1751       }                                                                 \
1752     else if (CHARSET_DIMENSION (charset) == 1)                          \
1753       {                                                                 \
1754         if (coding->flags & CODING_FLAG_ISO_USE_ROMAN                   \
1755             && charset == CHARSET_ASCII)                                \
1756           alt_charset = charset_latin_jisx0201;                         \
1757         ENCODE_ISO_CHARACTER_DIMENSION1 (alt_charset, c1);              \
1758       }                                                                 \
1759     else                                                                \
1760       {                                                                 \
1761         if (coding->flags & CODING_FLAG_ISO_USE_OLDJIS                  \
1762             && charset == charset_jisx0208)                             \
1763           alt_charset = charset_jisx0208_1978;                          \
1764         ENCODE_ISO_CHARACTER_DIMENSION2 (alt_charset, c1, c2);          \
1765       }                                                                 \
1766   } while (0)
1765
1766 /* Make CHARSET usable for the next character: emit at DST whatever
1767    designation and invocation codes are still needed, update the
1768    element `spec.iso2022' of *CODING accordingly, and return the
1769    advanced DST.  */
1770
1771 unsigned char *
1772 encode_invocation_designation (charset, coding, dst)
1773      int charset;
1774      struct coding_system *coding;
1775      unsigned char *dst;
1776 {
1777   int reg = 0;                  /* graphic register number */
1778
1779   /* Find the graphic register CHARSET is designated to, if any.  */
1780   while (reg < 4 && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
1781     reg++;
1782
1783   if (reg == 4)
1784     {
1785       /* No register holds CHARSET yet.  Designate it to the register
1786          it requests, or to register 0 when it requests none.  */
1787       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1788       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1789         reg = 0;
1790       ENCODE_DESIGNATION (charset, reg, coding);
1791     }
1792
1793   /* Nothing more to do when register REG is already invoked to
1794      graphic plane 0 or 1.  */
1795   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
1796       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
1797     return dst;
1798
1799   /* Otherwise invoke it: SI / SO for registers 0 / 1; for registers
1800      2 and 3 a single shift if CODING_FLAG_ISO_SINGLE_SHIFT is set,
1801      a locking shift if not.  */
1802   if (reg == 0)
1803     ENCODE_SHIFT_IN;
1804   else if (reg == 1)
1805     ENCODE_SHIFT_OUT;
1806   else if (reg == 2)
1807     {
1808       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1809         ENCODE_SINGLE_SHIFT_2;
1810       else
1811         ENCODE_LOCKING_SHIFT_2;
1812     }
1813   else if (reg == 3)
1814     {
1815       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1816         ENCODE_SINGLE_SHIFT_3;
1817       else
1818         ENCODE_LOCKING_SHIFT_3;
1819     }
1820
1821   return dst;
1822 }
1829
1830 /* Emit at DST the two bytes encoding composition rule RULE: first
        32 + 81 + GREF, then 32 + NREF, where GREF and NREF are the
        values COMPOSITION_DECODE_RULE extracts from RULE.  */
1831
1832 #define ENCODE_COMPOSITION_RULE(rule)             \
1833   do {                                            \
1834     int gref, nref;                               \
1835     COMPOSITION_DECODE_RULE (rule, gref, nref);   \
1836     dst[0] = 32 + 81 + gref, dst[1] = 32 + nref;  \
1837     dst += 2;                                     \
1838   } while (0)
1839
1840 /* Produce codes for indicating the start of a composition sequence
1841    (ESC 0, ESC 3, or ESC 4), making DATA[3] the composition state of
1842    CODING.  DATA points to an array of integers describing the
1843    composition; see the comment in coding.h for its format.  */
1844
1845 #define ENCODE_COMPOSITION_START(coding, data)                          \
1846   do {                                                                  \
1847     coding->composing = data[3];                                        \
1848     *dst++ = ISO_CODE_ESC;                                              \
1849     if (coding->composing != COMPOSITION_RELATIVE)                      \
1850       {                                                                 \
1851         *dst++ = (coding->composing != COMPOSITION_WITH_ALTCHARS        \
1852                   ? '4' : '3');                                         \
1853         coding->composition_rule_follows = 0;                           \
1854         coding->cmp_data_index = coding->cmp_data_start + 4;            \
1855       }                                                                 \
1856     else                                                                \
1857       *dst++ = '0';                                                     \
1858   } while (0)
1859
1860 /* Produce codes for indicating the end of the current composition
        (ESC 1).  CODING stops composing and its `cmp_data_start' is
        advanced by DATA[0].  */
1861
1862 #define ENCODE_COMPOSITION_END(coding, data)                    \
1863   do {                                                          \
1864     *dst++ = ISO_CODE_ESC, *dst++ = '1';                        \
1865     coding->cmp_data_start += data[0];                          \
1866     coding->composing = COMPOSITION_NO;                         \
1867     /* This block is used up: move on to the next one, if any.  */ \
1868     if (coding->cmp_data->next                                  \
1869         && coding->cmp_data_start == coding->cmp_data->used)    \
1870       {                                                         \
1871         coding->cmp_data_start = 0;                             \
1872         coding->cmp_data = coding->cmp_data->next;              \
1873       }                                                         \
1874   } while (0)
1875
1876 /* Emit the composition start sequence ESC 0 and put CODING in the
1877    COMPOSITION_RELATIVE state.  Here the sequence does not open a new
1878    composition; it means that the components (alternate chars and
1879    composition rules) are done and the actual text follows in SRC.  */
1880
1881 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
1882   do {                                          \
1883     coding->composing = COMPOSITION_RELATIVE;   \
1884     *dst++ = ISO_CODE_ESC, *dst++ = '0';        \
1885   } while (0)
1887
1888 /* The following three macros produce codes for indicating direction
1889    of text.  They use `dst' and `coding' from the place of expansion.
     
        ENCODE_CONTROL_SEQUENCE_INTRODUCER emits CSI: ESC '[' if
        CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags', else the
        single byte ISO_CODE_CSI.  The flag is tested with `&', as in
        ENCODE_SINGLE_SHIFT_2, because `coding->flags' carries other
        flag bits as well; comparing with `==' would miss the 7-bit
        case whenever any other flag is set.  */
1890 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1891   do {                                                  \
1892     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1893       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1894     else                                                \
1895       *dst++ = ISO_CODE_CSI;                            \
1896   } while (0)
1897
     /* Start of right-to-left text: CSI '2' ']'.  A statement macro,
        since ENCODE_CONTROL_SEQUENCE_INTRODUCER takes no argument and
        is itself a statement, not an expression.  */
1898 #define ENCODE_DIRECTION_R2L                    \
1899   do {                                          \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
         *dst++ = '2', *dst++ = ']';                 \
       } while (0)
1900
     /* End of the current direction, i.e. back to left-to-right text:
        CSI '0' ']'.  */
1901 #define ENCODE_DIRECTION_L2R                    \
1902   do {                                          \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
         *dst++ = '0', *dst++ = ']';                 \
       } while (0)
1903
1904 /* Bring the graphic planes and registers back to their initial state:
1905    emit SI if needed, then re-designate each changed register.  */
1906 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1907   do {                                                                      \
1908     int reg;                                                                \
1909     if (0 != CODING_SPEC_ISO_INVOCATION (coding, 0))                        \
1910       ENCODE_SHIFT_IN;                                                      \
1911     for (reg = 0; reg < 4; reg++)                                           \
1912       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0)           \
1913         if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                       \
1914             != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))           \
1915           ENCODE_DESIGNATION                                                \
1916             (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1917   } while (0)
1918
1919 /* Produce designation sequences of charsets in the line started from
1920    SRC to a place pointed by DST, and return updated DST.
1921
1922    If the current block ends before any end-of-line, we may fail to
1923    find all the necessary designations.
     
        Characters are read with ONE_MORE_CHAR until a newline is read
        or all four graphic registers have been claimed.  For each
        character whose charset requests a graphic register in CODING,
        the first charset seen for that register is remembered.  Each
        remembered charset that differs from the current designation of
        its register is then designated with ENCODE_DESIGNATION.
     
        NOTE(review): ONE_MORE_CHAR is defined outside this part of the
        file.  It presumably reads from SRC up to SRC_END, applies
        TRANSLATION_TABLE, and jumps to label_end_of_loop when the
        source runs out -- confirm against its definition.  */
1924
1925 static unsigned char *
1926 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
1927      struct coding_system *coding;
1928      Lisp_Object translation_table;
1929      unsigned char *src, *src_end, *dst;
1930 {
1931   int charset, c, found = 0, reg;       /* FOUND counts claimed registers */
1932   /* Table of charsets to be designated to each graphic register.  */
1933   int r[4];
1934
1935   for (reg = 0; reg < 4; reg++)
1936     r[reg] = -1;                /* -1: nothing recorded for REG yet */
1937
1938   while (found < 4)
1939     {
1940       ONE_MORE_CHAR (c);
1941       if (c == '\n')
1942         break;
1943
1944       charset = CHAR_CHARSET (c);
1945       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1946       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1947         {
1948           found++;
1949           r[reg] = charset;     /* only the first charset per register */
1950         }
1951     }
1952
1953  label_end_of_loop:             /* not referenced by name in this function */
1954   if (found)
1955     {
1956       for (reg = 0; reg < 4; reg++)
1957         if (r[reg] >= 0
1958             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1959           ENCODE_DESIGNATION (r[reg], reg, coding);
1960     }
1961
1962   return dst;
1963 }
1964
1965 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1966
1967 static void
1968 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1969      struct coding_system *coding;
1970      unsigned char *source, *destination;
1971      int src_bytes, dst_bytes;
1972 {
1973   unsigned char *src = source;
1974   unsigned char *src_end = source + src_bytes;
1975   unsigned char *dst = destination;
1976   unsigned char *dst_end = destination + dst_bytes;
1977   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1978      from DST_END to assure overflow checking is necessary only at the
1979      head of loop.  */
1980   unsigned char *adjusted_dst_end = dst_end - 19;
1981   /* SRC_BASE remembers the start position in source in each loop.
1982      The loop will be exited when there's not enough source text to
1983      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
1984      there's not enough destination area to produce encoded codes
1985      (within macro EMIT_BYTES).  */
1986   unsigned char *src_base;
1987   int c;
1988   Lisp_Object translation_table;
1989
1990   if (NILP (Venable_character_translation))
1991     translation_table = Qnil;
1992   else
1993     {
1994       translation_table = coding->translation_table_for_encode;
1995       if (NILP (translation_table))
1996         translation_table = Vstandard_translation_table_for_encode;
1997     }
1998
1999   coding->consumed_char = 0;
2000   coding->errors = 0;
2001   while (1)
2002     {
2003       int charset, c1, c2;
2004
2005       src_base = src;
2006
2007       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2008         {
2009           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2010           break;
2011         }
2012
2013       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2014           && CODING_SPEC_ISO_BOL (coding))
2015         {
2016           /* We have to produce designation sequences if any now.  */
2017           dst = encode_designation_at_bol (coding, translation_table,
2018                                            src, src_end, dst);
2019           CODING_SPEC_ISO_BOL (coding) = 0;
2020         }
2021
2022       /* Check composition start and end.  */
2023       if (coding->composing != COMPOSITION_DISABLED
2024           && coding->cmp_data_start < coding->cmp_data->used)
2025         {
2026           struct composition_data *cmp_data = coding->cmp_data;
2027           int *data = cmp_data->data + coding->cmp_data_start;
2028           int this_pos = cmp_data->char_offset + coding->consumed_char;
2029
2030           if (coding->composing == COMPOSITION_RELATIVE)
2031             {
2032               if (this_pos == data[2])
2033                 {
2034                   ENCODE_COMPOSITION_END (coding, data);
2035                   cmp_data = coding->cmp_data;
2036                   data = cmp_data->data + coding->cmp_data_start;
2037                 }
2038             }
2039           else if (COMPOSING_P (coding))
2040             {
2041               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2042               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2043                 /* We have consumed components of the composition.
2044                    What follows in SRC is the compositions's base
2045                    text.  */
2046                 ENCODE_COMPOSITION_FAKE_START (coding);
2047               else
2048                 {
2049                   int c = cmp_data->data[coding->cmp_data_index++];
2050                   if (coding->composition_rule_follows)
2051                     {
2052                       ENCODE_COMPOSITION_RULE (c);
2053                       coding->composition_rule_follows = 0;
2054                     }
2055                   else
2056                     {
2057                       SPLIT_CHAR (c, charset, c1, c2);
2058                       ENCODE_ISO_CHARACTER (charset, c1, c2);
2059                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2060                         coding->composition_rule_follows = 1;
2061                     }
2062                   continue;
2063                 }
2064             }
2065           if (!COMPOSING_P (coding))
2066             {
2067               if (this_pos == data[1])
2068                 {
2069                   ENCODE_COMPOSITION_START (coding, data);
2070                   continue;
2071                 }
2072             }
2073         }
2074
2075       ONE_MORE_CHAR (c);
2076
2077       /* Now encode the character C.  */
2078       if (c < 0x20 || c == 0x7F)
2079         {
2080           if (c == '\r')
2081             {
2082               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2083                 {
2084                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2085                     ENCODE_RESET_PLANE_AND_REGISTER;
2086                   *dst++ = c;
2087                   continue;
2088                 }
2089               /* fall down to treat '\r' as '\n' ...  */
2090               c = '\n';
2091             }
2092           if (c == '\n')
2093             {
2094               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2095                 ENCODE_RESET_PLANE_AND_REGISTER;
2096               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2097                 bcopy (coding->spec.iso2022.initial_designation,
2098                        coding->spec.iso2022.current_designation,
2099                        sizeof coding->spec.iso2022.initial_designation);
2100               if (coding->eol_type == CODING_EOL_LF
2101                   || coding->eol_type == CODING_EOL_UNDECIDED)
2102                 *dst++ = ISO_CODE_LF;
2103               else if (coding->eol_type == CODING_EOL_CRLF)
2104                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2105               else
2106                 *dst++ = ISO_CODE_CR;
2107               CODING_SPEC_ISO_BOL (coding) = 1;
2108             }
2109           else
2110             {
2111               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2112                 ENCODE_RESET_PLANE_AND_REGISTER;
2113               *dst++ = c;
2114             }
2115         }
2116       else if (ASCII_BYTE_P (c))
2117         ENCODE_ISO_CHARACTER (CHARSET_ASCII, c, /* dummy */ c1);
2118       else if (SINGLE_BYTE_CHAR_P (c))
2119         {
2120           *dst++ = c;
2121           coding->errors++;
2122         }
2123       else
2124         {
2125           SPLIT_CHAR (c, charset, c1, c2);
2126           ENCODE_ISO_CHARACTER (charset, c1, c2);
2127         }
2128
2129       coding->consumed_char++;
2130     }
2131
2132  label_end_of_loop:
2133   coding->consumed = src_base - source;
2134   coding->produced = coding->produced_char = dst - destination;
2135 }
2136
2137 \f
2138 /*** 4. SJIS and BIG5 handlers ***/
2139
2140 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2141    quite widely.  So, for the moment, Emacs supports them in the bare
2142    C code.  But, in the future, they may be supported only by CCL.  */
2143
2144 /* SJIS is a coding system encoding three character sets: ASCII, right
2145    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2146    as is.  A character of charset katakana-jisx0201 is encoded by
2147    "position-code + 0x80".  A character of charset japanese-jisx0208
2148    is encoded in two bytes, with its two position-codes divided and
2149    shifted so that they fit in the ranges below.
2150
2151    --- CODE RANGE of SJIS ---
2152    (character set)      (range)
2153    ASCII                0x00 .. 0x7F
2154    KATAKANA-JISX0201    0xA0 .. 0xDF
2155    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2156             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2157    -------------------------------
2158
2159 */
2160
2161 /* BIG5 is a coding system encoding two character sets: ASCII and
2162    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2163    character set and is encoded in two bytes.
2164
2165    --- CODE RANGE of BIG5 ---
2166    (character set)      (range)
2167    ASCII                0x00 .. 0x7F
2168    Big5 (1st byte)      0xA1 .. 0xFE
2169         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2170    --------------------------
2171
2172    Since the number of characters in Big5 is larger than maximum
2173    characters in Emacs' charset (96x96), it can't be handled as one
2174    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2175    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2176    contains frequently used characters and the latter contains less
2177    frequently used characters.  */
2178
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Input arguments are parenthesized in the expansions, so they may
   be arbitrary expressions; they are evaluated more than once and
   therefore must have no side effects.  Output arguments must be
   lvalues.  An output may name the same variable as an input,
   because every input is read before any output is stored.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2.  Store the charset in CHARSET and
   the position-codes in C1 and C2.  First bytes below 0xC9 belong to
   `charset_big5_1', the others to `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      charset = charset_big5_1;                                         \
    else                                                                \
      {                                                                 \
        charset = charset_big5_2;                                       \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
    c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
  } while (0)

/* Encode the character of CHARSET whose position-codes are C1 and C2.
   Store the resulting BIG5 byte pair in B1 and B2.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
    b2 = temp % BIG5_SAME_ROW;                                          \
    b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
  } while (0)
2211
2212 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2213    Check if a text is encoded in SJIS.  If it is, return
2214    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2215
2216 int
2217 detect_coding_sjis (src, src_end)
2218      unsigned char *src, *src_end;
2219 {
2220   int c;
2221   /* Dummy for ONE_MORE_BYTE.  */
2222   struct coding_system dummy_coding;
2223   struct coding_system *coding = &dummy_coding;
2224
2225   while (1)
2226     {
2227       ONE_MORE_BYTE (c);
2228       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
2229         {
2230           ONE_MORE_BYTE (c);
2231           if (c < 0x40)
2232             return 0;
2233         }
2234     }
2235  label_end_of_loop:
2236   return CODING_CATEGORY_MASK_SJIS;
2237 }
2238
2239 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2240    Check if a text is encoded in BIG5.  If it is, return
2241    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2242
2243 int
2244 detect_coding_big5 (src, src_end)
2245      unsigned char *src, *src_end;
2246 {
2247   int c;
2248   /* Dummy for ONE_MORE_BYTE.  */
2249   struct coding_system dummy_coding;
2250   struct coding_system *coding = &dummy_coding;
2251
2252   while (1)
2253     {
2254       ONE_MORE_BYTE (c);
2255       if (c >= 0xA1)
2256         {
2257           ONE_MORE_BYTE (c);
2258           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2259             return 0;
2260         }
2261     }
2262  label_end_of_loop:
2263   return CODING_CATEGORY_MASK_BIG5;
2264 }
2265
2266 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2267    Check if a text is encoded in UTF-8.  If it is, return
2268    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2269
/* Predicates which classify the octet C of a UTF-8 sequence by its
   high-order bits.  UTF_8_OCTET_MATCH_P is nonzero if the bits of C
   selected by MASK are equal to BITS.  */
#define UTF_8_OCTET_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFE, 0xFC)
2277
2278 int
2279 detect_coding_utf_8 (src, src_end)
2280      unsigned char *src, *src_end;
2281 {
2282   unsigned char c;
2283   int seq_maybe_bytes;
2284   /* Dummy for ONE_MORE_BYTE.  */
2285   struct coding_system dummy_coding;
2286   struct coding_system *coding = &dummy_coding;
2287
2288   while (1)
2289     {
2290       ONE_MORE_BYTE (c);
2291       if (UTF_8_1_OCTET_P (c))
2292         continue;
2293       else if (UTF_8_2_OCTET_LEADING_P (c))
2294         seq_maybe_bytes = 1;
2295       else if (UTF_8_3_OCTET_LEADING_P (c))
2296         seq_maybe_bytes = 2;
2297       else if (UTF_8_4_OCTET_LEADING_P (c))
2298         seq_maybe_bytes = 3;
2299       else if (UTF_8_5_OCTET_LEADING_P (c))
2300         seq_maybe_bytes = 4;
2301       else if (UTF_8_6_OCTET_LEADING_P (c))
2302         seq_maybe_bytes = 5;
2303       else
2304         return 0;
2305
2306       do
2307         {
2308           ONE_MORE_BYTE (c);
2309           if (!UTF_8_EXTRA_OCTET_P (c))
2310             return 0;
2311           seq_maybe_bytes--;
2312         }
2313       while (seq_maybe_bytes > 0);
2314     }
2315
2316  label_end_of_loop:
2317   return CODING_CATEGORY_MASK_UTF_8;
2318 }
2319
2320 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2321    Check if a text starts with a UTF-16 byte order mark.  If the
2322    first two bytes are 0xFE 0xFF (Big Endian) or 0xFF 0xFE (Little
2323    Endian), return CODING_CATEGORY_MASK_UTF_16_BE or
2324    CODING_CATEGORY_MASK_UTF_16_LE respectively, else return 0.  */
2325
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF, the two values
   this macro treats as invalid.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high surrogate, i.e. in 0xD800..0xDBFF.  The
   six high-order bits decide which kind of surrogate a value is, so
   the mask has to be 0xFC00; masking with the expected value itself
   would also accept values from other ranges (0xFFFF, for
   instance).  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2335
2336 int
2337 detect_coding_utf_16 (src, src_end)
2338      unsigned char *src, *src_end;
2339 {
2340   unsigned char c1, c2;
2341   /* Dummy for TWO_MORE_BYTES.  */
2342   struct coding_system dummy_coding;
2343   struct coding_system *coding = &dummy_coding;
2344
2345   TWO_MORE_BYTES (c1, c2);
2346
2347   if ((c1 == 0xFF) && (c2 == 0xFE))
2348     return CODING_CATEGORY_MASK_UTF_16_LE;
2349   else if ((c1 == 0xFE) && (c2 == 0xFF))
2350     return CODING_CATEGORY_MASK_UTF_16_BE;
2351
2352  label_end_of_loop:
2353   return 0;
2354 }
2355
2356 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2357    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2358
2359 static void
2360 decode_coding_sjis_big5 (coding, source, destination,
2361                          src_bytes, dst_bytes, sjis_p)
2362      struct coding_system *coding;
2363      unsigned char *source, *destination;
2364      int src_bytes, dst_bytes;
2365      int sjis_p;
2366 {
2367   unsigned char *src = source;
2368   unsigned char *src_end = source + src_bytes;
2369   unsigned char *dst = destination;
2370   unsigned char *dst_end = destination + dst_bytes;
2371   /* SRC_BASE remembers the start position in source in each loop.
2372      The loop will be exited when there's not enough source code
2373      (within macro ONE_MORE_BYTE), or when there's not enough
2374      destination area to produce a character (within macro
2375      EMIT_CHAR).  */
2376   unsigned char *src_base;
2377   Lisp_Object translation_table;
2378
2379   if (NILP (Venable_character_translation))
2380     translation_table = Qnil;
2381   else
2382     {
2383       translation_table = coding->translation_table_for_decode;
2384       if (NILP (translation_table))
2385         translation_table = Vstandard_translation_table_for_decode;
2386     }
2387
2388   coding->produced_char = 0;
2389   while (1)
2390     {
2391       int c, charset, c1, c2;
2392
2393       src_base = src;
2394       ONE_MORE_BYTE (c1);
2395
2396       if (c1 < 0x80)
2397         {
2398           charset = CHARSET_ASCII;
2399           if (c1 < 0x20)
2400             {
2401               if (c1 == '\r')
2402                 {
2403                   if (coding->eol_type == CODING_EOL_CRLF)
2404                     {
2405                       ONE_MORE_BYTE (c2);
2406                       if (c2 == '\n')
2407                         c1 = c2;
2408                       else if (coding->mode
2409                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2410                         {
2411                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2412                           goto label_end_of_loop;
2413                         }
2414                       else
2415                         /* To process C2 again, SRC is subtracted by 1.  */
2416                         src--;
2417                     }
2418                   else if (coding->eol_type == CODING_EOL_CR)
2419                     c1 = '\n';
2420                 }
2421               else if (c1 == '\n'
2422                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2423                        && (coding->eol_type == CODING_EOL_CR
2424                            || coding->eol_type == CODING_EOL_CRLF))
2425                 {
2426                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2427                   goto label_end_of_loop;
2428                 }
2429             }
2430         }
2431       else
2432         {
2433           if (sjis_p)
2434             {
2435               if (c1 >= 0xF0)
2436                 goto label_invalid_code;
2437               if (c1 < 0xA0 || c1 >= 0xE0)
2438                 {
2439                   /* SJIS -> JISX0208 */
2440                   ONE_MORE_BYTE (c2);
2441                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2442                     goto label_invalid_code;
2443                   DECODE_SJIS (c1, c2, c1, c2);
2444                   charset = charset_jisx0208;
2445                 }
2446               else
2447                 /* SJIS -> JISX0201-Kana */
2448                 charset = charset_katakana_jisx0201;
2449             }
2450           else
2451             {
2452               /* BIG5 -> Big5 */
2453               if (c1 < 0xA1 || c1 > 0xFE)
2454                 goto label_invalid_code;
2455               ONE_MORE_BYTE (c2);
2456               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2457                 goto label_invalid_code;
2458               DECODE_BIG5 (c1, c2, charset, c1, c2);
2459             }
2460         }
2461
2462       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2463       EMIT_CHAR (c);
2464       continue;
2465
2466     label_invalid_code:
2467       coding->errors++;
2468       src = src_base;
2469       c = *src++;
2470       EMIT_CHAR (c);
2471     }
2472
2473  label_end_of_loop:
2474   coding->consumed = coding->consumed_char = src_base - source;
2475   coding->produced = dst - destination;
2476   return;
2477 }
2478
2479 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2480    This function can encode charsets `ascii', `katakana-jisx0201',
2481    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2482    are sure that all these charsets are registered as official charset
2483    (i.e. do not have extended leading-codes).  Characters of other
2484    charsets are produced without any encoding.  If SJIS_P is 1, encode
2485    SJIS text, else encode BIG5 text.  */
2486
2487 static void
2488 encode_coding_sjis_big5 (coding, source, destination,
2489                          src_bytes, dst_bytes, sjis_p)
2490      struct coding_system *coding;
2491      unsigned char *source, *destination;
2492      int src_bytes, dst_bytes;
2493      int sjis_p;
2494 {
2495   unsigned char *src = source;
2496   unsigned char *src_end = source + src_bytes;
2497   unsigned char *dst = destination;
2498   unsigned char *dst_end = destination + dst_bytes;
2499   /* SRC_BASE remembers the start position in source in each loop.
2500      The loop will be exited when there's not enough source text to
2501      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2502      there's not enough destination area to produce encoded codes
2503      (within macro EMIT_BYTES).  */
2504   unsigned char *src_base;
2505   Lisp_Object translation_table;
2506
2507   if (NILP (Venable_character_translation))
2508     translation_table = Qnil;
2509   else
2510     {
2511       translation_table = coding->translation_table_for_decode;
2512       if (NILP (translation_table))
2513         translation_table = Vstandard_translation_table_for_decode;
2514     }
2515
2516   while (1)
2517     {
2518       int c, charset, c1, c2;
2519
2520       src_base = src;
2521       ONE_MORE_CHAR (c);
2522
2523       /* Now encode the character C.  */
2524       if (SINGLE_BYTE_CHAR_P (c))
2525         {
2526           switch (c)
2527             {
2528             case '\r':
2529               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2530                 {
2531                   EMIT_ONE_BYTE (c);
2532                   break;
2533                 }
2534               c = '\n';
2535             case '\n':
2536               if (coding->eol_type == CODING_EOL_CRLF)
2537                 {
2538                   EMIT_TWO_BYTES ('\r', c);
2539                   break;
2540                 }
2541               else if (coding->eol_type == CODING_EOL_CR)
2542                 c = '\r';
2543             default:
2544               EMIT_ONE_BYTE (c);
2545             }
2546         }
2547       else
2548         {
2549           SPLIT_CHAR (c, charset, c1, c2);
2550           if (sjis_p)
2551             {
2552               if (charset == charset_jisx0208
2553                   || charset == charset_jisx0208_1978)
2554                 {
2555                   ENCODE_SJIS (c1, c2, c1, c2);
2556                   EMIT_TWO_BYTES (c1, c2);
2557                 }
2558               else if (charset == charset_latin_jisx0201)
2559                 EMIT_ONE_BYTE (c1);
2560               else
2561                 /* There's no way other than producing the internal
2562                    codes as is.  */
2563                 EMIT_BYTES (src_base, src);
2564             }
2565           else
2566             {
2567               if (charset == charset_big5_1 || charset == charset_big5_2)
2568                 {
2569                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2570                   EMIT_TWO_BYTES (c1, c2);
2571                 }
2572               else
2573                 /* There's no way other than producing the internal
2574                    codes as is.  */
2575                 EMIT_BYTES (src_base, src);
2576             }
2577         }
2578       coding->consumed_char++;
2579     }
2580
2581  label_end_of_loop:
2582   coding->consumed = src_base - source;
2583   coding->produced = coding->produced_char = dst - destination;
2584 }
2585
2586 \f
2587 /*** 5. CCL handlers ***/
2588
2589 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2590    Check if a text is encoded in a coding system of which
2591    encoder/decoder are written in CCL program.  If it is, return
2592    CODING_CATEGORY_MASK_CCL, else return 0.  */
2593
2594 int
2595 detect_coding_ccl (src, src_end)
2596      unsigned char *src, *src_end;
2597 {
2598   unsigned char *valid;
2599   int c;
2600   /* Dummy for ONE_MORE_BYTE.  */
2601   struct coding_system dummy_coding;
2602   struct coding_system *coding = &dummy_coding;
2603
2604   /* No coding system is assigned to coding-category-ccl.  */
2605   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2606     return 0;
2607
2608   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2609   while (1)
2610     {
2611       ONE_MORE_BYTE (c);
2612       if (! valid[c])
2613         return 0;
2614     }
2615  label_end_of_loop:
2616   return CODING_CATEGORY_MASK_CCL;
2617 }
2618
2619 \f
2620 /*** 6. End-of-line handlers ***/
2621
2622 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2623
2624 static void
2625 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2626      struct coding_system *coding;
2627      unsigned char *source, *destination;
2628      int src_bytes, dst_bytes;
2629 {
2630   unsigned char *src = source;
2631   unsigned char *dst = destination;
2632   unsigned char *src_end = src + src_bytes;
2633   unsigned char *dst_end = dst + dst_bytes;
2634   Lisp_Object translation_table;
2635   /* SRC_BASE remembers the start position in source in each loop.
2636      The loop will be exited when there's not enough source code
2637      (within macro ONE_MORE_BYTE), or when there's not enough
2638      destination area to produce a character (within macro
2639      EMIT_CHAR).  */
2640   unsigned char *src_base;
2641   int c;
2642
2643   translation_table = Qnil;
2644   switch (coding->eol_type)
2645     {
2646     case CODING_EOL_CRLF:
2647       while (1)
2648         {
2649           src_base = src;
2650           ONE_MORE_BYTE (c);
2651           if (c == '\r')
2652             {
2653               ONE_MORE_BYTE (c);
2654               if (c != '\n')
2655                 {
2656                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2657                     {
2658                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2659                       goto label_end_of_loop;
2660                     }
2661                   src--;
2662                   c = '\r';
2663                 }
2664             }
2665           else if (c == '\n'
2666                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2667             {
2668               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2669               goto label_end_of_loop;
2670             }
2671           EMIT_CHAR (c);
2672         }
2673       break;
2674
2675     case CODING_EOL_CR:
2676       while (1)
2677         {
2678           src_base = src;
2679           ONE_MORE_BYTE (c);
2680           if (c == '\n')
2681             {
2682               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2683                 {
2684                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2685                   goto label_end_of_loop;
2686                 }
2687             }
2688           else if (c == '\r')
2689             c = '\n';
2690           EMIT_CHAR (c);
2691         }
2692       break;
2693
2694     default:                    /* no need for EOL handling */
2695       while (1)
2696         {
2697           src_base = src;
2698           ONE_MORE_BYTE (c);
2699           EMIT_CHAR (c);
2700         }
2701     }
2702
2703  label_end_of_loop:
2704   coding->consumed = coding->consumed_char = src_base - source;
2705   coding->produced = dst - destination;
2706   return;
2707 }
2708
2709 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2710    format of end-of-line according to `coding->eol_type'.  It also
2711    convert multibyte form 8-bit characers to unibyte if
2712    CODING->src_multibyte is nonzero.  If `coding->mode &
2713    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
2714    also means end-of-line.  */
2715
2716 static void
2717 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2718      struct coding_system *coding;
2719      unsigned char *source, *destination;
2720      int src_bytes, dst_bytes;
2721 {
2722   unsigned char *src = source;
2723   unsigned char *dst = destination;
2724   unsigned char *src_end = src + src_bytes;
2725   unsigned char *dst_end = dst + dst_bytes;
2726   Lisp_Object translation_table;
2727   /* SRC_BASE remembers the start position in source in each loop.
2728      The loop will be exited when there's not enough source text to
2729      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2730      there's not enough destination area to produce encoded codes
2731      (within macro EMIT_BYTES).  */
2732   unsigned char *src_base;
2733   int c;
2734   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2735
2736   translation_table = Qnil;
2737   if (coding->src_multibyte
2738       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2739     {
2740       src_end--;
2741       src_bytes--;
2742       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2743     }
2744
2745   if (coding->eol_type == CODING_EOL_CRLF)
2746     {
2747       while (src < src_end)
2748         {
2749           src_base = src;
2750           c = *src++;
2751           if (c >= 0x20)
2752             EMIT_ONE_BYTE (c);
2753           else if (c == '\n' || (c == '\r' && selective_display))
2754             EMIT_TWO_BYTES ('\r', '\n');
2755           else
2756             EMIT_ONE_BYTE (c);
2757         }
2758       src_base = src;
2759     label_end_of_loop:
2760       ;
2761     }
2762   else
2763     {
2764       if (src_bytes <= dst_bytes)
2765         {
2766           safe_bcopy (src, dst, src_bytes);
2767           src_base = src_end;
2768           dst += src_bytes;
2769         }
2770       else
2771         {
2772           if (coding->src_multibyte
2773               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2774             dst_bytes--;
2775           safe_bcopy (src, dst, dst_bytes);
2776           src_base = src + dst_bytes;
2777           dst = destination + dst_bytes;
2778           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2779         }
2780       if (coding->eol_type == CODING_EOL_CR)
2781         {
2782           for (src = destination; src < dst; src++)
2783             if (*src == '\n') *src = '\r';
2784         }
2785       else if (selective_display)
2786         {
2787           for (src = destination; src < dst; src++)
2788             if (*src == '\r') *src = '\n';
2789         }
2790     }
2791   if (coding->src_multibyte)
2792     dst = destination + str_as_unibyte (destination, dst - destination);
2793
2794   coding->consumed = src_base - source;
2795   coding->produced = dst - destination;
2796 }
2797
2798 \f
2799 /*** 7. C library functions ***/
2800
2801 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2802    has a property `coding-system'.  The value of this property is a
2803    vector of length 5 (called the coding-vector).  Among elements of
2804    this vector, the first (element[0]) and the fifth (element[4])
2805    carry important information for decoding/encoding.  Before
2806    decoding/encoding, this information should be set in fields of a
2807    structure of type `coding_system'.
2808
2809    A value of property `coding-system' can be a symbol of another
2810    subsidiary coding-system.  In that case, Emacs gets coding-vector
2811    from that symbol.
2812
2813    `element[0]' contains information to be set in `coding->type'.  The
2814    value and its meaning is as follows:
2815
2816    0 -- coding_type_emacs_mule
2817    1 -- coding_type_sjis
2818    2 -- coding_type_iso2022
2819    3 -- coding_type_big5
2820    4 -- coding_type_ccl encoder/decoder written in CCL
2821    nil -- coding_type_no_conversion
2822    t -- coding_type_undecided (automatic conversion on decoding,
2823                                no-conversion on encoding)
2824
2825    `element[4]' contains information to be set in `coding->flags' and
2826    `coding->spec'.  The meaning varies by `coding->type'.
2827
2828    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2829    of length 32 (of which the first 13 sub-elements are used now).
2830    Meanings of these sub-elements are:
2831
2832    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2833         If the value is an integer of valid charset, the charset is
2834         assumed to be designated to graphic register N initially.
2835
2836         If the value is negative, it is the negated ID of a charset
2837         which reserves graphic register N, which means that the charset
2838         is not designated initially but should be designated to graphic
2839         register N just before encoding a character in that charset.
2840
2841         If the value is nil, graphic register N is never used on
2842         encoding.
2843
2844    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2845         Each value takes t or nil.  See the section ISO2022 of
2846         `coding.h' for more information.
2847
2848    If `coding->type' is `coding_type_big5', element[4] is t to denote
2849    BIG5-ETen or nil to denote BIG5-HKU.
2850
2851    If `coding->type' takes the other value, element[4] is ignored.
2852
2853    Emacs Lisp's coding system also carries information about format of
2854    end-of-line in a value of property `eol-type'.  If the value is
2855    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2856    means CODING_EOL_CR.  If it is not integer, it should be a vector
2857    of subsidiary coding systems of which property `eol-type' has one
2858    of above values.
2859
2860 */
2861
2862 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2863    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2864    is setup so that no conversion is necessary and return -1, else
2865    return 0.  */
2866
2867 int
2868 setup_coding_system (coding_system, coding)
2869      Lisp_Object coding_system;
2870      struct coding_system *coding;
2871 {
2872   Lisp_Object coding_spec, coding_type, eol_type, plist;
2873   Lisp_Object val;
2874   int i;
2875
2876   /* Initialize some fields required for all kinds of coding systems.  */
2877   coding->symbol = coding_system;
2878   coding->common_flags = 0;
2879   coding->mode = 0;
2880   coding->heading_ascii = -1;
2881   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2882   coding->composing = COMPOSITION_DISABLED;
2883   coding->cmp_data = NULL;
2884
2885   if (NILP (coding_system))
2886     goto label_invalid_coding_system;
2887
2888   coding_spec = Fget (coding_system, Qcoding_system);
2889
2890   if (!VECTORP (coding_spec)
2891       || XVECTOR (coding_spec)->size != 5
2892       || !CONSP (XVECTOR (coding_spec)->contents[3]))
2893     goto label_invalid_coding_system;
2894
2895   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
2896   if (VECTORP (eol_type))
2897     {
2898       coding->eol_type = CODING_EOL_UNDECIDED;
2899       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
2900     }
2901   else if (XFASTINT (eol_type) == 1)
2902     {
2903       coding->eol_type = CODING_EOL_CRLF;
2904       coding->common_flags
2905         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2906     }
2907   else if (XFASTINT (eol_type) == 2)
2908     {
2909       coding->eol_type = CODING_EOL_CR;
2910       coding->common_flags
2911         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2912     }
2913   else
2914     coding->eol_type = CODING_EOL_LF;
2915
2916   coding_type = XVECTOR (coding_spec)->contents[0];
2917   /* Try short cut.  */
2918   if (SYMBOLP (coding_type))
2919     {
2920       if (EQ (coding_type, Qt))
2921         {
2922           coding->type = coding_type_undecided;
2923           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
2924         }
2925       else
2926         coding->type = coding_type_no_conversion;
2927       return 0;
2928     }
2929
2930   /* Get values of coding system properties:
2931      `post-read-conversion', `pre-write-conversion',
2932      `translation-table-for-decode', `translation-table-for-encode'.  */
2933   plist = XVECTOR (coding_spec)->contents[3];
2934   /* Pre & post conversion functions should be disabled if
2935      inhibit_eol_conversion is nozero.  This is the case that a code
2936      conversion function is called while those functions are running.  */
2937   if (! inhibit_pre_post_conversion)
2938     {
2939       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
2940       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
2941     }
2942   val = Fplist_get (plist, Qtranslation_table_for_decode);
2943   if (SYMBOLP (val))
2944     val = Fget (val, Qtranslation_table_for_decode);
2945   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
2946   val = Fplist_get (plist, Qtranslation_table_for_encode);
2947   if (SYMBOLP (val))
2948     val = Fget (val, Qtranslation_table_for_encode);
2949   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
2950   val = Fplist_get (plist, Qcoding_category);
2951   if (!NILP (val))
2952     {
2953       val = Fget (val, Qcoding_category_index);
2954       if (INTEGERP (val))
2955         coding->category_idx = XINT (val);
2956       else
2957         goto label_invalid_coding_system;
2958     }
2959   else
2960     goto label_invalid_coding_system;
2961
2962   val = Fplist_get (plist, Qsafe_charsets);
2963   if (EQ (val, Qt))
2964     {
2965       for (i = 0; i <= MAX_CHARSET; i++)
2966         coding->safe_charsets[i] = 1;
2967     }
2968   else
2969     {
2970       bzero (coding->safe_charsets, MAX_CHARSET + 1);
2971       while (CONSP (val))
2972         {
2973           if ((i = get_charset_id (XCAR (val))) >= 0)
2974             coding->safe_charsets[i] = 1;
2975           val = XCDR (val);
2976         }
2977     }
2978
2979   /* If the coding system has non-nil `composition' property, enable
2980      composition handling.  */
2981   val = Fplist_get (plist, Qcomposition);
2982   if (!NILP (val))
2983     coding->composing = COMPOSITION_NO;
2984
2985   switch (XFASTINT (coding_type))
2986     {
2987     case 0:
2988       coding->type = coding_type_emacs_mule;
2989       if (!NILP (coding->post_read_conversion))
2990         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
2991       if (!NILP (coding->pre_write_conversion))
2992         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
2993       break;
2994
2995     case 1:
2996       coding->type = coding_type_sjis;
2997       coding->common_flags
2998         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
2999       break;
3000
3001     case 2:
3002       coding->type = coding_type_iso2022;
3003       coding->common_flags
3004         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3005       {
3006         Lisp_Object val, temp;
3007         Lisp_Object *flags;
3008         int i, charset, reg_bits = 0;
3009
3010         val = XVECTOR (coding_spec)->contents[4];
3011
3012         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3013           goto label_invalid_coding_system;
3014
3015         flags = XVECTOR (val)->contents;
3016         coding->flags
3017           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3018              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3019              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3020              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3021              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3022              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3023              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3024              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3025              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3026              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3027              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3028              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3029              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3030              );
3031
3032         /* Invoke graphic register 0 to plane 0.  */
3033         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3034         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3035         CODING_SPEC_ISO_INVOCATION (coding, 1)
3036           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3037         /* Not single shifting at first.  */
3038         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3039         /* Beginning of buffer should also be regarded as bol. */
3040         CODING_SPEC_ISO_BOL (coding) = 1;
3041
3042         for (charset = 0; charset <= MAX_CHARSET; charset++)
3043           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3044         val = Vcharset_revision_alist;
3045         while (CONSP (val))
3046           {
3047             charset = get_charset_id (Fcar_safe (XCAR (val)));
3048             if (charset >= 0
3049                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3050                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3051               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3052             val = XCDR (val);
3053           }
3054
3055         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3056            FLAGS[REG] can be one of below:
3057                 integer CHARSET: CHARSET occupies register I,
3058                 t: designate nothing to REG initially, but can be used
3059                   by any charsets,
3060                 list of integer, nil, or t: designate the first
3061                   element (if integer) to REG initially, the remaining
3062                   elements (if integer) is designated to REG on request,
3063                   if an element is t, REG can be used by any charsets,
3064                 nil: REG is never used.  */
3065         for (charset = 0; charset <= MAX_CHARSET; charset++)
3066           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3067             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3068         for (i = 0; i < 4; i++)
3069           {
3070             if (INTEGERP (flags[i])
3071                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3072                 || (charset = get_charset_id (flags[i])) >= 0)
3073               {
3074                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3075                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3076               }
3077             else if (EQ (flags[i], Qt))
3078               {
3079                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3080                 reg_bits |= 1 << i;
3081                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3082               }
3083             else if (CONSP (flags[i]))
3084               {
3085                 Lisp_Object tail;
3086                 tail = flags[i];
3087
3088                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3089                 if (INTEGERP (XCAR (tail))
3090                     && (charset = XINT (XCAR (tail)),
3091                         CHARSET_VALID_P (charset))
3092                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3093                   {
3094                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3095                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3096                   }
3097                 else
3098                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3099                 tail = XCDR (tail);
3100                 while (CONSP (tail))
3101                   {
3102                     if (INTEGERP (XCAR (tail))
3103                         && (charset = XINT (XCAR (tail)),
3104                             CHARSET_VALID_P (charset))
3105                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3106                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3107                         = i;
3108                     else if (EQ (XCAR (tail), Qt))
3109                       reg_bits |= 1 << i;
3110                     tail = XCDR (tail);
3111                   }
3112               }
3113             else
3114               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3115
3116             CODING_SPEC_ISO_DESIGNATION (coding, i)
3117               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3118           }
3119
3120         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3121           {
3122             /* REG 1 can be used only by locking shift in 7-bit env.  */
3123             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3124               reg_bits &= ~2;
3125             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3126               /* Without any shifting, only REG 0 and 1 can be used.  */
3127               reg_bits &= 3;
3128           }
3129
3130         if (reg_bits)
3131           for (charset = 0; charset <= MAX_CHARSET; charset++)
3132             {
3133               if (CHARSET_VALID_P (charset))
3134                 {
3135                   /* There exist some default graphic registers to be
3136                      used CHARSET.  */
3137
3138                   /* We had better avoid designating a charset of
3139                      CHARS96 to REG 0 as far as possible.  */
3140                   if (CHARSET_CHARS (charset) == 96)
3141                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3142                       = (reg_bits & 2
3143                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3144                   else
3145                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3146                       = (reg_bits & 1
3147                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3148                 }
3149             }
3150       }
3151       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3152       coding->spec.iso2022.last_invalid_designation_register = -1;
3153       break;
3154
3155     case 3:
3156       coding->type = coding_type_big5;
3157       coding->common_flags
3158         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3159       coding->flags
3160         = (NILP (XVECTOR (coding_spec)->contents[4])
3161            ? CODING_FLAG_BIG5_HKU
3162            : CODING_FLAG_BIG5_ETEN);
3163       break;
3164
3165     case 4:
3166       coding->type = coding_type_ccl;
3167       coding->common_flags
3168         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3169       {
3170         val = XVECTOR (coding_spec)->contents[4];
3171         if (! CONSP (val)
3172             || setup_ccl_program (&(coding->spec.ccl.decoder),
3173                                   XCAR (val)) < 0
3174             || setup_ccl_program (&(coding->spec.ccl.encoder),
3175                                   XCDR (val)) < 0)
3176           goto label_invalid_coding_system;
3177
3178         bzero (coding->spec.ccl.valid_codes, 256);
3179         val = Fplist_get (plist, Qvalid_codes);
3180         if (CONSP (val))
3181           {
3182             Lisp_Object this;
3183
3184             for (; CONSP (val); val = XCDR (val))
3185               {
3186                 this = XCAR (val);
3187                 if (INTEGERP (this)
3188                     && XINT (this) >= 0 && XINT (this) < 256)
3189                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3190                 else if (CONSP (this)
3191                          && INTEGERP (XCAR (this))
3192                          && INTEGERP (XCDR (this)))
3193                   {
3194                     int start = XINT (XCAR (this));
3195                     int end = XINT (XCDR (this));
3196
3197                     if (start >= 0 && start <= end && end < 256)
3198                       while (start <= end)
3199                         coding->spec.ccl.valid_codes[start++] = 1;
3200                   }
3201               }
3202           }
3203       }
3204       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3205       coding->spec.ccl.cr_carryover = 0;
3206       break;
3207
3208     case 5:
3209       coding->type = coding_type_raw_text;
3210       break;
3211
3212     default:
3213       goto label_invalid_coding_system;
3214     }
3215   return 0;
3216
3217  label_invalid_coding_system:
3218   coding->type = coding_type_no_conversion;
3219   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3220   coding->common_flags = 0;
3221   coding->eol_type = CODING_EOL_LF;
3222   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3223   return -1;
3224 }
3225
3226 /* Free memory blocks allocated for storing composition information.  */
3227
3228 void
3229 coding_free_composition_data (coding)
3230      struct coding_system *coding;
3231 {
3232   struct composition_data *cmp_data = coding->cmp_data, *next;
3233
3234   if (!cmp_data)
3235     return;
3236   /* Memory blocks are chained.  At first, rewind to the first, then,
3237      free blocks one by one.  */
3238   while (cmp_data->prev)
3239     cmp_data = cmp_data->prev;
3240   while (cmp_data)
3241     {
3242       next = cmp_data->next;
3243       xfree (cmp_data);
3244       cmp_data = next;
3245     }
3246   coding->cmp_data = NULL;
3247 }
3248
3249 /* Set `char_offset' member of all memory blocks pointed by
3250    coding->cmp_data to POS.  */
3251
3252 void
3253 coding_adjust_composition_offset (coding, pos)
3254      struct coding_system *coding;
3255      int pos;
3256 {
3257   struct composition_data *cmp_data;
3258
3259   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3260     cmp_data->char_offset = pos;
3261 }
3262
3263 /* Setup raw-text or one of its subsidiaries in the structure
3264    coding_system CODING according to the already setup value eol_type
3265    in CODING.  CODING should be setup for some coding system in
3266    advance.  */
3267
3268 void
3269 setup_raw_text_coding_system (coding)
3270      struct coding_system *coding;
3271 {
3272   if (coding->type != coding_type_raw_text)
3273     {
3274       coding->symbol = Qraw_text;
3275       coding->type = coding_type_raw_text;
3276       if (coding->eol_type != CODING_EOL_UNDECIDED)
3277         {
3278           Lisp_Object subsidiaries;
3279           subsidiaries = Fget (Qraw_text, Qeol_type);
3280
3281           if (VECTORP (subsidiaries)
3282               && XVECTOR (subsidiaries)->size == 3)
3283             coding->symbol
3284               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3285         }
3286       setup_coding_system (coding->symbol, coding);
3287     }
3288   return;
3289 }
3290
3291 /* Emacs has a mechanism to automatically detect a coding system if it
3292    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3293    it's impossible to distinguish some coding systems accurately
3294    because they use the same range of codes.  So, at first, coding
3295    systems are categorized into the following categories:
3296
3297    o coding-category-emacs-mule
3298
3299         The category for a coding system which has the same code range
3300         as Emacs' internal format.  Assigned the coding-system (Lisp
3301         symbol) `emacs-mule' by default.
3302
3303    o coding-category-sjis
3304
3305         The category for a coding system which has the same code range
3306         as SJIS.  Assigned the coding-system (Lisp
3307         symbol) `japanese-shift-jis' by default.
3308
3309    o coding-category-iso-7
3310
3311         The category for a coding system which has the same code range
3312         as ISO2022 of 7-bit environment.  This doesn't use any locking
3313         shift and single shift functions.  This can encode/decode all
3314         charsets.  Assigned the coding-system (Lisp symbol)
3315         `iso-2022-7bit' by default.
3316
3317    o coding-category-iso-7-tight
3318
3319         Same as coding-category-iso-7 except that this can
3320         encode/decode only the specified charsets.
3321
3322    o coding-category-iso-8-1
3323
3324         The category for a coding system which has the same code range
3325         as ISO2022 of 8-bit environment and graphic plane 1 used only
3326         for DIMENSION1 charset.  This doesn't use any locking shift
3327         and single shift functions.  Assigned the coding-system (Lisp
3328         symbol) `iso-latin-1' by default.
3329
3330    o coding-category-iso-8-2
3331
3332         The category for a coding system which has the same code range
3333         as ISO2022 of 8-bit environment and graphic plane 1 used only
3334         for DIMENSION2 charset.  This doesn't use any locking shift
3335         and single shift functions.  Assigned the coding-system (Lisp
3336         symbol) `japanese-iso-8bit' by default.
3337
3338    o coding-category-iso-7-else
3339
3340         The category for a coding system which has the same code range
3341         as ISO2022 of 7-bit environment but uses locking shift or
3342         single shift functions.  Assigned the coding-system (Lisp
3343         symbol) `iso-2022-7bit-lock' by default.
3344
3345    o coding-category-iso-8-else
3346
3347         The category for a coding system which has the same code range
3348         as ISO2022 of 8-bit environment but uses locking shift or
3349         single shift functions.  Assigned the coding-system (Lisp
3350         symbol) `iso-2022-8bit-ss2' by default.
3351
3352    o coding-category-big5
3353
3354         The category for a coding system which has the same code range
3355         as BIG5.  Assigned the coding-system (Lisp symbol)
3356         `cn-big5' by default.
3357
3358    o coding-category-utf-8
3359
3360         The category for a coding system which has the same code range
3361         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3362         symbol) `utf-8' by default.
3363
3364    o coding-category-utf-16-be
3365
3366         The category for a coding system in which a text has an
3367         Unicode signature (cf. Unicode Standard) in the order of BIG
3368         endian at the head.  Assigned the coding-system (Lisp symbol)
3369         `utf-16-be' by default.
3370
3371    o coding-category-utf-16-le
3372
3373         The category for a coding system in which a text has an
3374         Unicode signature (cf. Unicode Standard) in the order of
3375         LITTLE endian at the head.  Assigned the coding-system (Lisp
3376         symbol) `utf-16-le' by default.
3377
3378    o coding-category-ccl
3379
3380         The category for a coding system of which encoder/decoder is
3381         written in CCL programs.  The default value is nil, i.e., no
3382         coding system is assigned.
3383
3384    o coding-category-binary
3385
3386         The category for a coding system not categorized in any of the
3387         above.  Assigned the coding-system (Lisp symbol)
3388         `no-conversion' by default.
3389
3390    Each of them is a Lisp symbol and the value is an actual
3391    `coding-system's (this is also a Lisp symbol) assigned by a user.
3392    What Emacs does actually is to detect a category of coding system.
3393    Then, it uses a `coding-system' assigned to it.  If Emacs can't
3394    decide only one possible category, it selects a category of the
3395    highest priority.  Priorities of categories are also specified by a
3396    user in a Lisp variable `coding-category-list'.
3397
3398 */
3399
/* Table indexed by byte value.  detect_coding_mask skips the leading
   bytes of a text whose entry here is nonzero before it starts
   examining how the text is encoded.  */
static int ascii_skip_code[256];
3402
3403 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3404    If it detects possible coding systems, return an integer in which
3405    appropriate flag bits are set.  Flag bits are defined by macros
3406    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3407    it should point the table `coding_priorities'.  In that case, only
3408    the flag bit for a coding system of the highest priority is set in
3409    the returned value.
3410
3411    How many ASCII characters are at the head is returned as *SKIP.  */
3412
3413 static int
3414 detect_coding_mask (source, src_bytes, priorities, skip)
3415      unsigned char *source;
3416      int src_bytes, *priorities, *skip;
3417 {
3418   register unsigned char c;
3419   unsigned char *src = source, *src_end = source + src_bytes;
3420   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3421   int i, idx;
3422
3423   /* At first, skip all ASCII characters and control characters except
3424      for three ISO2022 specific control characters.  */
3425   ascii_skip_code[ISO_CODE_SO] = 0;
3426   ascii_skip_code[ISO_CODE_SI] = 0;
3427   ascii_skip_code[ISO_CODE_ESC] = 0;
3428
3429  label_loop_detect_coding:
3430   while (src < src_end && ascii_skip_code[*src]) src++;
3431   *skip = src - source;
3432
3433   if (src >= src_end)
3434     /* We found nothing other than ASCII.  There's nothing to do.  */
3435     return 0;
3436
3437   c = *src;
3438   /* The text seems to be encoded in some multilingual coding system.
3439      Now, try to find in which coding system the text is encoded.  */
3440   if (c < 0x80)
3441     {
3442       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3443       /* C is an ISO2022 specific control code of C0.  */
3444       mask = detect_coding_iso2022 (src, src_end);
3445       if (mask == 0)
3446         {
3447           /* No valid ISO2022 code follows C.  Try again.  */
3448           src++;
3449           if (c == ISO_CODE_ESC)
3450             ascii_skip_code[ISO_CODE_ESC] = 1;
3451           else
3452             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3453           goto label_loop_detect_coding;
3454         }
3455       if (priorities)
3456         {
3457           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3458             {
3459               if (mask & priorities[i])
3460                 return priorities[i];
3461             }
3462           return CODING_CATEGORY_MASK_RAW_TEXT;
3463         }
3464     }
3465   else
3466     {
3467       int try;
3468
3469       if (c < 0xA0)
3470         {
3471           /* C is the first byte of SJIS character code,
3472              or a leading-code of Emacs' internal format (emacs-mule),
3473              or the first byte of UTF-16.  */
3474           try = (CODING_CATEGORY_MASK_SJIS
3475                   | CODING_CATEGORY_MASK_EMACS_MULE
3476                   | CODING_CATEGORY_MASK_UTF_16_BE
3477                   | CODING_CATEGORY_MASK_UTF_16_LE);
3478
3479           /* Or, if C is a special latin extra code,
3480              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3481              or is an ISO2022 control-sequence-introducer (CSI),
3482              we should also consider the possibility of ISO2022 codings.  */
3483           if ((VECTORP (Vlatin_extra_code_table)
3484                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3485               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3486               || (c == ISO_CODE_CSI
3487                   && (src < src_end
3488                       && (*src == ']'
3489                           || ((*src == '0' || *src == '1' || *src == '2')
3490                               && src + 1 < src_end
3491                               && src[1] == ']')))))
3492             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3493                      | CODING_CATEGORY_MASK_ISO_8BIT);
3494         }
3495       else
3496         /* C is a character of ISO2022 in graphic plane right,
3497            or a SJIS's 1-byte character code (i.e. JISX0201),
3498            or the first byte of BIG5's 2-byte code,
3499            or the first byte of UTF-8/16.  */
3500         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3501                 | CODING_CATEGORY_MASK_ISO_8BIT
3502                 | CODING_CATEGORY_MASK_SJIS
3503                 | CODING_CATEGORY_MASK_BIG5
3504                 | CODING_CATEGORY_MASK_UTF_8
3505                 | CODING_CATEGORY_MASK_UTF_16_BE
3506                 | CODING_CATEGORY_MASK_UTF_16_LE);
3507
3508       /* Or, we may have to consider the possibility of CCL.  */
3509       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3510           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3511               ->spec.ccl.valid_codes)[c])
3512         try |= CODING_CATEGORY_MASK_CCL;
3513
3514       mask = 0;
3515       utf16_examined_p = iso2022_examined_p = 0;
3516       if (priorities)
3517         {
3518           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3519             {
3520               if (!iso2022_examined_p
3521                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3522                 {
3523                   mask |= detect_coding_iso2022 (src, src_end);
3524                   iso2022_examined_p = 1;
3525                 }
3526               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3527                 mask |= detect_coding_sjis (src, src_end);
3528               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3529                 mask |= detect_coding_utf_8 (src, src_end);
3530               else if (!utf16_examined_p
3531                        && (priorities[i] & try &
3532                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3533                 {
3534                   mask |= detect_coding_utf_16 (src, src_end);
3535                   utf16_examined_p = 1;
3536                 }
3537               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3538                 mask |= detect_coding_big5 (src, src_end);
3539               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3540                 mask |= detect_coding_emacs_mule (src, src_end);
3541               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3542                 mask |= detect_coding_ccl (src, src_end);
3543               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3544                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3545               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3546                 mask |= CODING_CATEGORY_MASK_BINARY;
3547               if (mask & priorities[i])
3548                 return priorities[i];
3549             }
3550           return CODING_CATEGORY_MASK_RAW_TEXT;
3551         }
3552       if (try & CODING_CATEGORY_MASK_ISO)
3553         mask |= detect_coding_iso2022 (src, src_end);
3554       if (try & CODING_CATEGORY_MASK_SJIS)
3555         mask |= detect_coding_sjis (src, src_end);
3556       if (try & CODING_CATEGORY_MASK_BIG5)
3557         mask |= detect_coding_big5 (src, src_end);
3558       if (try & CODING_CATEGORY_MASK_UTF_8)
3559         mask |= detect_coding_utf_8 (src, src_end);
3560       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3561         mask |= detect_coding_utf_16 (src, src_end);
3562       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3563         mask |= detect_coding_emacs_mule (src, src_end);
3564       if (try & CODING_CATEGORY_MASK_CCL)
3565         mask |= detect_coding_ccl (src, src_end);
3566     }
3567   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3568 }
3569
3570 /* Detect how the SRC_BYTES bytes of text at SRC are encoded, and set
3571    up CODING for the detected system (multibyte slots are preserved).  */
3572
3573 void
3574 detect_coding (coding, src, src_bytes)
3575      struct coding_system *coding;
3576      unsigned char *src;
3577      int src_bytes;
3578 {
3579   unsigned int idx;
3580   int skip, mask;
3581   Lisp_Object val;
3582
3583   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip);
3584   coding->heading_ascii = skip;
3585
3586   if (!mask) return;
3587
3588   /* Take the category of the lowest bit set in MASK.  */
3589   for (idx = 0; mask && ! (mask & 1); idx++)
3590     mask >>= 1;
3591   if (! mask)
3592     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3593
3594   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3595
3596   if (coding->eol_type != CODING_EOL_UNDECIDED)
3597     {
3598       /* The EOL format is already decided: when VAL has a vector of
3599          EOL variants, use the one for that format.  */
3600       Lisp_Object tmp = Fget (val, Qeol_type);
3601
3602       if (VECTORP (tmp))
3603         val = XVECTOR (tmp)->contents[coding->eol_type];
3604     }
3605
3606   /* Setup this new coding system while preserving some slots.  */
3607   {
3608     int src_multibyte = coding->src_multibyte;
3609     int dst_multibyte = coding->dst_multibyte;
3610
3611     setup_coding_system (val, coding);
3612     coding->src_multibyte = src_multibyte;
3613     coding->dst_multibyte = dst_multibyte;
3614     coding->heading_ascii = skip;
3615   }
3616 }
3617
3618 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3619    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3620    CODING_EOL_CR, CODING_EOL_INCONSISTENT, and CODING_EOL_UNDECIDED.
3621    At most MAX_EOL_CHECK_COUNT end-of-lines are examined.
3622    How many non-eol characters are at the head is returned as *SKIP.  */
3623
3624 #define MAX_EOL_CHECK_COUNT 3
3625
3626 static int
3627 detect_eol_type (source, src_bytes, skip)
3628      unsigned char *source;
3629      int src_bytes, *skip;
3630 {
3631   unsigned char *src = source, *src_end = src + src_bytes;
3632   unsigned char c;
3633   int total = 0;                /* How many end-of-lines are found so far.  */
3634   int eol_type = CODING_EOL_UNDECIDED;
3635   int this_eol_type;
3636
3637   /* TOTAL, not a zero *SKIP, tells whether an end-of-line was already
3638      seen; a text that begins with an end-of-line must give *SKIP 0.  */
3639   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3640     {
3641       c = *src++;
3642       if (c == '\n' || c == '\r')
3643         {
3644           if (total == 0)
3645             *skip = src - 1 - source;
3646           total++;
3647           if (c == '\n')
3648             this_eol_type = CODING_EOL_LF;
3649           else if (src >= src_end || *src != '\n')
3650             this_eol_type = CODING_EOL_CR;
3651           else
3652             this_eol_type = CODING_EOL_CRLF, src++;
3653
3654           if (eol_type == CODING_EOL_UNDECIDED)
3655             /* This is the first end-of-line.  */
3656             eol_type = this_eol_type;
3657           else if (eol_type != this_eol_type)
3658             {
3659               /* The found type is different from what found before.  */
3660               eol_type = CODING_EOL_INCONSISTENT;
3661               break;
3662             }
3663         }
3664     }
3665   /* No end-of-line at all: the whole text is the non-eol head.  */
3666   if (total == 0)
3667     *skip = src_end - source;
3668   return eol_type;
3669 }
3670
3671 /* Like detect_eol_type, but for text in 2-octet form (utf-16-be if
3672    BIG_ENDIAN_P is nonzero, else utf-16-le).  *SKIP is a byte count;
3673    a trailing odd octet is never examined.  */
3674
3675 static int
3676 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3677      unsigned char *source;
3678      int src_bytes, *skip, big_endian_p;
3679 {
3680   unsigned char *src = source, *src_end = src + src_bytes;
3681   unsigned int c1, c2;
3682   int total = 0;                /* How many end-of-lines are found so far.  */
3683   int eol_type = CODING_EOL_UNDECIDED;
3684   int this_eol_type;
3685   int msb, lsb;                 /* Offsets of the high and low octets.  */
3686
3687   if (big_endian_p)
3688     msb = 0, lsb = 1;
3689   else
3690     msb = 1, lsb = 0;
3691
3692   /* TOTAL, not a zero *SKIP, tells whether an end-of-line was already
3693      seen; a text that begins with an end-of-line must give *SKIP 0.  */
3694   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3695     {
3696       c1 = (src[msb] << 8) | (src[lsb]);
3697       src += 2;
3698
3699       if (c1 == '\n' || c1 == '\r')
3700         {
3701           if (total == 0)
3702             *skip = src - 2 - source;
3703           total++;
3704           if (c1 == '\n')
3705             {
3706               this_eol_type = CODING_EOL_LF;
3707             }
3708           else
3709             {
3710               if ((src + 1) >= src_end)
3711                 {
3712                   this_eol_type = CODING_EOL_CR;
3713                 }
3714               else
3715                 {
3716                   c2 = (src[msb] << 8) | (src[lsb]);
3717                   if (c2 == '\n')
3718                     this_eol_type = CODING_EOL_CRLF, src += 2;
3719                   else
3720                     this_eol_type = CODING_EOL_CR;
3721                 }
3722             }
3723
3724           if (eol_type == CODING_EOL_UNDECIDED)
3725             /* This is the first end-of-line.  */
3726             eol_type = this_eol_type;
3727           else if (eol_type != this_eol_type)
3728             {
3729               /* The found type is different from what found before.  */
3730               eol_type = CODING_EOL_INCONSISTENT;
3731               break;
3732             }
3733         }
3734     }
3735   /* No end-of-line at all: the whole text is the non-eol head.  */
3736   if (total == 0)
3737     *skip = src_end - source;
3738   return eol_type;
3739 }
3740
3741 /* Determine the end-of-line format of the SRC_BYTES bytes of text at
3742    SRC.  When a format is found and CODING has a three-element vector
3743    of EOL variants, set up CODING for the matching variant.  */
3744
3745 void
3746 detect_eol (coding, src, src_bytes)
3747      struct coding_system *coding;
3748      unsigned char *src;
3749      int src_bytes;
3750 {
3751   int category = coding->category_idx;
3752   int found, skip;
3753   Lisp_Object variants;
3754
3755   /* UTF-16 text is scanned two octets at a time; the last argument
3756      is nonzero for the big-endian form.  */
3757   if (category == CODING_CATEGORY_IDX_UTF_16_BE
3758       || category == CODING_CATEGORY_IDX_UTF_16_LE)
3759     found = detect_eol_type_in_2_octet_form
3760       (src, src_bytes, &skip, category == CODING_CATEGORY_IDX_UTF_16_BE);
3761   else
3762     found = detect_eol_type (src, src_bytes, &skip);
3763
3764   /* Make both SKIP and CODING->heading_ascii the smaller of the two
3765      values.  */
3766   if (skip > coding->heading_ascii)
3767     skip = coding->heading_ascii;
3768   coding->heading_ascii = skip;
3769
3770   if (found == CODING_EOL_UNDECIDED)
3771     /* No end-of-line was seen; leave CODING as it is.  */
3772     return;
3773
3774   if (found == CODING_EOL_INCONSISTENT)
3775     {
3776 #if 0
3777       /* This code is suppressed until we find a better way to
3778          distinguish raw text file and binary file.  */
3779
3780       /* If we have already detected that the coding is raw-text, the
3781          coding should actually be no-conversion.  */
3782       if (coding->type == coding_type_raw_text)
3783         {
3784           setup_coding_system (Qno_conversion, coding);
3785           return;
3786         }
3787       /* Else, let's decode only text code anyway.  */
3788 #endif /* 0 */
3789       found = CODING_EOL_LF;
3790     }
3791
3792   variants = Fget (coding->symbol, Qeol_type);
3793   if (VECTORP (variants) && XVECTOR (variants)->size == 3)
3794     {
3795       /* These slots are put back after setup_coding_system.  */
3796       int src_multibyte = coding->src_multibyte;
3797       int dst_multibyte = coding->dst_multibyte;
3798
3799       setup_coding_system (XVECTOR (variants)->contents[found], coding);
3800       coding->src_multibyte = src_multibyte;
3801       coding->dst_multibyte = dst_multibyte;
3802       coding->heading_ascii = skip;
3803     }
3804 }
3805
3806 #define CONVERSION_BUFFER_EXTRA_ROOM 256 /* Slack added to size estimates.  */
3807 /* Per-source-byte growth factor assumed when decoding with CODING.  */
3808 #define DECODING_BUFFER_MAG(coding)                     \
3809   ((coding)->type == coding_type_iso2022                \
3810    ? 3                                                  \
3811    : ((coding)->type == coding_type_ccl                 \
3812       ? (coding)->spec.ccl.decoder.buf_magnification    \
3813       : 2))
3814
3815 /* Return an upper bound, in bytes, used as the size of a buffer for
3816    decoding SRC_BYTES bytes of text encoded in CODING.  */
3817
3818 int
3819 decoding_buffer_size (coding, src_bytes)
3820      struct coding_system *coding;
3821      int src_bytes;
3822 {
3823   return (CONVERSION_BUFFER_EXTRA_ROOM
3824           + DECODING_BUFFER_MAG (coding) * src_bytes);
3825 }
3826
3827 /* Estimate the largest number of bytes that encoding SRC_BYTES bytes
3828    of text to CODING may produce, extra room included.  */
3829
3830 int
3831 encoding_buffer_size (coding, src_bytes)
3832      struct coding_system *coding;
3833      int src_bytes;
3834 {
3835   /* Growth factor per source byte: 1 unless CODING says otherwise.  */
3836   int factor = 1;
3837
3838   if (coding->type == coding_type_ccl)
3839     /* The CCL encoder carries its own magnification.  */
3840     factor = coding->spec.ccl.encoder.buf_magnification;
3841   else if (CODING_REQUIRE_ENCODING (coding))
3842     factor = 3;
3843
3844   return (CONVERSION_BUFFER_EXTRA_ROOM + factor * src_bytes);
3845 }
3846
3847 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
3848 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
3849 #endif
3850 /* Buffer handed out by get_conversion_buffer, and its size in bytes.  */
3851 char *conversion_buffer;
3852 int conversion_buffer_size;
3853
3854 /* Return a pointer to a shared buffer of at least SIZE bytes for
3855    encoding or decoding.  It grows by doubling; old contents are lost.  */
3856
3857 char *
3858 get_conversion_buffer (size)
3859      int size;
3860 {
3861   if (size > conversion_buffer_size)
3862     {
3863       char *buf;
3864       int real_size = conversion_buffer_size * 2;
3865       /* Doubling zero would never reach SIZE; start from the minimum.  */
3866       if (real_size <= 0) real_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3867       while (real_size < size) real_size *= 2;
3868       buf = (char *) xmalloc (real_size);      /* Result is not checked.  */
3869       xfree (conversion_buffer);
3870       conversion_buffer = buf;
3871       conversion_buffer_size = real_size;
3872     }
3873   return conversion_buffer;
3874 }
3875
3876 int
3877 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
3878      struct coding_system *coding;
3879      unsigned char *source, *destination;
3880      int src_bytes, dst_bytes, encodep;
3881 {
3882   struct ccl_program *ccl
3883     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
3884   /* Run the CCL encoder (ENCODEP nonzero) or decoder.  The finish code
3885      is returned and also stored in CODING->result for the callers.  */
3886   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
3887   if (encodep)
3888     ccl->eol_type = coding->eol_type;
3889   coding->produced = ccl_driver (ccl, source, destination,
3890                                  src_bytes, dst_bytes, &(coding->consumed));
3891   if (encodep)
3892     coding->produced_char = coding->produced;
3893   else
3894     {
3895       int bytes
3896         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
3897       coding->produced = str_as_multibyte (destination, bytes,
3898                                            coding->produced,
3899                                            &(coding->produced_char));
3900     }
3901
3902   switch (ccl->status)
3903     {
3904     case CCL_STAT_SUSPEND_BY_SRC:
3905       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3906       break;
3907     case CCL_STAT_SUSPEND_BY_DST:
3908       coding->result = CODING_FINISH_INSUFFICIENT_DST;
3909       break;
3910     case CCL_STAT_QUIT:
3911     case CCL_STAT_INVALID_CMD:
3912       coding->result = CODING_FINISH_INTERRUPT;
3913       break;
3914     default:
3915       coding->result = CODING_FINISH_NORMAL;
3916       break;
3917     }
3918   return coding->result;
3919 }
3920
3921 /* Decode EOL format of the text at PTR of BYTES length destructively
3922    according to CODING->eol_type.  This is called after the CCL
3923    program produced a decoded text at PTR.  If we do CRLF->LF
3924    conversion, update CODING->produced and CODING->produced_char.  */
3925
3926 static void
3927 decode_eol_post_ccl (coding, ptr, bytes)
3928      struct coding_system *coding;
3929      unsigned char *ptr;
3930      int bytes;
3931 {
3932   Lisp_Object val, saved_coding_symbol;
3933   unsigned char *pend = ptr + bytes;
3934   int dummy;
3935
3936   /* Remember the current coding system symbol.  We set it back when
3937      an inconsistent EOL is found so that `last-coding-system-used' is
3938      set to the coding system that doesn't specify EOL conversion.  */
3939   saved_coding_symbol = coding->symbol;
3940
3941   coding->spec.ccl.cr_carryover = 0;
3942   if (coding->eol_type == CODING_EOL_UNDECIDED)
3943     {
3944       /* Call detect_eol_type directly to avoid setup_coding_system.  */
3945       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
3946       val = Fget (coding->symbol, Qeol_type);
3947       /* Only LF, CRLF and CR index the vector of EOL variants.  */
3948       if (coding->eol_type == CODING_EOL_INCONSISTENT)
3949         coding->eol_type = CODING_EOL_LF;
3950       else if (coding->eol_type != CODING_EOL_UNDECIDED
3951                && VECTORP (val) && XVECTOR (val)->size == 3)
3952         coding->symbol = XVECTOR (val)->contents[coding->eol_type];
3953       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
3954     }
3955
3956   if (coding->eol_type == CODING_EOL_LF)
3957     ptr = pend;                 /* We have nothing to do.  */
3958   else if (coding->eol_type == CODING_EOL_CRLF)
3959     {
3960       unsigned char *pstart = ptr, *p = ptr;
3961
3962       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
3963           && ptr < pend && *(pend - 1) == '\r')
3964         {
3965           /* If the last character is CR, we can't handle it here
3966              because LF will be in the not-yet-decoded source text.
3967              Recorded that the CR is not yet processed.  */
3968           coding->spec.ccl.cr_carryover = 1;
3969           coding->produced--;
3970           coding->produced_char--;
3971           pend--;
3972         }
3973       while (ptr < pend)
3974         {
3975           if (*ptr == '\r')
3976             {
3977               if (ptr + 1 < pend && *(ptr + 1) == '\n')
3978                 {
3979                   *p++ = '\n';
3980                   ptr += 2;
3981                 }
3982               else
3983                 {
3984                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3985                     goto undo_eol_conversion;
3986                   *p++ = *ptr++;
3987                 }
3988             }
3989           else if (*ptr == '\n'
3990                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3991             goto undo_eol_conversion;
3992           else
3993             *p++ = *ptr++;
3994           continue;
3995
3996         undo_eol_conversion:
3997           /* We have faced with inconsistent EOL format at PTR.
3998              Convert all LFs before PTR back to CRLFs.  */
3999           for (p--, ptr--; p >= pstart; p--)
4000             {
4001               if (*p == '\n')
4002                 *ptr-- = '\n', *ptr-- = '\r';
4003               else
4004                 *ptr-- = *p;
4005             }
4006           /*  If carryover is recorded, cancel it because we don't
4007               convert CRLF anymore.  */
4008           if (coding->spec.ccl.cr_carryover)
4009             {
4010               coding->spec.ccl.cr_carryover = 0;
4011               coding->produced++;
4012               coding->produced_char++;
4013               pend++;
4014             }
4015           p = ptr = pend;
4016           coding->eol_type = CODING_EOL_LF;
4017           coding->symbol = saved_coding_symbol;
4018         }
4019       if (p < pend)
4020         {
4021           /* As each two-byte sequence CRLF was converted to LF, (PEND
4022              - P) is the number of deleted characters.  */
4023           coding->produced -= pend - p;
4024           coding->produced_char -= pend - p;
4025         }
4026     }
4027   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4028     {
4029       unsigned char *p = ptr;
4030
4031       for (; ptr < pend; ptr++)
4032         {
4033           if (*ptr == '\r')
4034             *ptr = '\n';
4035           else if (*ptr == '\n'
4036                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4037             {
4038               for (; p < ptr; p++)
4039                 {
4040                   if (*p == '\n')
4041                     *p = '\r';
4042                 }
4043               ptr = pend;
4044               coding->eol_type = CODING_EOL_LF;
4045               coding->symbol = saved_coding_symbol;
4046             }
4047         }
4048     }
4049 }
4050
4051 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4052    decoding, it may detect coding system and format of end-of-line if
4053    those are not yet decided.  The source should be unibyte, the
4054    result is multibyte if CODING->dst_multibyte is nonzero, else
4055    unibyte.  The value returned is CODING->result.  */
4056
4057 int
4058 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4059      struct coding_system *coding;
4060      unsigned char *source, *destination;
4061      int src_bytes, dst_bytes;
4062 {
4063   if (coding->type == coding_type_undecided)
4064     detect_coding (coding, source, src_bytes);
4065
4066   if (coding->eol_type == CODING_EOL_UNDECIDED
4067       && coding->type != coding_type_ccl)
4068     detect_eol (coding, source, src_bytes);
4069
4070   coding->produced = coding->produced_char = 0;
4071   coding->consumed = coding->consumed_char = 0;
4072   coding->errors = 0;
4073   coding->result = CODING_FINISH_NORMAL;
4074
4075   switch (coding->type)
4076     {
4077     case coding_type_sjis:
4078       decode_coding_sjis_big5 (coding, source, destination,
4079                                src_bytes, dst_bytes, 1);
4080       break;
4081
4082     case coding_type_iso2022:
4083       decode_coding_iso2022 (coding, source, destination,
4084                              src_bytes, dst_bytes);
4085       break;
4086
4087     case coding_type_big5:
4088       decode_coding_sjis_big5 (coding, source, destination,
4089                                src_bytes, dst_bytes, 0);
4090       break;
4091
4092     case coding_type_emacs_mule:
4093       decode_coding_emacs_mule (coding, source, destination,
4094                                 src_bytes, dst_bytes);
4095       break;
4096
4097     case coding_type_ccl:
4098       {
4099         /* EXTRA is nonzero if the previous decode_eol_post_ccl call
4100            left a CR unprocessed; put that CR back in DESTINATION.  */
4101         int extra = coding->spec.ccl.cr_carryover;
4102         if (extra)
4103           *destination = '\r', dst_bytes--;
4104         ccl_coding_driver (coding, source, destination + extra,
4105                            src_bytes, dst_bytes, 0);
4106         /* The driver assigns the counters anew; count the CR only now.  */
4107         coding->produced += extra;
4108         coding->produced_char += extra;
4109         if (coding->eol_type != CODING_EOL_LF)
4110           decode_eol_post_ccl (coding, destination, coding->produced);
4111       }
4112       break;
4113
4114     default:
4115       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4116     }
4117   /* Having consumed the whole source is not a shortage of source.  */
4118   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4119       && coding->consumed == src_bytes)
4120     coding->result = CODING_FINISH_NORMAL;
4121   /* On the last block, pass the undecodable rest through as is.  */
4122   if (coding->mode & CODING_MODE_LAST_BLOCK
4123       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4124     {
4125       unsigned char *src = source + coding->consumed;
4126       unsigned char *dst = destination + coding->produced;
4127
4128       src_bytes -= coding->consumed;
4129       coding->errors++;
4130       if (COMPOSING_P (coding))
4131         DECODE_COMPOSITION_END ('1');
4132       while (src_bytes--)
4133         {
4134           int c = *src++;
4135           dst += CHAR_STRING (c, dst);
4136           coding->produced_char++;
4137         }
4138       coding->consumed = coding->consumed_char = src - source;
4139       coding->produced = dst - destination;
4140     }
4141
4142   if (!coding->dst_multibyte)
4143     {
4144       coding->produced = str_as_unibyte (destination, coding->produced);
4145       coding->produced_char = coding->produced;
4146     }
4147
4148   return coding->result;
4149 }
4150
4151 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4152    multibyteness of the source is CODING->src_multibyte, the
4153    multibyteness of the result is always unibyte.  */
4154 /* The value returned is CODING->result.  */
4155 int
4156 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4157      struct coding_system *coding;
4158      unsigned char *source, *destination;
4159      int src_bytes, dst_bytes;
4160 {
4161   coding->produced = coding->produced_char = 0;
4162   coding->consumed = coding->consumed_char = 0;
4163   coding->errors = 0;
4164   coding->result = CODING_FINISH_NORMAL;
4165   /* Dispatch on the type of CODING.  */
4166   switch (coding->type)
4167     {
4168     case coding_type_sjis:
4169       encode_coding_sjis_big5 (coding, source, destination,
4170                                src_bytes, dst_bytes, 1);
4171       break;
4172
4173     case coding_type_iso2022:
4174       encode_coding_iso2022 (coding, source, destination,
4175                              src_bytes, dst_bytes);
4176       break;
4177
4178     case coding_type_big5:
4179       encode_coding_sjis_big5 (coding, source, destination,
4180                                src_bytes, dst_bytes, 0);
4181       break;
4182
4183     case coding_type_emacs_mule:
4184       encode_coding_emacs_mule (coding, source, destination,
4185                                 src_bytes, dst_bytes);
4186       break;
4187       /* NOTE(review): the value ccl_coding_driver returns is unused.  */
4188     case coding_type_ccl:
4189       ccl_coding_driver (coding, source, destination,
4190                          src_bytes, dst_bytes, 1);
4191       break;
4192
4193     default:
4194       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4195     }
4196   /* Having consumed the whole source is not a shortage of source.  */
4197   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4198       && coding->consumed == src_bytes)
4199     coding->result = CODING_FINISH_NORMAL;
4200   /* On the last block, finish the output and copy any leftover.  */
4201   if (coding->mode & CODING_MODE_LAST_BLOCK)
4202     {
4203       unsigned char *src = source + coding->consumed;
4204       unsigned char *src_end = src + src_bytes;
4205       unsigned char *dst = destination + coding->produced;
4206       /* NOTE(review): SRC_END is SOURCE + SRC_BYTES + CODING->consumed.  */
4207       if (coding->type == coding_type_iso2022)
4208         ENCODE_RESET_PLANE_AND_REGISTER;
4209       if (COMPOSING_P (coding))
4210         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4211       if (coding->consumed < src_bytes)
4212         {
4213           int len = src_bytes - coding->consumed;
4214           /* Copy the unconsumed tail, as unibyte if the source is not.  */
4215           BCOPY_SHORT (source + coding->consumed, dst, len);
4216           if (coding->src_multibyte)
4217             len = str_as_unibyte (dst, len);
4218           dst += len;
4219           coding->consumed = src_bytes;
4220         }
4221       coding->produced = coding->produced_char = dst - destination;
4222     }
4223
4224   return coding->result;
4225 }
4226
4227 /* Scan text in the region between *BEG and *END (byte positions),
4228    skip characters which we don't have to decode by coding system
4229    CODING at the head and tail, then set *BEG and *END to the region
4230    of the text we actually have to convert.  The caller should move
4231    the gap out of the region in advance if the region is from a
4232    buffer.
4233
4234    If STR is not NULL, *BEG and *END are indices into STR.  */
4235
4236 static void
4237 shrink_decoding_region (beg, end, coding, str)
4238      int *beg, *end;
4239      struct coding_system *coding;
4240      unsigned char *str;
4241 {
4242   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4243   int eol_conversion;
4244   Lisp_Object translation_table;
4245
4246   if (coding->type == coding_type_ccl
4247       || coding->type == coding_type_undecided
4248       || coding->eol_type != CODING_EOL_LF
4249       || !NILP (coding->post_read_conversion)
4250       || coding->composing != COMPOSITION_DISABLED)
4251     {
4252       /* We can't skip any data.  */
4253       return;
4254     }
4255   if (coding->type == coding_type_no_conversion
4256       || coding->type == coding_type_raw_text
4257       || coding->type == coding_type_emacs_mule)
4258     {
4259       /* We need no conversion, but don't have to skip any data here.
4260          Decoding routine handles them effectively anyway.  */
4261       return;
4262     }
4263
4264   translation_table = coding->translation_table_for_decode;
4265   if (NILP (translation_table) && !NILP (Venable_character_translation))
4266     translation_table = Vstandard_translation_table_for_decode;
4267   if (CHAR_TABLE_P (translation_table))
4268     {
4269       int i;
4270       for (i = 0; i < 128; i++)
4271         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4272           break;
4273       if (i < 128)
4274         /* Some ASCII character should be translated.  We give up
4275            shrinking.  */
4276         return;
4277     }
4278
4279   if (coding->heading_ascii >= 0)
4280     /* Detection routine has already found how much we can skip at the
4281        head.  */
4282     *beg += coding->heading_ascii;
4283
4284   if (str)
4285     {
4286       begp_orig = begp = str + *beg;
4287       endp_orig = endp = str + *end;
4288     }
4289   else
4290     {
4291       begp_orig = begp = BYTE_POS_ADDR (*beg);
4292       endp_orig = endp = begp + *end - *beg;
4293     }
4294   /* Zero here, since a non-LF eol_type made us return above.  */
4295   eol_conversion = (coding->eol_type == CODING_EOL_CR
4296                     || coding->eol_type == CODING_EOL_CRLF);
4297
4298   switch (coding->type)
4299     {
4300     case coding_type_sjis:
4301     case coding_type_big5:
4302       /* We can skip all ASCII characters at the head.  */
4303       if (coding->heading_ascii < 0)
4304         {
4305           if (eol_conversion)
4306             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4307           else
4308             while (begp < endp && *begp < 0x80) begp++;
4309         }
4310       /* We can skip all ASCII characters at the tail except for the
4311          second byte of SJIS or BIG5 code.  */
4312       if (eol_conversion)
4313         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4314       else
4315         while (begp < endp && endp[-1] < 0x80) endp--;
4316       /* Do not consider LF as ascii if preceded by CR, since that
4317          confuses eol decoding. */
4318       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4319         endp++;
4320       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4321         endp++;
4322       break;
4323
4324     case coding_type_iso2022:
4325       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4326         /* We can't skip any data.  */
4327         break;
4328       if (coding->heading_ascii < 0)
4329         {
4330           /* We can skip all ASCII characters at the head except for a
4331              few control codes.  */
4332           while (begp < endp && (c = *begp) < 0x80
4333                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4334                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4335                  && (!eol_conversion || c != ISO_CODE_LF))
4336             begp++;
4337         }
4338       switch (coding->category_idx)
4339         {
4340         case CODING_CATEGORY_IDX_ISO_8_1:
4341         case CODING_CATEGORY_IDX_ISO_8_2:
4342           /* We can skip all ASCII characters at the tail.  */
4343           if (eol_conversion)
4344             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4345           else
4346             while (begp < endp && endp[-1] < 0x80) endp--;
4347           /* Do not consider LF as ascii if preceded by CR, since that
4348              confuses eol decoding. */
4349           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4350             endp++;
4351           break;
4352
4353         case CODING_CATEGORY_IDX_ISO_7:
4354         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4355           {
4356             /* We can skip all characters at the tail except for 8-bit
4357                codes and ESC and the following 2-byte at the tail.  */
4358             unsigned char *eight_bit = NULL;
4359
4360             if (eol_conversion)
4361               while (begp < endp
4362                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4363                 {
4364                   if (!eight_bit && c & 0x80) eight_bit = endp;
4365                   endp--;
4366                 }
4367             else
4368               while (begp < endp
4369                      && (c = endp[-1]) != ISO_CODE_ESC)
4370                 {
4371                   if (!eight_bit && c & 0x80) eight_bit = endp;
4372                   endp--;
4373                 }
4374             /* Do not consider LF as ascii if preceded by CR, since that
4375                confuses eol decoding. */
4376             if (begp < endp && endp < endp_orig
4377                 && endp[-1] == '\r' && endp[0] == '\n')
4378               endp++;
4379             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4380               {
4381                 if (endp + 1 < endp_orig && endp[0] == '(' && endp[1] == 'B')
4382                   /* This is an ASCII designation sequence.  We can
4383                      surely skip the tail.  But, if we have
4384                      encountered an 8-bit code, skip only the codes
4385                      after that.  */
4386                   endp = eight_bit ? eight_bit : endp + 2;
4387                 else
4388                   /* Hmmm, we can't skip the tail.  */
4389                   endp = endp_orig;
4390               }
4391             else if (eight_bit)
4392               endp = eight_bit;
4393           }
4394         }
4395       break;
4396
4397     default:
4398       abort ();
4399     }
4400   *beg += begp - begp_orig;
4401   *end += endp - endp_orig;
4402   return;
4403 }
4404
4405 /* Like shrink_decoding_region but for encoding.  */
4406 /* *BEG and *END are updated in place; if STR is non-NULL they index it.  */
4407 static void
4408 shrink_encoding_region (beg, end, coding, str)
4409      int *beg, *end;
4410      struct coding_system *coding;
4411      unsigned char *str;
4412 {
4413   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4414   int eol_conversion;
4415   Lisp_Object translation_table;
4416   /* In the test below, `&&' binds tighter than the `||' before it.  */
4417   if (coding->type == coding_type_ccl
4418       || coding->eol_type == CODING_EOL_CRLF
4419       || coding->eol_type == CODING_EOL_CR
4420       || coding->cmp_data && coding->cmp_data->used > 0)
4421     {
4422       /* We can't skip any data.  */
4423       return;
4424     }
4425   if (coding->type == coding_type_no_conversion
4426       || coding->type == coding_type_raw_text
4427       || coding->type == coding_type_emacs_mule
4428       || coding->type == coding_type_undecided)
4429     {
4430       /* We need no conversion, but don't have to skip any data here.
4431          Encoding routine handles them effectively anyway.  */
4432       return;
4433     }
4434
4435   translation_table = coding->translation_table_for_encode;
4436   if (NILP (translation_table) && !NILP (Venable_character_translation))
4437     translation_table = Vstandard_translation_table_for_encode;
4438   if (CHAR_TABLE_P (translation_table))
4439     {
4440       int i;
4441       for (i = 0; i < 128; i++)
4442         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4443           break;
4444       if (i < 128)
4445         /* Some ASCII character should be translated.  We give up
4446            shrinking.  */
4447         return;
4448     }
4449   /* Get pointers to the head and the tail of the region.  */
4450   if (str)
4451     {
4452       begp_orig = begp = str + *beg;
4453       endp_orig = endp = str + *end;
4454     }
4455   else
4456     {
4457       begp_orig = begp = BYTE_POS_ADDR (*beg);
4458       endp_orig = endp = begp + *end - *beg;
4459     }
4460   /* Zero here, since a CR or CRLF eol_type made us return above.  */
4461   eol_conversion = (coding->eol_type == CODING_EOL_CR
4462                     || coding->eol_type == CODING_EOL_CRLF);
4463
4464   /* Here, we don't have to check coding->pre_write_conversion because
4465      the caller is expected to have handled it already.  */
4466   switch (coding->type)
4467     {
4468     case coding_type_iso2022:
4469       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4470         /* We can't skip any data.  */
4471         break;
4472       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4473         {
4474           unsigned char *bol = begp;    /* Start of the last line reached.  */
4475           while (begp < endp && *begp < 0x80)
4476             {
4477               begp++;
4478               if (begp[-1] == '\n')
4479                 bol = begp;
4480             }
4481           begp = bol;   /* Skip whole ASCII lines only.  */
4482           goto label_skip_tail;
4483         }
4484       /* fall down ... */
4485
4486     case coding_type_sjis:
4487     case coding_type_big5:
4488       /* We can skip all ASCII characters at the head and tail.  */
4489       if (eol_conversion)
4490         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4491       else
4492         while (begp < endp && *begp < 0x80) begp++;
4493     label_skip_tail:
4494       if (eol_conversion)
4495         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4496       else
4497         while (begp < endp && *(endp - 1) < 0x80) endp--;
4498       break;
4499
4500     default:
4501       abort ();
4502     }
4503   /* Move *BEG and *END by as much as the pointers were moved.  */
4504   *beg += begp - begp_orig;
4505   *end += endp - endp_orig;
4506   return;
4507 }
4508
4509 /* As shrinking conversion region requires some overhead, we don't try
4510    shrinking unless the length of conversion region is greater than
4511    this value.  */
4512 static int shrink_conversion_region_threshhold = 1024;
4513 /* Call shrink_encoding_region if ENCODEP, else shrink_decoding_region.  */
4514 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
4515   do {                                                                  \
4516     if (*(end) - *(beg) > shrink_conversion_region_threshhold)          \
4517       {                                                                 \
4518         if (encodep) shrink_encoding_region (beg, end, coding, str);    \
4519         else shrink_decoding_region (beg, end, coding, str);            \
4520       }                                                                 \
4521   } while (0)
4522
4523 static Lisp_Object
4524 code_convert_region_unwind (dummy)
4525      Lisp_Object dummy;
4526 {
4527   inhibit_pre_post_conversion = 0;
4528   return Qnil;
4529 }
4530
4531 /* Store information about all compositions in the range FROM and TO
4532    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4533    buffer or a string, defaults to the current buffer.  */
4534
4535 void
4536 coding_save_composition (coding, from, to, obj)
4537      struct coding_system *coding;
4538      int from, to;
4539      Lisp_Object obj;
4540 {
4541   Lisp_Object prop;
4542   int start, end;
4543
4544   if (coding->composing == COMPOSITION_DISABLED)
4545     return;
4546   if (!coding->cmp_data)
4547     coding_allocate_composition_data (coding, from);
4548   if (!find_composition (from, to, &start, &end, &prop, obj)
4549       || end > to)
4550     return;
4551   if (start < from
4552       && (!find_composition (end, to, &start, &end, &prop, obj)
4553           || end > to))
4554     return;
4555   coding->composing = COMPOSITION_NO;
4556   do
4557     {
4558       if (COMPOSITION_VALID_P (start, end, prop))
4559         {
4560           enum composition_method method = COMPOSITION_METHOD (prop);
4561           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4562               >= COMPOSITION_DATA_SIZE)
4563             coding_allocate_composition_data (coding, from);
4564           /* For relative composition, we remember start and end
4565              positions, for the other compositions, we also remember
4566              components.  */
4567           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4568           if (method != COMPOSITION_RELATIVE)
4569             {
4570               /* We must store a*/
4571               Lisp_Object val, ch;
4572
4573               val = COMPOSITION_COMPONENTS (prop);
4574               if (CONSP (val))
4575                 while (CONSP (val))
4576                   {
4577                     ch = XCAR (val), val = XCDR (val);
4578                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4579                   }
4580               else if (VECTORP (val) || STRINGP (val))
4581                 {
4582                   int len = (VECTORP (val)
4583                              ? XVECTOR (val)->size : XSTRING (val)->size);
4584                   int i;
4585                   for (i = 0; i < len; i++)
4586                     {
4587                       ch = (STRINGP (val)
4588                             ? Faref (val, make_number (i))
4589                             : XVECTOR (val)->contents[i]);
4590                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4591                     }
4592                 }
4593               else              /* INTEGERP (val) */
4594                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4595             }
4596           CODING_ADD_COMPOSITION_END (coding, end - from);
4597         }
4598       start = end;
4599     }
4600   while (start < to
4601          && find_composition (start, to, &start, &end, &prop, obj)
4602          && end <= to);
4603
4604   /* Make coding->cmp_data point to the first memory block.  */
4605   while (coding->cmp_data->prev)
4606     coding->cmp_data = coding->cmp_data->prev;
4607   coding->cmp_data_start = 0;
4608 }
4609
4610 /* Reflect the saved information about compositions to OBJ.
4611    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4612    is a buffer or a string, defaults to the current buffer.  */
4613
4614 void
4615 coding_restore_composition (coding, obj)
4616      struct coding_system *coding;
4617      Lisp_Object obj;
4618 {
4619   struct composition_data *cmp_data = coding->cmp_data;
4620
4621   if (!cmp_data)
4622     return;
4623
4624   while (cmp_data->prev)
4625     cmp_data = cmp_data->prev;
4626
4627   while (cmp_data)
4628     {
4629       int i;
4630
4631       for (i = 0; i < cmp_data->used; i += cmp_data->data[i])
4632         {
4633           int *data = cmp_data->data + i;
4634           enum composition_method method = (enum composition_method) data[3];
4635           Lisp_Object components;
4636
4637           if (method == COMPOSITION_RELATIVE)
4638             components = Qnil;
4639           else
4640             {
4641               int len = data[0] - 4, j;
4642               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4643
4644               for (j = 0; j < len; j++)
4645                 args[j] = make_number (data[4 + j]);
4646               components = (method == COMPOSITION_WITH_ALTCHARS
4647                             ? Fstring (len, args) : Fvector (len, args));
4648             }
4649           compose_text (data[1], data[2], components, Qnil, obj);
4650         }
4651       cmp_data = cmp_data->next;
4652     }
4653 }
4654
4655 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4656    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4657    coding system CODING, and return the status code of code conversion
4658    (currently, this value has no meaning).
4659
4660    How many characters (and bytes) are converted to how many
4661    characters (and bytes) are recorded in members of the structure
4662    CODING.
4663
4664    If REPLACE is nonzero, we do various things as if the original text
4665    is deleted and a new text is inserted.  See the comments in
4666    replace_range (insdel.c) to know what we are doing.
4667
4668    If REPLACE is zero, it is assumed that the source text is unibyte.
4669    Otherwize, it is assumed that the source text is multibyte.  */
4670
4671 int
4672 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4673      int from, from_byte, to, to_byte, encodep, replace;
4674      struct coding_system *coding;
4675 {
4676   int len = to - from, len_byte = to_byte - from_byte;
4677   int require, inserted, inserted_byte;
4678   int head_skip, tail_skip, total_skip = 0;
4679   Lisp_Object saved_coding_symbol;
4680   int first = 1;
4681   unsigned char *src, *dst;
4682   Lisp_Object deletion;
4683   int orig_point = PT, orig_len = len;
4684   int prev_Z;
4685   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4686
4687   coding->src_multibyte = replace && multibyte_p;
4688   coding->dst_multibyte = multibyte_p;
4689
4690   deletion = Qnil;
4691   saved_coding_symbol = Qnil;
4692
4693   if (from < PT && PT < to)
4694     {
4695       TEMP_SET_PT_BOTH (from, from_byte);
4696       orig_point = from;
4697     }
4698
4699   if (replace)
4700     {
4701       int saved_from = from;
4702
4703       prepare_to_modify_buffer (from, to, &from);
4704       if (saved_from != from)
4705         {
4706           to = from + len;
4707           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4708           len_byte = to_byte - from_byte;
4709         }
4710     }
4711
4712   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4713     {
4714       /* We must detect encoding of text and eol format.  */
4715
4716       if (from < GPT && to > GPT)
4717         move_gap_both (from, from_byte);
4718       if (coding->type == coding_type_undecided)
4719         {
4720           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4721           if (coding->type == coding_type_undecided)
4722             /* It seems that the text contains only ASCII, but we
4723                should not left it undecided because the deeper
4724                decoding routine (decode_coding) tries to detect the
4725                encodings again in vain.  */
4726             coding->type = coding_type_emacs_mule;
4727         }
4728       if (coding->eol_type == CODING_EOL_UNDECIDED
4729           && coding->type != coding_type_ccl)
4730         {
4731           saved_coding_symbol = coding->symbol;
4732           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4733           if (coding->eol_type == CODING_EOL_UNDECIDED)
4734             coding->eol_type = CODING_EOL_LF;
4735           /* We had better recover the original eol format if we
4736              encounter an inconsitent eol format while decoding.  */
4737           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4738         }
4739     }
4740
4741   /* Now we convert the text.  */
4742
4743   /* For encoding, we must process pre-write-conversion in advance.  */
4744   if (! inhibit_pre_post_conversion
4745       && encodep
4746       && SYMBOLP (coding->pre_write_conversion)
4747       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4748     {
4749       /* The function in pre-write-conversion may put a new text in a
4750          new buffer.  */
4751       struct buffer *prev = current_buffer;
4752       Lisp_Object new;
4753       int count = specpdl_ptr - specpdl;
4754
4755       record_unwind_protect (code_convert_region_unwind, Qnil);
4756       /* We should not call any more pre-write/post-read-conversion
4757          functions while this pre-write-conversion is running.  */
4758       inhibit_pre_post_conversion = 1;
4759       call2 (coding->pre_write_conversion,
4760              make_number (from), make_number (to));
4761       inhibit_pre_post_conversion = 0;
4762       /* Discard the unwind protect.  */
4763       specpdl_ptr--;
4764
4765       if (current_buffer != prev)
4766         {
4767           len = ZV - BEGV;
4768           new = Fcurrent_buffer ();
4769           set_buffer_internal_1 (prev);
4770           del_range_2 (from, from_byte, to, to_byte, 0);
4771           TEMP_SET_PT_BOTH (from, from_byte);
4772           insert_from_buffer (XBUFFER (new), 1, len, 0);
4773           Fkill_buffer (new);
4774           if (orig_point >= to)
4775             orig_point += len - orig_len;
4776           else if (orig_point > from)
4777             orig_point = from;
4778           orig_len = len;
4779           to = from + len;
4780           from_byte = CHAR_TO_BYTE (from);
4781           to_byte = CHAR_TO_BYTE (to);
4782           len_byte = to_byte - from_byte;
4783           TEMP_SET_PT_BOTH (from, from_byte);
4784         }
4785     }
4786
4787   if (replace)
4788     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4789
4790   if (coding->composing != COMPOSITION_DISABLED)
4791     {
4792       if (encodep)
4793         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4794       else
4795         coding_allocate_composition_data (coding, from);
4796     }
4797
4798   /* Try to skip the heading and tailing ASCIIs.  */
4799   {
4800     int from_byte_orig = from_byte, to_byte_orig = to_byte;
4801
4802     if (from < GPT && GPT < to)
4803       move_gap_both (from, from_byte);
4804     SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4805     if (from_byte == to_byte
4806         && (encodep || NILP (coding->post_read_conversion))
4807         && ! CODING_REQUIRE_FLUSHING (coding))
4808       {
4809         coding->produced = len_byte;
4810         coding->produced_char = len;
4811         if (!replace)
4812           /* We must record and adjust for this new text now.  */
4813           adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4814         return 0;
4815       }
4816
4817     head_skip = from_byte - from_byte_orig;
4818     tail_skip = to_byte_orig - to_byte;
4819     total_skip = head_skip + tail_skip;
4820     from += head_skip;
4821     to -= tail_skip;
4822     len -= total_skip; len_byte -= total_skip;
4823   }
4824
4825   /* The code conversion routine can not preserve text properties for
4826      now.  So, we must remove all text properties in the region.
4827      Here, we must suppress all modification hooks.  */
4828   if (replace)
4829     {
4830       int saved_inhibit_modification_hooks = inhibit_modification_hooks;
4831       inhibit_modification_hooks = 1;
4832       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4833       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4834     }
4835
4836   /* For converion, we must put the gap before the text in addition to
4837      making the gap larger for efficient decoding.  The required gap
4838      size starts from 2000 which is the magic number used in make_gap.
4839      But, after one batch of conversion, it will be incremented if we
4840      find that it is not enough .  */
4841   require = 2000;
4842
4843   if (GAP_SIZE  < require)
4844     make_gap (require - GAP_SIZE);
4845   move_gap_both (from, from_byte);
4846
4847   inserted = inserted_byte = 0;
4848
4849   GAP_SIZE += len_byte;
4850   ZV -= len;
4851   Z -= len;
4852   ZV_BYTE -= len_byte;
4853   Z_BYTE -= len_byte;
4854
4855   if (GPT - BEG < BEG_UNCHANGED)
4856     BEG_UNCHANGED = GPT - BEG;
4857   if (Z - GPT < END_UNCHANGED)
4858     END_UNCHANGED = Z - GPT;
4859
4860   if (!encodep && coding->src_multibyte)
4861     {
4862       /* Decoding routines expects that the source text is unibyte.
4863          We must convert 8-bit characters of multibyte form to
4864          unibyte.  */
4865       int len_byte_orig = len_byte;
4866       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
4867       if (len_byte < len_byte_orig)
4868         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
4869                     len_byte);
4870       coding->src_multibyte = 0;
4871     }
4872
4873   for (;;)
4874     {
4875       int result;
4876
4877       /* The buffer memory is now:
4878          +--------+converted-text+---------+-------original-text-------+---+
4879          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
4880                   |<---------------------- GAP ----------------------->|  */
4881       src = GAP_END_ADDR - len_byte;
4882       dst = GPT_ADDR + inserted_byte;
4883
4884       if (encodep)
4885         result = encode_coding (coding, src, dst, len_byte, 0);
4886       else
4887         result = decode_coding (coding, src, dst, len_byte, 0);
4888
4889       /* The buffer memory is now:
4890          +--------+-------converted-text----+--+------original-text----+---+
4891          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
4892                   |<---------------------- GAP ----------------------->|  */
4893
4894       inserted += coding->produced_char;
4895       inserted_byte += coding->produced;
4896       len_byte -= coding->consumed;
4897
4898       if (result == CODING_FINISH_INSUFFICIENT_CMP)
4899         {
4900           coding_allocate_composition_data (coding, from + inserted);
4901           continue;
4902         }
4903
4904       src += coding->consumed;
4905       dst += coding->produced;
4906
4907       if (result == CODING_FINISH_NORMAL)
4908         {
4909           src += len_byte;
4910           break;
4911         }
4912       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
4913         {
4914           unsigned char *pend = dst, *p = pend - inserted_byte;
4915           Lisp_Object eol_type;
4916
4917           /* Encode LFs back to the original eol format (CR or CRLF).  */
4918           if (coding->eol_type == CODING_EOL_CR)
4919             {
4920               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
4921             }
4922           else
4923             {
4924               int count = 0;
4925
4926               while (p < pend) if (*p++ == '\n') count++;
4927               if (src - dst < count)
4928                 {
4929                   /* We don't have sufficient room for encoding LFs
4930                      back to CRLF.  We must record converted and
4931                      not-yet-converted text back to the buffer
4932                      content, enlarge the gap, then record them out of
4933                      the buffer contents again.  */
4934                   int add = len_byte + inserted_byte;
4935
4936                   GAP_SIZE -= add;
4937                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
4938                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
4939                   make_gap (count - GAP_SIZE);
4940                   GAP_SIZE += add;
4941                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
4942                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
4943                   /* Don't forget to update SRC, DST, and PEND.  */
4944                   src = GAP_END_ADDR - len_byte;
4945                   dst = GPT_ADDR + inserted_byte;
4946                   pend = dst;
4947                 }
4948               inserted += count;
4949               inserted_byte += count;
4950               coding->produced += count;
4951               p = dst = pend + count;
4952               while (count)
4953                 {
4954                   *--p = *--pend;
4955                   if (*p == '\n') count--, *--p = '\r';
4956                 }
4957             }
4958
4959           /* Suppress eol-format conversion in the further conversion.  */
4960           coding->eol_type = CODING_EOL_LF;
4961
4962           /* Set the coding system symbol to that for Unix-like EOL.  */
4963           eol_type = Fget (saved_coding_symbol, Qeol_type);
4964           if (VECTORP (eol_type)
4965               && XVECTOR (eol_type)->size == 3
4966               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
4967             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
4968           else
4969             coding->symbol = saved_coding_symbol;
4970
4971           continue;
4972         }
4973       if (len_byte <= 0)
4974         {
4975           if (coding->type != coding_type_ccl
4976               || coding->mode & CODING_MODE_LAST_BLOCK)
4977             break;
4978           coding->mode |= CODING_MODE_LAST_BLOCK;
4979           continue;
4980         }
4981       if (result == CODING_FINISH_INSUFFICIENT_SRC)
4982         {
4983           /* The source text ends in invalid codes.  Let's just
4984              make them valid buffer contents, and finish conversion.  */
4985           inserted += len_byte;
4986           inserted_byte += len_byte;
4987           while (len_byte--)
4988             *dst++ = *src++;
4989           break;
4990         }
4991       if (result == CODING_FINISH_INTERRUPT)
4992         {
4993           /* The conversion procedure was interrupted by a user.  */
4994           break;
4995         }
4996       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
4997       if (coding->consumed < 1)
4998         {
4999           /* It's quite strange to require more memory without
5000              consuming any bytes.  Perhaps CCL program bug.  */
5001           break;
5002         }
5003       if (first)
5004         {
5005           /* We have just done the first batch of conversion which was
5006              stoped because of insufficient gap.  Let's reconsider the
5007              required gap size (i.e. SRT - DST) now.
5008
5009              We have converted ORIG bytes (== coding->consumed) into
5010              NEW bytes (coding->produced).  To convert the remaining
5011              LEN bytes, we may need REQUIRE bytes of gap, where:
5012                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5013                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5014              Here, we are sure that NEW >= ORIG.  */
5015           float ratio = coding->produced - coding->consumed;
5016           ratio /= coding->consumed;
5017           require = len_byte * ratio;
5018           first = 0;
5019         }
5020       if ((src - dst) < (require + 2000))
5021         {
5022           /* See the comment above the previous call of make_gap.  */
5023           int add = len_byte + inserted_byte;
5024
5025           GAP_SIZE -= add;
5026           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5027           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5028           make_gap (require + 2000);
5029           GAP_SIZE += add;
5030           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5031           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5032         }
5033     }
5034   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5035
5036   if (encodep && coding->dst_multibyte)
5037     {
5038       /* The output is unibyte.  We must convert 8-bit characters to
5039          multibyte form.  */
5040       if (inserted_byte * 2 > GAP_SIZE)
5041         {
5042           GAP_SIZE -= inserted_byte;
5043           ZV += inserted_byte; Z += inserted_byte;
5044           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5045           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5046           make_gap (inserted_byte - GAP_SIZE);
5047           GAP_SIZE += inserted_byte;
5048           ZV -= inserted_byte; Z -= inserted_byte;
5049           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5050           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5051         }
5052       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5053     }
5054
5055   /* If we have shrinked the conversion area, adjust it now.  */
5056   if (total_skip > 0)
5057     {
5058       if (tail_skip > 0)
5059         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5060       inserted += total_skip; inserted_byte += total_skip;
5061       GAP_SIZE += total_skip;
5062       GPT -= head_skip; GPT_BYTE -= head_skip;
5063       ZV -= total_skip; ZV_BYTE -= total_skip;
5064       Z -= total_skip; Z_BYTE -= total_skip;
5065       from -= head_skip; from_byte -= head_skip;
5066       to += tail_skip; to_byte += tail_skip;
5067     }
5068
5069   prev_Z = Z;
5070   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5071   inserted = Z - prev_Z;
5072
5073   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5074     coding_restore_composition (coding, Fcurrent_buffer ());
5075   coding_free_composition_data (coding);
5076
5077   if (! inhibit_pre_post_conversion
5078       && ! encodep && ! NILP (coding->post_read_conversion))
5079     {
5080       Lisp_Object val;
5081       int count = specpdl_ptr - specpdl;
5082
5083       if (from != PT)
5084         TEMP_SET_PT_BOTH (from, from_byte);
5085       prev_Z = Z;
5086       record_unwind_protect (code_convert_region_unwind, Qnil);
5087       /* We should not call any more pre-write/post-read-conversion
5088          functions while this post-read-conversion is running.  */
5089       inhibit_pre_post_conversion = 1;
5090       val = call1 (coding->post_read_conversion, make_number (inserted));
5091       inhibit_pre_post_conversion = 0;
5092       /* Discard the unwind protect.  */
5093       specpdl_ptr--;
5094       CHECK_NUMBER (val, 0);
5095       inserted += Z - prev_Z;
5096     }
5097
5098   if (orig_point >= from)
5099     {
5100       if (orig_point >= from + orig_len)
5101         orig_point += inserted - orig_len;
5102       else
5103         orig_point = from;
5104       TEMP_SET_PT (orig_point);
5105     }
5106
5107   if (replace)
5108     {
5109       signal_after_change (from, to - from, inserted);
5110       update_compositions (from, from + inserted, CHECK_BORDER);
5111     }
5112
5113   {
5114     coding->consumed = to_byte - from_byte;
5115     coding->consumed_char = to - from;
5116     coding->produced = inserted_byte;
5117     coding->produced_char = inserted;
5118   }
5119
5120   return 0;
5121 }
5122
5123 Lisp_Object
5124 run_pre_post_conversion_on_str (str, coding, encodep)
5125      Lisp_Object str;
5126      struct coding_system *coding;
5127      int encodep;
5128 {
5129   int count = specpdl_ptr - specpdl;
5130   struct gcpro gcpro1;
5131   struct buffer *prev = current_buffer;
5132   int multibyte = STRING_MULTIBYTE (str);
5133
5134   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5135   record_unwind_protect (code_convert_region_unwind, Qnil);
5136   GCPRO1 (str);
5137   temp_output_buffer_setup (" *code-converting-work*");
5138   set_buffer_internal (XBUFFER (Vstandard_output));
5139   /* We must insert the contents of STR as is without
5140      unibyte<->multibyte conversion.  For that, we adjust the
5141      multibyteness of the working buffer to that of STR.  */
5142   Ferase_buffer ();
5143   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5144   insert_from_string (str, 0, 0,
5145                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5146   UNGCPRO;
5147   inhibit_pre_post_conversion = 1;
5148   if (encodep)
5149     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5150   else
5151     {
5152       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5153       call1 (coding->post_read_conversion, make_number (Z - BEG));
5154     }
5155   inhibit_pre_post_conversion = 0;
5156   str = make_buffer_string (BEG, Z, 0);
5157   return unbind_to (count, str);
5158 }
5159
5160 Lisp_Object
5161 decode_coding_string (str, coding, nocopy)
5162      Lisp_Object str;
5163      struct coding_system *coding;
5164      int nocopy;
5165 {
5166   int len;
5167   char *buf;
5168   int from, to, to_byte;
5169   struct gcpro gcpro1;
5170   Lisp_Object saved_coding_symbol;
5171   int result;
5172
5173   from = 0;
5174   to = XSTRING (str)->size;
5175   to_byte = STRING_BYTES (XSTRING (str));
5176
5177   saved_coding_symbol = Qnil;
5178   if (CODING_REQUIRE_DETECTION (coding))
5179     {
5180       /* See the comments in code_convert_region.  */
5181       if (coding->type == coding_type_undecided)
5182         {
5183           detect_coding (coding, XSTRING (str)->data, to_byte);
5184           if (coding->type == coding_type_undecided)
5185             coding->type = coding_type_emacs_mule;
5186         }
5187       if (coding->eol_type == CODING_EOL_UNDECIDED
5188           && coding->type != coding_type_ccl)
5189         {
5190           saved_coding_symbol = coding->symbol;
5191           detect_eol (coding, XSTRING (str)->data, to_byte);
5192           if (coding->eol_type == CODING_EOL_UNDECIDED)
5193             coding->eol_type = CODING_EOL_LF;
5194           /* We had better recover the original eol format if we
5195              encounter an inconsitent eol format while decoding.  */
5196           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5197         }
5198     }
5199
5200   if (! CODING_REQUIRE_DECODING (coding))
5201     {
5202       if (!STRING_MULTIBYTE (str))
5203         {
5204           str = Fstring_as_multibyte (str);
5205           nocopy = 1;
5206         }
5207       return (nocopy ? str : Fcopy_sequence (str));
5208     }
5209
5210   if (STRING_MULTIBYTE (str))
5211     {
5212       /* Decoding routines expect the source text to be unibyte.  */
5213       str = Fstring_as_unibyte (str);
5214       nocopy = 1;
5215       coding->src_multibyte = 0;
5216     }
5217   coding->dst_multibyte = 1;
5218
5219   if (coding->composing != COMPOSITION_DISABLED)
5220     coding_allocate_composition_data (coding, from);
5221
5222   /* Try to skip the heading and tailing ASCIIs.  */
5223   {
5224     int from_orig = from;
5225
5226     SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5227                               0);
5228     if (from == to_byte)
5229       return (nocopy ? str : Fcopy_sequence (str));
5230   }
5231
5232   len = decoding_buffer_size (coding, to_byte - from);
5233   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5234   GCPRO1 (str);
5235   buf = get_conversion_buffer (len);
5236   UNGCPRO;
5237
5238   if (from > 0)
5239     bcopy (XSTRING (str)->data, buf, from);
5240   result = decode_coding (coding, XSTRING (str)->data + from,
5241                          buf + from, to_byte - from, len);
5242   if (result == CODING_FINISH_INCONSISTENT_EOL)
5243     {
5244       /* We simply try to decode the whole string again but without
5245          eol-conversion this time.  */
5246       coding->eol_type = CODING_EOL_LF;
5247       coding->symbol = saved_coding_symbol;
5248       coding_free_composition_data (coding);
5249       return decode_coding_string (str, coding, nocopy);
5250     }
5251
5252   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5253          STRING_BYTES (XSTRING (str)) - to_byte);
5254
5255   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5256   str = make_multibyte_string (buf, len + coding->produced_char,
5257                                len + coding->produced);
5258
5259   if (coding->cmp_data && coding->cmp_data->used)
5260     coding_restore_composition (coding, str);
5261   coding_free_composition_data (coding);
5262
5263   if (SYMBOLP (coding->post_read_conversion)
5264       && !NILP (Ffboundp (coding->post_read_conversion)))
5265     str = run_pre_post_conversion_on_str (str, coding, 0);
5266
5267   return str;
5268 }
5269
5270 Lisp_Object
5271 encode_coding_string (str, coding, nocopy)
5272      Lisp_Object str;
5273      struct coding_system *coding;
5274      int nocopy;
5275 {
5276   int len;
5277   char *buf;
5278   int from, to, to_byte;
5279   struct gcpro gcpro1;
5280   Lisp_Object saved_coding_symbol;
5281   int result;
5282
5283   if (SYMBOLP (coding->pre_write_conversion)
5284       && !NILP (Ffboundp (coding->pre_write_conversion)))
5285     str = run_pre_post_conversion_on_str (str, coding, 1);
5286
5287   from = 0;
5288   to = XSTRING (str)->size;
5289   to_byte = STRING_BYTES (XSTRING (str));
5290
5291   saved_coding_symbol = Qnil;
5292   if (! CODING_REQUIRE_ENCODING (coding))
5293     {
5294       if (STRING_MULTIBYTE (str))
5295         {
5296           str = Fstring_as_unibyte (str);
5297           nocopy = 1;
5298         }
5299       return (nocopy ? str : Fcopy_sequence (str));
5300     }
5301
5302   /* Encoding routines determine the multibyteness of the source text
5303      by coding->src_multibyte.  */
5304   coding->src_multibyte = STRING_MULTIBYTE (str);
5305   coding->dst_multibyte = 0;
5306
5307   if (coding->composing != COMPOSITION_DISABLED)
5308     coding_save_composition (coding, from, to, str);
5309
5310   /* Try to skip the heading and tailing ASCIIs.  */
5311   {
5312     int from_orig = from;
5313
5314     SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5315                               1);
5316     if (from == to_byte)
5317       return (nocopy ? str : Fcopy_sequence (str));
5318   }
5319
5320   len = encoding_buffer_size (coding, to_byte - from);
5321   len += from + STRING_BYTES (XSTRING (str)) - to_byte;
5322   GCPRO1 (str);
5323   buf = get_conversion_buffer (len);
5324   UNGCPRO;
5325
5326   if (from > 0)
5327     bcopy (XSTRING (str)->data, buf, from);
5328   result = encode_coding (coding, XSTRING (str)->data + from,
5329                           buf + from, to_byte - from, len);
5330   bcopy (XSTRING (str)->data + to_byte, buf + from + coding->produced,
5331          STRING_BYTES (XSTRING (str)) - to_byte);
5332
5333   len = from + STRING_BYTES (XSTRING (str)) - to_byte;
5334   str = make_unibyte_string (buf, len + coding->produced);
5335   coding_free_composition_data (coding);
5336
5337   return str;
5338 }
5339
5340 \f
5341 #ifdef emacs
5342 /*** 8. Emacs Lisp library functions ***/
5343
5344 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5345   "Return t if OBJECT is nil or a coding-system.\n\
5346 See the documentation of `make-coding-system' for information\n\
5347 about coding-system objects.")
5348   (obj)
5349      Lisp_Object obj;
5350 {
5351   if (NILP (obj))
5352     return Qt;
5353   if (!SYMBOLP (obj))
5354     return Qnil;
5355   /* Get coding-spec vector for OBJ.  */
5356   obj = Fget (obj, Qcoding_system);
5357   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5358           ? Qt : Qnil);
5359 }
5360
5361 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5362        Sread_non_nil_coding_system, 1, 1, 0,
5363   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5364   (prompt)
5365      Lisp_Object prompt;
5366 {
5367   Lisp_Object val;
5368   do
5369     {
5370       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5371                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5372     }
5373   while (XSTRING (val)->size == 0);
5374   return (Fintern (val, Qnil));
5375 }
5376
5377 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5378   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5379 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5380   (prompt, default_coding_system)
5381      Lisp_Object prompt, default_coding_system;
5382 {
5383   Lisp_Object val;
5384   if (SYMBOLP (default_coding_system))
5385     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5386   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5387                           Qt, Qnil, Qcoding_system_history,
5388                           default_coding_system, Qnil);
5389   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5390 }
5391
5392 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5393        1, 1, 0,
5394   "Check validity of CODING-SYSTEM.\n\
5395 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5396 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5397 The value of property should be a vector of length 5.")
5398   (coding_system)
5399      Lisp_Object coding_system;
5400 {
5401   CHECK_SYMBOL (coding_system, 0);
5402   if (!NILP (Fcoding_system_p (coding_system)))
5403     return coding_system;
5404   while (1)
5405     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5406 }
5407 \f
5408 Lisp_Object
5409 detect_coding_system (src, src_bytes, highest)
5410      unsigned char *src;
5411      int src_bytes, highest;
5412 {
5413   int coding_mask, eol_type;
5414   Lisp_Object val, tmp;
5415   int dummy;
5416
5417   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy);
5418   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5419   if (eol_type == CODING_EOL_INCONSISTENT)
5420     eol_type = CODING_EOL_UNDECIDED;
5421
5422   if (!coding_mask)
5423     {
5424       val = Qundecided;
5425       if (eol_type != CODING_EOL_UNDECIDED)
5426         {
5427           Lisp_Object val2;
5428           val2 = Fget (Qundecided, Qeol_type);
5429           if (VECTORP (val2))
5430             val = XVECTOR (val2)->contents[eol_type];
5431         }
5432       return (highest ? val : Fcons (val, Qnil));
5433     }
5434
5435   /* At first, gather possible coding systems in VAL.  */
5436   val = Qnil;
5437   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5438     {
5439       Lisp_Object category_val, category_index;
5440
5441       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5442       category_val = Fsymbol_value (XCAR (tmp));
5443       if (!NILP (category_val)
5444           && NATNUMP (category_index)
5445           && (coding_mask & (1 << XFASTINT (category_index))))
5446         {
5447           val = Fcons (category_val, val);
5448           if (highest)
5449             break;
5450         }
5451     }
5452   if (!highest)
5453     val = Fnreverse (val);
5454
5455   /* Then, replace the elements with subsidiary coding systems.  */
5456   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5457     {
5458       if (eol_type != CODING_EOL_UNDECIDED
5459           && eol_type != CODING_EOL_INCONSISTENT)
5460         {
5461           Lisp_Object eol;
5462           eol = Fget (XCAR (tmp), Qeol_type);
5463           if (VECTORP (eol))
5464             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5465         }
5466     }
5467   return (highest ? XCAR (val) : val);
5468 }
5469
5470 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5471        2, 3, 0,
5472   "Detect coding system of the text in the region between START and END.\n\
5473 Return a list of possible coding systems ordered by priority.\n\
5474 \n\
5475 If only ASCII characters are found, it returns a list of single element\n\
5476 `undecided' or its subsidiary coding system according to a detected\n\
5477 end-of-line format.\n\
5478 \n\
5479 If optional argument HIGHEST is non-nil, return the coding system of\n\
5480 highest priority.")
5481   (start, end, highest)
5482      Lisp_Object start, end, highest;
5483 {
5484   int from, to;
5485   int from_byte, to_byte;
5486
5487   CHECK_NUMBER_COERCE_MARKER (start, 0);
5488   CHECK_NUMBER_COERCE_MARKER (end, 1);
5489
5490   validate_region (&start, &end);
5491   from = XINT (start), to = XINT (end);
5492   from_byte = CHAR_TO_BYTE (from);
5493   to_byte = CHAR_TO_BYTE (to);
5494
5495   if (from < GPT && to >= GPT)
5496     move_gap_both (to, to_byte);
5497
5498   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5499                                to_byte - from_byte,
5500                                !NILP (highest));
5501 }
5502
5503 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5504        1, 2, 0,
5505   "Detect coding system of the text in STRING.\n\
5506 Return a list of possible coding systems ordered by priority.\n\
5507 \n\
5508 If only ASCII characters are found, it returns a list of single element\n\
5509 `undecided' or its subsidiary coding system according to a detected\n\
5510 end-of-line format.\n\
5511 \n\
5512 If optional argument HIGHEST is non-nil, return the coding system of\n\
5513 highest priority.")
5514   (string, highest)
5515      Lisp_Object string, highest;
5516 {
5517   CHECK_STRING (string, 0);
5518
5519   return detect_coding_system (XSTRING (string)->data,
5520                                STRING_BYTES (XSTRING (string)),
5521                                !NILP (highest));
5522 }
5523
5524 Lisp_Object
5525 code_convert_region1 (start, end, coding_system, encodep)
5526      Lisp_Object start, end, coding_system;
5527      int encodep;
5528 {
5529   struct coding_system coding;
5530   int from, to, len;
5531
5532   CHECK_NUMBER_COERCE_MARKER (start, 0);
5533   CHECK_NUMBER_COERCE_MARKER (end, 1);
5534   CHECK_SYMBOL (coding_system, 2);
5535
5536   validate_region (&start, &end);
5537   from = XFASTINT (start);
5538   to = XFASTINT (end);
5539
5540   if (NILP (coding_system))
5541     return make_number (to - from);
5542
5543   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5544     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5545
5546   coding.mode |= CODING_MODE_LAST_BLOCK;
5547   coding.src_multibyte = coding.dst_multibyte
5548     = !NILP (current_buffer->enable_multibyte_characters);
5549   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5550                        &coding, encodep, 1);
5551   Vlast_coding_system_used = coding.symbol;
5552   return make_number (coding.produced_char);
5553 }
5554
5555 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5556        3, 3, "r\nzCoding system: ",
5557   "Decode the current region by specified coding system.\n\
5558 When called from a program, takes three arguments:\n\
5559 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5560 This function sets `last-coding-system-used' to the precise coding system\n\
5561 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5562 not fully specified.)\n\
5563 It returns the length of the decoded text.")
5564   (start, end, coding_system)
5565      Lisp_Object start, end, coding_system;
5566 {
5567   return code_convert_region1 (start, end, coding_system, 0);
5568 }
5569
5570 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5571        3, 3, "r\nzCoding system: ",
5572   "Encode the current region by specified coding system.\n\
5573 When called from a program, takes three arguments:\n\
5574 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5575 This function sets `last-coding-system-used' to the precise coding system\n\
5576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5577 not fully specified.)\n\
5578 It returns the length of the encoded text.")
5579   (start, end, coding_system)
5580      Lisp_Object start, end, coding_system;
5581 {
5582   return code_convert_region1 (start, end, coding_system, 1);
5583 }
5584
5585 Lisp_Object
5586 code_convert_string1 (string, coding_system, nocopy, encodep)
5587      Lisp_Object string, coding_system, nocopy;
5588      int encodep;
5589 {
5590   struct coding_system coding;
5591
5592   CHECK_STRING (string, 0);
5593   CHECK_SYMBOL (coding_system, 1);
5594
5595   if (NILP (coding_system))
5596     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
5597
5598   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5599     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5600
5601   coding.mode |= CODING_MODE_LAST_BLOCK;
5602   string = (encodep
5603             ? encode_coding_string (string, &coding, !NILP (nocopy))
5604             : decode_coding_string (string, &coding, !NILP (nocopy)));
5605   Vlast_coding_system_used = coding.symbol;
5606
5607   return string;
5608 }
5609
5610 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
5611        2, 3, 0,
5612   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
5613 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5614 if the decoding operation is trivial.\n\
5615 This function sets `last-coding-system-used' to the precise coding system\n\
5616 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5617 not fully specified.)")
5618   (string, coding_system, nocopy)
5619      Lisp_Object string, coding_system, nocopy;
5620 {
5621   return code_convert_string1 (string, coding_system, nocopy, 0);
5622 }
5623
5624 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
5625        2, 3, 0,
5626   "Encode STRING to CODING-SYSTEM, and return the result.\n\
5627 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
5628 if the encoding operation is trivial.\n\
5629 This function sets `last-coding-system-used' to the precise coding system\n\
5630 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5631 not fully specified.)")
5632   (string, coding_system, nocopy)
5633      Lisp_Object string, coding_system, nocopy;
5634 {
5635   return code_convert_string1 (string, coding_system, nocopy, 1);
5636 }
5637
5638 /* Encode or decode STRING according to CODING_SYSTEM.
5639    Do not set Vlast_coding_system_used.
5640
5641    This function is called only from macros DECODE_FILE and
5642    ENCODE_FILE, thus we ignore character composition.  */
5643
5644 Lisp_Object
5645 code_convert_string_norecord (string, coding_system, encodep)
5646      Lisp_Object string, coding_system;
5647      int encodep;
5648 {
5649   struct coding_system coding;
5650
5651   CHECK_STRING (string, 0);
5652   CHECK_SYMBOL (coding_system, 1);
5653
5654   if (NILP (coding_system))
5655     return string;
5656
5657   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5658     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5659
5660   coding.composing = COMPOSITION_DISABLED;
5661   coding.mode |= CODING_MODE_LAST_BLOCK;
5662   return (encodep
5663           ? encode_coding_string (string, &coding, 1)
5664           : decode_coding_string (string, &coding, 1));
5665 }
5666 \f
5667 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
5668   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
5669 Return the corresponding character.")
5670   (code)
5671      Lisp_Object code;
5672 {
5673   unsigned char c1, c2, s1, s2;
5674   Lisp_Object val;
5675
5676   CHECK_NUMBER (code, 0);
5677   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
5678   if (s1 == 0)
5679     {
5680       if (s2 < 0x80)
5681         XSETFASTINT (val, s2);
5682       else if (s2 >= 0xA0 || s2 <= 0xDF)
5683         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
5684       else
5685         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5686     }
5687   else
5688     {
5689       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
5690           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
5691         error ("Invalid Shift JIS code: %x", XFASTINT (code));
5692       DECODE_SJIS (s1, s2, c1, c2);
5693       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
5694     }
5695   return val;
5696 }
5697
5698 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
5699   "Encode a Japanese character CHAR to shift_jis encoding.\n\
5700 Return the corresponding code in SJIS.")
5701   (ch)
5702      Lisp_Object ch;
5703 {
5704   int charset, c1, c2, s1, s2;
5705   Lisp_Object val;
5706
5707   CHECK_NUMBER (ch, 0);
5708   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5709   if (charset == CHARSET_ASCII)
5710     {
5711       val = ch;
5712     }
5713   else if (charset == charset_jisx0208
5714            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
5715     {
5716       ENCODE_SJIS (c1, c2, s1, s2);
5717       XSETFASTINT (val, (s1 << 8) | s2);
5718     }
5719   else if (charset == charset_katakana_jisx0201
5720            && c1 > 0x20 && c2 < 0xE0)
5721     {
5722       XSETFASTINT (val, c1 | 0x80);
5723     }
5724   else
5725     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
5726   return val;
5727 }
5728
5729 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
5730   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
5731 Return the corresponding character.")
5732   (code)
5733      Lisp_Object code;
5734 {
5735   int charset;
5736   unsigned char b1, b2, c1, c2;
5737   Lisp_Object val;
5738
5739   CHECK_NUMBER (code, 0);
5740   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
5741   if (b1 == 0)
5742     {
5743       if (b2 >= 0x80)
5744         error ("Invalid BIG5 code: %x", XFASTINT (code));
5745       val = code;
5746     }
5747   else
5748     {
5749       if ((b1 < 0xA1 || b1 > 0xFE)
5750           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
5751         error ("Invalid BIG5 code: %x", XFASTINT (code));
5752       DECODE_BIG5 (b1, b2, charset, c1, c2);
5753       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
5754     }
5755   return val;
5756 }
5757
5758 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
5759   "Encode the Big5 character CHAR to BIG5 coding system.\n\
5760 Return the corresponding character code in Big5.")
5761   (ch)
5762      Lisp_Object ch;
5763 {
5764   int charset, c1, c2, b1, b2;
5765   Lisp_Object val;
5766
5767   CHECK_NUMBER (ch, 0);
5768   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
5769   if (charset == CHARSET_ASCII)
5770     {
5771       val = ch;
5772     }
5773   else if ((charset == charset_big5_1
5774             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
5775            || (charset == charset_big5_2
5776                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
5777     {
5778       ENCODE_BIG5 (charset, c1, c2, b1, b2);
5779       XSETFASTINT (val, (b1 << 8) | b2);
5780     }
5781   else
5782     error ("Can't encode to Big5: %d", XFASTINT (ch));
5783   return val;
5784 }
5785 \f
5786 DEFUN ("set-terminal-coding-system-internal",
5787        Fset_terminal_coding_system_internal,
5788        Sset_terminal_coding_system_internal, 1, 1, 0, "")
5789   (coding_system)
5790      Lisp_Object coding_system;
5791 {
5792   CHECK_SYMBOL (coding_system, 0);
5793   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
5794   /* We had better not send unsafe characters to terminal.  */
5795   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
5796   /* Characer composition should be disabled.  */
5797   terminal_coding.composing = COMPOSITION_DISABLED;
5798   terminal_coding.src_multibyte = 1;
5799   terminal_coding.dst_multibyte = 0;
5800   return Qnil;
5801 }
5802
5803 DEFUN ("set-safe-terminal-coding-system-internal",
5804        Fset_safe_terminal_coding_system_internal,
5805        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
5806   (coding_system)
5807      Lisp_Object coding_system;
5808 {
5809   CHECK_SYMBOL (coding_system, 0);
5810   setup_coding_system (Fcheck_coding_system (coding_system),
5811                        &safe_terminal_coding);
5812   /* Characer composition should be disabled.  */
5813   safe_terminal_coding.composing = COMPOSITION_DISABLED;
5814   safe_terminal_coding.src_multibyte = 1;
5815   safe_terminal_coding.dst_multibyte = 0;
5816   return Qnil;
5817 }
5818
5819 DEFUN ("terminal-coding-system",
5820        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
5821   "Return coding system specified for terminal output.")
5822   ()
5823 {
5824   return terminal_coding.symbol;
5825 }
5826
5827 DEFUN ("set-keyboard-coding-system-internal",
5828        Fset_keyboard_coding_system_internal,
5829        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
5830   (coding_system)
5831      Lisp_Object coding_system;
5832 {
5833   CHECK_SYMBOL (coding_system, 0);
5834   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
5835   /* Characer composition should be disabled.  */
5836   keyboard_coding.composing = COMPOSITION_DISABLED;
5837   return Qnil;
5838 }
5839
5840 DEFUN ("keyboard-coding-system",
5841        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
5842   "Return coding system specified for decoding keyboard input.")
5843   ()
5844 {
5845   return keyboard_coding.symbol;
5846 }
5847
5848 \f
5849 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
5850        Sfind_operation_coding_system,  1, MANY, 0,
5851   "Choose a coding system for an operation based on the target name.\n\
5852 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
5853 DECODING-SYSTEM is the coding system to use for decoding\n\
5854 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
5855 for encoding (in case OPERATION does encoding).\n\
5856 \n\
5857 The first argument OPERATION specifies an I/O primitive:\n\
5858   For file I/O, `insert-file-contents' or `write-region'.\n\
5859   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
5860   For network I/O, `open-network-stream'.\n\
5861 \n\
5862 The remaining arguments should be the same arguments that were passed\n\
5863 to the primitive.  Depending on which primitive, one of those arguments\n\
5864 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
5865 whichever argument specifies the file name is TARGET.\n\
5866 \n\
5867 TARGET has a meaning which depends on OPERATION:\n\
5868   For file I/O, TARGET is a file name.\n\
5869   For process I/O, TARGET is a process name.\n\
5870   For network I/O, TARGET is a service name or a port number\n\
5871 \n\
5872 This function looks up what specified for TARGET in,\n\
5873 `file-coding-system-alist', `process-coding-system-alist',\n\
5874 or `network-coding-system-alist' depending on OPERATION.\n\
5875 They may specify a coding system, a cons of coding systems,\n\
5876 or a function symbol to call.\n\
5877 In the last case, we call the function with one argument,\n\
5878 which is a list of all the arguments given to this function.")
5879   (nargs, args)
5880      int nargs;
5881      Lisp_Object *args;
5882 {
5883   Lisp_Object operation, target_idx, target, val;
5884   register Lisp_Object chain;
5885
5886   if (nargs < 2)
5887     error ("Too few arguments");
5888   operation = args[0];
5889   if (!SYMBOLP (operation)
5890       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
5891     error ("Invalid first arguement");
5892   if (nargs < 1 + XINT (target_idx))
5893     error ("Too few arguments for operation: %s",
5894            XSYMBOL (operation)->name->data);
5895   target = args[XINT (target_idx) + 1];
5896   if (!(STRINGP (target)
5897         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
5898     error ("Invalid %dth argument", XINT (target_idx) + 1);
5899
5900   chain = ((EQ (operation, Qinsert_file_contents)
5901             || EQ (operation, Qwrite_region))
5902            ? Vfile_coding_system_alist
5903            : (EQ (operation, Qopen_network_stream)
5904               ? Vnetwork_coding_system_alist
5905               : Vprocess_coding_system_alist));
5906   if (NILP (chain))
5907     return Qnil;
5908
5909   for (; CONSP (chain); chain = XCDR (chain))
5910     {
5911       Lisp_Object elt;
5912       elt = XCAR (chain);
5913
5914       if (CONSP (elt)
5915           && ((STRINGP (target)
5916                && STRINGP (XCAR (elt))
5917                && fast_string_match (XCAR (elt), target) >= 0)
5918               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
5919         {
5920           val = XCDR (elt);
5921           /* Here, if VAL is both a valid coding system and a valid
5922              function symbol, we return VAL as a coding system.  */
5923           if (CONSP (val))
5924             return val;
5925           if (! SYMBOLP (val))
5926             return Qnil;
5927           if (! NILP (Fcoding_system_p (val)))
5928             return Fcons (val, val);
5929           if (! NILP (Ffboundp (val)))
5930             {
5931               val = call1 (val, Flist (nargs, args));
5932               if (CONSP (val))
5933                 return val;
5934               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
5935                 return Fcons (val, val);
5936             }
5937           return Qnil;
5938         }
5939     }
5940   return Qnil;
5941 }
5942
5943 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
5944        Supdate_coding_systems_internal, 0, 0, 0,
5945   "Update internal database for ISO2022 and CCL based coding systems.\n\
5946 When values of any coding categories are changed, you must\n\
5947 call this function")
5948   ()
5949 {
5950   int i;
5951
5952   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
5953     {
5954       Lisp_Object val;
5955
5956       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
5957       if (!NILP (val))
5958         {
5959           if (! coding_system_table[i])
5960             coding_system_table[i] = ((struct coding_system *)
5961                                       xmalloc (sizeof (struct coding_system)));
5962           setup_coding_system (val, coding_system_table[i]);
5963         }
5964       else if (coding_system_table[i])
5965         {
5966           xfree (coding_system_table[i]);
5967           coding_system_table[i] = NULL;
5968         }
5969     }
5970
5971   return Qnil;
5972 }
5973
5974 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
5975        Sset_coding_priority_internal, 0, 0, 0,
5976   "Update internal database for the current value of `coding-category-list'.\n\
5977 This function is internal use only.")
5978   ()
5979 {
5980   int i = 0, idx;
5981   Lisp_Object val;
5982
5983   val = Vcoding_category_list;
5984
5985   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
5986     {
5987       if (! SYMBOLP (XCAR (val)))
5988         break;
5989       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
5990       if (idx >= CODING_CATEGORY_IDX_MAX)
5991         break;
5992       coding_priorities[i++] = (1 << idx);
5993       val = XCDR (val);
5994     }
5995   /* If coding-category-list is valid and contains all coding
5996      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
5997      the following code saves Emacs from crashing.  */
5998   while (i < CODING_CATEGORY_IDX_MAX)
5999     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6000
6001   return Qnil;
6002 }
6003
6004 #endif /* emacs */
6005
6006 \f
6007 /*** 9. Post-amble ***/
6008
6009 void
6010 init_coding ()
6011 {
6012   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
6013 }
6014
6015 void
6016 init_coding_once ()
6017 {
6018   int i;
6019
6020   /* Emacs' internal format specific initialize routine.  */
6021   for (i = 0; i <= 0x20; i++)
6022     emacs_code_class[i] = EMACS_control_code;
6023   emacs_code_class[0x0A] = EMACS_linefeed_code;
6024   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6025   for (i = 0x21 ; i < 0x7F; i++)
6026     emacs_code_class[i] = EMACS_ascii_code;
6027   emacs_code_class[0x7F] = EMACS_control_code;
6028   for (i = 0x80; i < 0xFF; i++)
6029     emacs_code_class[i] = EMACS_invalid_code;
6030   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6031   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6032   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6033   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6034
6035   /* ISO2022 specific initialize routine.  */
6036   for (i = 0; i < 0x20; i++)
6037     iso_code_class[i] = ISO_control_0;
6038   for (i = 0x21; i < 0x7F; i++)
6039     iso_code_class[i] = ISO_graphic_plane_0;
6040   for (i = 0x80; i < 0xA0; i++)
6041     iso_code_class[i] = ISO_control_1;
6042   for (i = 0xA1; i < 0xFF; i++)
6043     iso_code_class[i] = ISO_graphic_plane_1;
6044   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6045   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6046   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6047   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6048   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6049   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6050   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6051   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6052   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6053   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6054
6055   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
6056
6057   setup_coding_system (Qnil, &keyboard_coding);
6058   setup_coding_system (Qnil, &terminal_coding);
6059   setup_coding_system (Qnil, &safe_terminal_coding);
6060   setup_coding_system (Qnil, &default_buffer_file_coding);
6061
6062   bzero (coding_system_table, sizeof coding_system_table);
6063
6064   bzero (ascii_skip_code, sizeof ascii_skip_code);
6065   for (i = 0; i < 128; i++)
6066     ascii_skip_code[i] = 1;
6067
6068 #if defined (MSDOS) || defined (WINDOWSNT)
6069   system_eol_type = CODING_EOL_CRLF;
6070 #else
6071   system_eol_type = CODING_EOL_LF;
6072 #endif
6073
6074   inhibit_pre_post_conversion = 0;
6075 }
6076
6077 #ifdef emacs
6078
6079 void
6080 syms_of_coding ()
6081 {
6082   Qtarget_idx = intern ("target-idx");
6083   staticpro (&Qtarget_idx);
6084
6085   Qcoding_system_history = intern ("coding-system-history");
6086   staticpro (&Qcoding_system_history);
6087   Fset (Qcoding_system_history, Qnil);
6088
6089   /* Target FILENAME is the first argument.  */
6090   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6091   /* Target FILENAME is the third argument.  */
6092   Fput (Qwrite_region, Qtarget_idx, make_number (2));
6093
6094   Qcall_process = intern ("call-process");
6095   staticpro (&Qcall_process);
6096   /* Target PROGRAM is the first argument.  */
6097   Fput (Qcall_process, Qtarget_idx, make_number (0));
6098
6099   Qcall_process_region = intern ("call-process-region");
6100   staticpro (&Qcall_process_region);
6101   /* Target PROGRAM is the third argument.  */
6102   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6103
6104   Qstart_process = intern ("start-process");
6105   staticpro (&Qstart_process);
6106   /* Target PROGRAM is the third argument.  */
6107   Fput (Qstart_process, Qtarget_idx, make_number (2));
6108
6109   Qopen_network_stream = intern ("open-network-stream");
6110   staticpro (&Qopen_network_stream);
6111   /* Target SERVICE is the fourth argument.  */
6112   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6113
6114   Qcoding_system = intern ("coding-system");
6115   staticpro (&Qcoding_system);
6116
6117   Qeol_type = intern ("eol-type");
6118   staticpro (&Qeol_type);
6119
6120   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6121   staticpro (&Qbuffer_file_coding_system);
6122
6123   Qpost_read_conversion = intern ("post-read-conversion");
6124   staticpro (&Qpost_read_conversion);
6125
6126   Qpre_write_conversion = intern ("pre-write-conversion");
6127   staticpro (&Qpre_write_conversion);
6128
6129   Qno_conversion = intern ("no-conversion");
6130   staticpro (&Qno_conversion);
6131
6132   Qundecided = intern ("undecided");
6133   staticpro (&Qundecided);
6134
6135   Qcoding_system_p = intern ("coding-system-p");
6136   staticpro (&Qcoding_system_p);
6137
6138   Qcoding_system_error = intern ("coding-system-error");
6139   staticpro (&Qcoding_system_error);
6140
6141   Fput (Qcoding_system_error, Qerror_conditions,
6142         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6143   Fput (Qcoding_system_error, Qerror_message,
6144         build_string ("Invalid coding system"));
6145
6146   Qcoding_category = intern ("coding-category");
6147   staticpro (&Qcoding_category);
6148   Qcoding_category_index = intern ("coding-category-index");
6149   staticpro (&Qcoding_category_index);
6150
6151   Vcoding_category_table
6152     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6153   staticpro (&Vcoding_category_table);
6154   {
6155     int i;
6156     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6157       {
6158         XVECTOR (Vcoding_category_table)->contents[i]
6159           = intern (coding_category_name[i]);
6160         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6161               Qcoding_category_index, make_number (i));
6162       }
6163   }
6164
6165   Qtranslation_table = intern ("translation-table");
6166   staticpro (&Qtranslation_table);
6167   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6168
6169   Qtranslation_table_id = intern ("translation-table-id");
6170   staticpro (&Qtranslation_table_id);
6171
6172   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6173   staticpro (&Qtranslation_table_for_decode);
6174
6175   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6176   staticpro (&Qtranslation_table_for_encode);
6177
6178   Qsafe_charsets = intern ("safe-charsets");
6179   staticpro (&Qsafe_charsets);
6180
6181   Qvalid_codes = intern ("valid-codes");
6182   staticpro (&Qvalid_codes);
6183
6184   Qemacs_mule = intern ("emacs-mule");
6185   staticpro (&Qemacs_mule);
6186
6187   Qraw_text = intern ("raw-text");
6188   staticpro (&Qraw_text);
6189
6190   defsubr (&Scoding_system_p);
6191   defsubr (&Sread_coding_system);
6192   defsubr (&Sread_non_nil_coding_system);
6193   defsubr (&Scheck_coding_system);
6194   defsubr (&Sdetect_coding_region);
6195   defsubr (&Sdetect_coding_string);
6196   defsubr (&Sdecode_coding_region);
6197   defsubr (&Sencode_coding_region);
6198   defsubr (&Sdecode_coding_string);
6199   defsubr (&Sencode_coding_string);
6200   defsubr (&Sdecode_sjis_char);
6201   defsubr (&Sencode_sjis_char);
6202   defsubr (&Sdecode_big5_char);
6203   defsubr (&Sencode_big5_char);
6204   defsubr (&Sset_terminal_coding_system_internal);
6205   defsubr (&Sset_safe_terminal_coding_system_internal);
6206   defsubr (&Sterminal_coding_system);
6207   defsubr (&Sset_keyboard_coding_system_internal);
6208   defsubr (&Skeyboard_coding_system);
6209   defsubr (&Sfind_operation_coding_system);
6210   defsubr (&Supdate_coding_systems_internal);
6211   defsubr (&Sset_coding_priority_internal);
6212
6213   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6214     "List of coding systems.\n\
6215 \n\
6216 Do not alter the value of this variable manually.  This variable should be\n\
6217 updated by the functions `make-coding-system' and\n\
6218 `define-coding-system-alias'.");
6219   Vcoding_system_list = Qnil;
6220
6221   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6222     "Alist of coding system names.\n\
6223 Each element is one element list of coding system name.\n\
6224 This variable is given to `completing-read' as TABLE argument.\n\
6225 \n\
6226 Do not alter the value of this variable manually.  This variable should be\n\
6227 updated by the functions `make-coding-system' and\n\
6228 `define-coding-system-alias'.");
6229   Vcoding_system_alist = Qnil;
6230
6231   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6232     "List of coding-categories (symbols) ordered by priority.");
6233   {
6234     int i;
6235
6236     Vcoding_category_list = Qnil;
6237     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6238       Vcoding_category_list
6239         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6240                  Vcoding_category_list);
6241   }
6242
6243   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6244     "Specify the coding system for read operations.\n\
6245 It is useful to bind this variable with `let', but do not set it globally.\n\
6246 If the value is a coding system, it is used for decoding on read operation.\n\
6247 If not, an appropriate element is used from one of the coding system alists:\n\
6248 There are three such tables, `file-coding-system-alist',\n\
6249 `process-coding-system-alist', and `network-coding-system-alist'.");
6250   Vcoding_system_for_read = Qnil;
6251
6252   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6253     "Specify the coding system for write operations.\n\
6254 Programs bind this variable with `let', but you should not set it globally.\n\
6255 If the value is a coding system, it is used for encoding of output,\n\
6256 when writing it to a file and when sending it to a file or subprocess.\n\
6257 \n\
6258 If this does not specify a coding system, an appropriate element\n\
6259 is used from one of the coding system alists:\n\
6260 There are three such tables, `file-coding-system-alist',\n\
6261 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6262 For output to files, if the above procedure does not specify a coding system,\n\
6263 the value of `buffer-file-coding-system' is used.");
6264   Vcoding_system_for_write = Qnil;
6265
6266   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6267     "Coding system used in the latest file or process I/O.");
6268   Vlast_coding_system_used = Qnil;
6269
6270   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6271     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6272 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6273 such conversion.");
6274   inhibit_eol_conversion = 0;
6275
6276   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6277     "Non-nil means process buffer inherits coding system of process output.\n\
6278 Bind it to t if the process output is to be treated as if it were a file\n\
6279 read from some filesystem.");
6280   inherit_process_coding_system = 0;
6281
6282   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6283     "Alist to decide a coding system to use for a file I/O operation.\n\
6284 The format is ((PATTERN . VAL) ...),\n\
6285 where PATTERN is a regular expression matching a file name,\n\
6286 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6287 If VAL is a coding system, it is used for both decoding and encoding\n\
6288 the file contents.\n\
6289 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6290 and the cdr part is used for encoding.\n\
6291 If VAL is a function symbol, the function must return a coding system\n\
6292 or a cons of coding systems which are used as above.\n\
6293 \n\
6294 See also the function `find-operation-coding-system'\n\
6295 and the variable `auto-coding-alist'.");
6296   Vfile_coding_system_alist = Qnil;
6297
6298   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6299     "Alist to decide a coding system to use for a process I/O operation.\n\
6300 The format is ((PATTERN . VAL) ...),\n\
6301 where PATTERN is a regular expression matching a program name,\n\
6302 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6303 If VAL is a coding system, it is used for both decoding what received\n\
6304 from the program and encoding what sent to the program.\n\
6305 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6306 and the cdr part is used for encoding.\n\
6307 If VAL is a function symbol, the function must return a coding system\n\
6308 or a cons of coding systems which are used as above.\n\
6309 \n\
6310 See also the function `find-operation-coding-system'.");
6311   Vprocess_coding_system_alist = Qnil;
6312
6313   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6314     "Alist to decide a coding system to use for a network I/O operation.\n\
6315 The format is ((PATTERN . VAL) ...),\n\
6316 where PATTERN is a regular expression matching a network service name\n\
6317 or is a port number to connect to,\n\
6318 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6319 If VAL is a coding system, it is used for both decoding what received\n\
6320 from the network stream and encoding what sent to the network stream.\n\
6321 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6322 and the cdr part is used for encoding.\n\
6323 If VAL is a function symbol, the function must return a coding system\n\
6324 or a cons of coding systems which are used as above.\n\
6325 \n\
6326 See also the function `find-operation-coding-system'.");
6327   Vnetwork_coding_system_alist = Qnil;
6328
6329   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6330     "Coding system to use with system messages.");
6331   Vlocale_coding_system = Qnil;
6332
6333   /* The eol mnemonics are reset in startup.el system-dependently.  */
6334   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6335     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6336   eol_mnemonic_unix = build_string (":");
6337
6338   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6339     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6340   eol_mnemonic_dos = build_string ("\\");
6341
6342   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6343     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6344   eol_mnemonic_mac = build_string ("/");
6345
6346   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6347     "*String displayed in mode line when end-of-line format is not yet determined.");
6348   eol_mnemonic_undecided = build_string (":");
6349
6350   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6351     "*Non-nil enables character translation while encoding and decoding.");
6352   Venable_character_translation = Qt;
6353
6354   DEFVAR_LISP ("standard-translation-table-for-decode",
6355     &Vstandard_translation_table_for_decode,
6356     "Table for translating characters while decoding.");
6357   Vstandard_translation_table_for_decode = Qnil;
6358
6359   DEFVAR_LISP ("standard-translation-table-for-encode",
6360     &Vstandard_translation_table_for_encode,
6361     "Table for translationg characters while encoding.");
6362   Vstandard_translation_table_for_encode = Qnil;
6363
6364   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6365     "Alist of charsets vs revision numbers.\n\
6366 While encoding, if a charset (car part of an element) is found,\n\
6367 designate it with the escape sequence identifing revision (cdr part of the element).");
6368   Vcharset_revision_alist = Qnil;
6369
6370   DEFVAR_LISP ("default-process-coding-system",
6371                &Vdefault_process_coding_system,
6372     "Cons of coding systems used for process I/O by default.\n\
6373 The car part is used for decoding a process output,\n\
6374 the cdr part is used for encoding a text to be sent to a process.");
6375   Vdefault_process_coding_system = Qnil;
6376
6377   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6378     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6379 This is a vector of length 256.\n\
6380 If Nth element is non-nil, the existence of code N in a file\n\
6381 \(or output of subprocess) doesn't prevent it to be detected as\n\
6382 a coding system of ISO 2022 variant which has a flag\n\
6383 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6384 or reading output of a subprocess.\n\
6385 Only 128th through 159th elements has a meaning.");
6386   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6387
6388   DEFVAR_LISP ("select-safe-coding-system-function",
6389                &Vselect_safe_coding_system_function,
6390     "Function to call to select safe coding system for encoding a text.\n\
6391 \n\
6392 If set, this function is called to force a user to select a proper\n\
6393 coding system which can encode the text in the case that a default\n\
6394 coding system used in each operation can't encode the text.\n\
6395 \n\
6396 The default value is `select-safe-coding-system' (which see).");
6397   Vselect_safe_coding_system_function = Qnil;
6398
6399 }
6400
6401 char *
6402 emacs_strerror (error_number)
6403      int error_number;
6404 {
6405   char *str;
6406
6407   synchronize_system_messages_locale ();
6408   str = strerror (error_number);
6409
6410   if (! NILP (Vlocale_coding_system))
6411     {
6412       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6413                                                       Vlocale_coding_system,
6414                                                       0);
6415       str = (char *) XSTRING (dec)->data;
6416     }
6417
6418   return str;
6419 }
6420
6421 #endif /* emacs */
6422