src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-internal), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Other
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See section 6 for more details.
  81
  82 */
  83
  84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  85
  86   How end-of-line of a text is encoded depends on a system.  For
  87   instance, Unix's format is just one byte of `line-feed' code,
  88   whereas DOS's format is two-byte sequence of `carriage-return' and
  89   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  90
  91   Since text characters encoding and end-of-line encoding are
  92   independent, any coding system described above can take
  93   any format of end-of-line.  So, Emacs has information of format of
  94   end-of-line in each coding-system.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  99
 100   These functions check if a text between SRC and SRC_END is encoded
 101   in the coding system category XXX.  Each returns an integer value in
 102   which appropriate flag bits for the category XXX is set.  The flag
 103   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 104   template of these functions.  */
 105 #if 0
 106 int
 107 detect_coding_emacs_mule (src, src_end)
 108      unsigned char *src, *src_end;
 109 {
 110   ...
 111 }
 112 #endif
 113
 114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 115
 116   These functions decode SRC_BYTES length text at SOURCE encoded in
 117   CODING to Emacs' internal format (emacs-mule).  The resulting text
 118   goes to a place pointed to by DESTINATION, the length of which should
 119   not exceed DST_BYTES.  The number of bytes actually processed is
 120   returned as *CONSUMED.  The return value is the length of the decoded
 121   text.  Below is a template of these functions.  */
 122 #if 0
 123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 124      struct coding_system *coding;
 125      unsigned char *source, *destination;
 126      int src_bytes, dst_bytes;
 127      int *consumed;
 128 {
 129   ...
 130 }
 131 #endif
 132
 133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 134
 135   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 136   internal format (emacs-mule) to CODING.  The resulting text goes to
 137   a place pointed to by DESTINATION, the length of which should not
 138   exceed DST_BYTES.  The number of bytes actually processed is
 139   returned as *CONSUMED.  The return value is the length of the
 140   encoded text.  Below is a template of these functions.  */
 141 #if 0
 142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 143      struct coding_system *coding;
 144      unsigned char *source, *destination;
 145      int src_bytes, dst_bytes;
 146      int *consumed;
 147 {
 148   ...
 149 }
 150 #endif
 151
 152 /*** COMMONLY USED MACROS ***/
 153
 154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 155    THREE_MORE_BYTES safely get one, two, and three bytes from the
 156    source text respectively.  If there are not enough bytes in the
 157    source, they jump to `label_end_of_loop'.  The caller should set
 158    variables `src' and `src_end' to appropriate areas in advance.  */
 159
 160 #define ONE_MORE_BYTE(c1)       \
 161   do {                          \
 162     if (src < src_end)          \
 163       c1 = *src++;              \
 164     else                        \
 165       goto label_end_of_loop;   \
 166   } while (0)
 167
 168 #define TWO_MORE_BYTES(c1, c2)  \
 169   do {                          \
 170     if (src + 1 < src_end)      \
 171       c1 = *src++, c2 = *src++; \
 172     else                        \
 173       goto label_end_of_loop;   \
 174   } while (0)
 175
 176 #define THREE_MORE_BYTES(c1, c2, c3)            \
 177   do {                                          \
 178     if (src + 2 < src_end)                      \
 179       c1 = *src++, c2 = *src++, c3 = *src++;    \
 180     else                                        \
 181       goto label_end_of_loop;                   \
 182   } while (0)
 183
 184 /* The following three macros DECODE_CHARACTER_ASCII,
 185    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 186    the multi-byte form of a character of each class at the place
 187    pointed by `dst'.  The caller should set the variable `dst' to
 188    point to an appropriate area and the variable `coding' to point to
 189    the coding-system of the currently decoding text in advance.  */
 190
 191 /* Decode one ASCII character C.  */
 192
 193 #define DECODE_CHARACTER_ASCII(c)                               \
 194   do {                                                          \
 195     if (COMPOSING_P (coding->composing))                        \
 196       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 197     else                                                        \
 198       *dst++ = (c);                                             \
 199   } while (0)
 200
 201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 202    position-code is C.  */
 203
 204 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 205   do {                                                                  \
 206     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 207     if (COMPOSING_P (coding->composing))                                \
 208       *dst++ = leading_code + 0x20;                                     \
 209     else                                                                \
 210       *dst++ = leading_code;                                            \
 211     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 212       *dst++ = leading_code;                                            \
 213     *dst++ = (c) | 0x80;                                                \
 214   } while (0)
 215
 216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 217    position-codes are C1 and C2.  */
 218
 219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 220   do {                                                  \
 221     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 222     *dst++ = (c2) | 0x80;                               \
 223   } while (0)
 224
 225 \f
 226 /*** 1. Preamble ***/
 227
 228 #include <stdio.h>
 229
 230 #ifdef emacs
 231
 232 #include <config.h>
 233 #include "lisp.h"
 234 #include "buffer.h"
 235 #include "charset.h"
 236 #include "ccl.h"
 237 #include "coding.h"
 238 #include "window.h"
 239
 240 #else  /* not emacs */
 241
 242 #include "mulelib.h"
 243
 244 #endif /* not emacs */
 245
 246 Lisp_Object Qcoding_system, Qeol_type;
 247 Lisp_Object Qbuffer_file_coding_system;
 248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 249
 250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 252 Lisp_Object Qstart_process, Qopen_network_stream;
 253 Lisp_Object Qtarget_idx;
 254
 255 /* Mnemonic character of each format of end-of-line.  */
 256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 257 /* Mnemonic character to indicate format of end-of-line is not yet
 258    decided.  */
 259 int eol_mnemonic_undecided;
 260
 261 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 262    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 263 int system_eol_type;
 264
 265 #ifdef emacs
 266
 267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 268
 269 /* Coding system emacs-mule is for converting only end-of-line format.  */
 270 Lisp_Object Qemacs_mule;
 271
 272 /* Coding-systems are handed between Emacs Lisp programs and C internal
 273    routines by the following three variables.  */
 274 /* Coding-system for reading files and receiving data from process.  */
 275 Lisp_Object Vcoding_system_for_read;
 276 /* Coding-system for writing files and sending data to process.  */
 277 Lisp_Object Vcoding_system_for_write;
 278 /* Coding-system actually used in the latest I/O.  */
 279 Lisp_Object Vlast_coding_system_used;
 280
 281 /* Flag to inhibit code conversion of end-of-line format.  */
 282 int inhibit_eol_conversion;
 283
 284 /* Coding-system of what terminal accept for displaying.  */
 285 struct coding_system terminal_coding;
 286
 287 /* Coding-system of what is sent from terminal keyboard.  */
 288 struct coding_system keyboard_coding;
 289
 290 Lisp_Object Vfile_coding_system_alist;
 291 Lisp_Object Vprocess_coding_system_alist;
 292 Lisp_Object Vnetwork_coding_system_alist;
 293
 294 #endif /* emacs */
 295
 296 Lisp_Object Qcoding_category_index;
 297
 298 /* List of symbols `coding-category-xxx' ordered by priority.  */
 299 Lisp_Object Vcoding_category_list;
 300
 301 /* Table of coding-systems currently assigned to each coding-category.  */
 302 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 303
 304 /* Table of names of symbol for each coding-category.  */
 305 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 306   "coding-category-emacs-mule",
 307   "coding-category-sjis",
 308   "coding-category-iso-7",
 309   "coding-category-iso-8-1",
 310   "coding-category-iso-8-2",
 311   "coding-category-iso-7-else",
 312   "coding-category-iso-8-else",
 313   "coding-category-big5",
 314   "coding-category-binary"
 315 };
 316
 317 /* Flag to tell if we look up unification table on character code
 318    conversion.  */
 319 Lisp_Object Venable_character_unification;
 320 /* Standard unification table to look up on decoding (reading).  */
 321 Lisp_Object Vstandard_character_unification_table_for_decode;
 322 /* Standard unification table to look up on encoding (writing).  */
 323 Lisp_Object Vstandard_character_unification_table_for_encode;
 324
 325 Lisp_Object Qcharacter_unification_table;
 326 Lisp_Object Qcharacter_unification_table_for_decode;
 327 Lisp_Object Qcharacter_unification_table_for_encode;
 328
 329 /* Alist of charsets vs revision number.  */
 330 Lisp_Object Vcharset_revision_alist;
 331
 332 /* Default coding systems used for process I/O.  */
 333 Lisp_Object Vdefault_process_coding_system;
 334
 335 \f
 336 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 337
 338 /* Emacs' internal format for encoding multiple character sets is a
 339    kind of multi-byte encoding, i.e. characters are encoded by
 340    variable-length sequences of one-byte codes.  ASCII characters
 341    and control characters (e.g. `tab', `newline') are represented by
 342    one-byte sequences which are their ASCII codes, in the range 0x00
 343    through 0x7F.  The other characters are represented by a sequence
 344    of `base leading-code', optional `extended leading-code', and one
 345    or two `position-code's.  The length of the sequence is determined
 346    by the base leading-code.  Leading-code takes the range 0x80
 347    through 0x9F, whereas extended leading-code and position-code take
 348    the range 0xA0 through 0xFF.  See `charset.h' for more details
 349    about leading-code and position-code.
 350
 351    There's one exception to this rule.  Special leading-code
 352    `leading-code-composition' denotes that the following several
 353    characters should be composed into one character.  Leading-codes of
 354    components (except for ASCII) are added 0x20.  An ASCII character
 355    component is represented by a 2-byte sequence of `0xA0' and
 356    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 357    details of composite character.  Hence, we can summarize the code
 358    range as follows:
 359
 360    --- CODE RANGE of Emacs' internal format ---
 361    (character set)      (range)
 362    ASCII                0x00 .. 0x7F
 363    ELSE (1st byte)      0x80 .. 0x9F
 364         (rest bytes)    0xA0 .. 0xFF
 365    ---------------------------------------------
 366
 367   */
 368
 369 enum emacs_code_class_type emacs_code_class[256];
 370
 371 /* Go to the next statement only if *SRC is accessible and the code is
 372    greater than 0xA0.  */
 373 #define CHECK_CODE_RANGE_A0_FF  \
 374   do {                          \
 375     if (src >= src_end)         \
 376       goto label_end_of_switch; \
 377     else if (*src++ < 0xA0)     \
 378       return 0;                 \
 379   } while (0)
 380
 381 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 382    Check if a text is encoded in Emacs' internal format.  If it is,
 383    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 384
 385 int
 386 detect_coding_emacs_mule (src, src_end)
 387      unsigned char *src, *src_end;
 388 {
 389   unsigned char c;
 390   int composing = 0;
 391
 392   while (src < src_end)
 393     {
 394       c = *src++;
 395
 396       if (composing)
 397         {
 398           if (c < 0xA0)
 399             composing = 0;
 400           else
 401             c -= 0x20;
 402         }
 403
 404       switch (emacs_code_class[c])
 405         {
 406         case EMACS_ascii_code:
 407         case EMACS_linefeed_code:
 408           break;
 409
 410         case EMACS_control_code:
 411           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 412             return 0;
 413           break;
 414
 415         case EMACS_invalid_code:
 416           return 0;
 417
 418         case EMACS_leading_code_composition: /* c == 0x80 */
 419           if (composing)
 420             CHECK_CODE_RANGE_A0_FF;
 421           else
 422             composing = 1;
 423           break;
 424
 425         case EMACS_leading_code_4:
 426           CHECK_CODE_RANGE_A0_FF;
 427           /* fall down to check it two more times ...  */
 428
 429         case EMACS_leading_code_3:
 430           CHECK_CODE_RANGE_A0_FF;
 431           /* fall down to check it one more time ...  */
 432
 433         case EMACS_leading_code_2:
 434           CHECK_CODE_RANGE_A0_FF;
 435           break;
 436
 437         default:
 438         label_end_of_switch:
 439           break;
 440         }
 441     }
 442   return CODING_CATEGORY_MASK_EMACS_MULE;
 443 }
 444
 445 \f
 446 /*** 3. ISO2022 handlers ***/
 447
 448 /* The following note describes the coding system ISO2022 briefly.
 449    Since the intention of this note is to help in understanding of
 450    the programs in this file, some parts are NOT ACCURATE or OVERLY
 451    SIMPLIFIED.  For the thorough understanding, please refer to the
 452    original document of ISO2022.
 453
 454    ISO2022 provides many mechanisms to encode several character sets
 455    in 7-bit and 8-bit environment.  If one chooses 7-bite environment,
 456    all text is encoded by codes of less than 128.  This may make the
 457    encoded text a little bit longer, but the text gets more stability
 458    to pass through several gateways (some of them strip off the MSB).
 459
 460    There are two kinds of character set: control character set and
 461    graphic character set.  The former contains control characters such
 462    as `newline' and `escape' to provide control functions (control
 463    functions are provided also by escape sequences).  The latter
 464    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 465    two control character sets and many graphic character sets.
 466
 467    Graphic character sets are classified into one of the following
 468    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 469    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 470    bytes (DIMENSION) and the number of characters in one dimension
 471    (CHARS) of the set.  In addition, each character set is assigned an
 472    identification tag (called "final character" and denoted as <F>
 473    here after) which is unique in each class.  <F> of each character
 474    set is decided by ECMA(*) when it is registered in ISO.  Code range
 475    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 476
 477    Note (*): ECMA = European Computer Manufacturers Association
 478
 479    Here are examples of graphic character set [NAME(<F>)]:
 480         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 481         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 482         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 483         o DIMENSION2_CHARS96 -- none for the moment
 484
 485    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 486         C0 [0x00..0x1F] -- control character plane 0
 487         GL [0x20..0x7F] -- graphic character plane 0
 488         C1 [0x80..0x9F] -- control character plane 1
 489         GR [0xA0..0xFF] -- graphic character plane 1
 490
 491    A control character set is directly designated and invoked to C0 or
 492    C1 by an escape sequence.  The most common case is that ISO646's
 493    control character set is designated/invoked to C0 and ISO6429's
 494    control character set is designated/invoked to C1, and usually
 495    these designations/invocations are omitted in a coded text.  With
 496    7-bit environment, only C0 can be used, and a control character for
 497    C1 is encoded by an appropriate escape sequence to fit in the
 498    environment.  All control characters for C1 are defined the
 499    corresponding escape sequences.
 500
 501    A graphic character set is at first designated to one of four
 502    graphic registers (G0 through G3), then these graphic registers are
 503    invoked to GL or GR.  These designations and invocations can be
 504    done independently.  The most common case is that G0 is invoked to
 505    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 506    these invocations and designations are omitted in a coded text.
 507    With 7-bit environment, only GL can be used.
 508
 509    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 510    and 0x7F of GL area work as control characters SPACE and DEL
 511    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 512
 513    There are two ways of invocation: locking-shift and single-shift.
 514    With locking-shift, the invocation lasts until the next different
 515    invocation, whereas with single-shift, the invocation works only
 516    for the following character and doesn't affect locking-shift.
 517    Invocations are done by the following control characters or escape
 518    sequences.
 519
 520    ----------------------------------------------------------------------
 521    function             control char    escape sequence description
 522    ----------------------------------------------------------------------
 523    SI  (shift-in)               0x0F    none            invoke G0 to GL
 524    SI  (shift-out)              0x0E    none            invoke G1 to GL
 525    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 526    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 527    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 528    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 529    ----------------------------------------------------------------------
 530    The first four are for locking-shift.  Control characters for these
 531    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 532
 533    Designations are done by the following escape sequences.
 534    ----------------------------------------------------------------------
 535    escape sequence      description
 536    ----------------------------------------------------------------------
 537    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 538    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 539    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 540    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 541    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 542    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 543    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 544    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 545    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 546    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 547    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 548    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 549    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 550    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 551    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 552    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 553    ----------------------------------------------------------------------
 554
 555    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 556    of dimension 1, chars 94, and final character <F>, and etc.
 557
 558    Note (*): Although these designations are not allowed in ISO2022,
 559    Emacs accepts them on decoding, and produces them on encoding
 560    CHARS96 character set in a coding system which is characterized as
 561    7-bit environment, non-locking-shift, and non-single-shift.
 562
 563    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 564    '(' can be omitted.  We call this as "short-form" here after.
 565
 566    Now you may notice that there are a lot of ways for encoding the
 567    same multilingual text in ISO2022.  Actually, there exists many
 568    coding systems such as Compound Text (used in X's inter client
 569    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 570    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 571    localized platforms), and all of these are variants of ISO2022.
 572
 573    In addition to the above, Emacs handles two more kinds of escape
 574    sequences: ISO6429's direction specification and Emacs' private
 575    sequence for specifying character composition.
 576
 577    ISO6429's direction specification takes the following format:
 578         o CSI ']'      -- end of the current direction
 579         o CSI '0' ']'  -- end of the current direction
 580         o CSI '1' ']'  -- start of left-to-right text
 581         o CSI '2' ']'  -- start of right-to-left text
 582    The control character CSI (0x9B: control sequence introducer) is
 583    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 584
 585    Character composition specification takes the following format:
 586         o ESC '0' -- start character composition
 587         o ESC '1' -- end character composition
 588    Since these are not standard escape sequences of any ISO, the use
 589    of them for these meaning is restricted to Emacs only.  */
 590
 591 enum iso_code_class_type iso_code_class[256];
 592
 593 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 594    Check if a text is encoded in ISO2022.  If it is, returns an
 595    integer in which appropriate flag bits any of:
 596         CODING_CATEGORY_MASK_ISO_7
 597         CODING_CATEGORY_MASK_ISO_8_1
 598         CODING_CATEGORY_MASK_ISO_8_2
 599         CODING_CATEGORY_MASK_ISO_7_ELSE
 600         CODING_CATEGORY_MASK_ISO_8_ELSE
 601    are set.  If a code which should never appear in ISO2022 is found,
 602    returns 0.  */
 603
 604 int
 605 detect_coding_iso2022 (src, src_end)
 606      unsigned char *src, *src_end;
 607 {
 608   int mask = (CODING_CATEGORY_MASK_ISO_7
 609               | CODING_CATEGORY_MASK_ISO_8_1
 610               | CODING_CATEGORY_MASK_ISO_8_2
 611               | CODING_CATEGORY_MASK_ISO_7_ELSE
 612               | CODING_CATEGORY_MASK_ISO_8_ELSE
 613               );
 614   int g1 = 0;                   /* 1 iff designating to G1.  */
 615   int c, i;
 616
 617   while (src < src_end)
 618     {
 619       c = *src++;
 620       switch (c)
 621         {
 622         case ISO_CODE_ESC:
 623           if (src >= src_end)
 624             break;
 625           c = *src++;
 626           if (src < src_end
 627               && ((c >= '(' && c <= '/')
 628                   || c == '$' && ((*src >= '(' && *src <= '/')
 629                                   || (*src >= '@' && *src <= 'B'))))
 630             {
 631               /* Valid designation sequence.  */
 632               if (c == ')' || (c == '$' && *src == ')'))
 633                 {
 634                   g1 = 1;
 635                   mask &= ~(CODING_CATEGORY_MASK_ISO_7
 636                             | CODING_CATEGORY_MASK_ISO_7_ELSE);
 637                 }
 638               src++;
 639               break;
 640             }
 641           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 642             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 643                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 644           break;
 645
 646         case ISO_CODE_SO:
 647           if (g1)
 648             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 649                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 650           break;
 651
 652         case ISO_CODE_CSI:
 653         case ISO_CODE_SS2:
 654         case ISO_CODE_SS3:
 655           mask &= ~(CODING_CATEGORY_MASK_ISO_7
 656                     | CODING_CATEGORY_MASK_ISO_7_ELSE);
 657           break;
 658
 659         default:
 660           if (c < 0x80)
 661             break;
 662           else if (c < 0xA0)
 663             return 0;
 664           else
 665             {
 666               unsigned char *src_begin = src;
 667
 668               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 669                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 670               while (src < src_end && *src >= 0xA0)
 671                 src++;
 672               if ((src - src_begin - 1) & 1 && src < src_end)
 673                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 674             }
 675           break;
 676         }
 677     }
 678
 679   return mask;
 680 }
 681
 682 /* Decode a character of which charset is CHARSET and the 1st position
 683    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 684    fetched from SRC and set to C2.  If CHARSET is negative, it means
 685    that we are decoding ill formed text, and what we can do is just to
 686    read C1 as is.  */
 687
 688 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 689   do {                                                                  \
 690     int c_alt, charset_alt = (charset);                                 \
 691     if (COMPOSING_HEAD_P (coding->composing))                           \
 692       {                                                                 \
 693         *dst++ = LEADING_CODE_COMPOSITION;                              \
 694         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 695           /* To tell composition rules are embeded.  */                 \
 696           *dst++ = 0xFF;                                                \
 697         coding->composing += 2;                                         \
 698       }                                                                 \
 699     if ((charset) >= 0)                                                 \
 700       {                                                                 \
 701         if (CHARSET_DIMENSION (charset) == 2)                           \
 702           ONE_MORE_BYTE (c2);                                           \
 703         if (!NILP (unification_table)                                   \
 704             && ((c_alt = unify_char (unification_table,                 \
 705                                      -1, (charset), c1, c2)) >= 0))     \
 706           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 707       }                                                                 \
 708     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 709       DECODE_CHARACTER_ASCII (c1);                                      \
 710     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 711       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 712     else                                                                \
 713       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 714     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 715       /* To tell a composition rule follows.  */                        \
 716       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 717   } while (0)
 718
 719 /* Set designation state into CODING.  */
 720 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 721   do {                                                                  \
 722     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 723                                      make_number (chars),               \
 724                                      make_number (final_char));         \
 725     if (charset >= 0)                                                   \
 726       {                                                                 \
 727         if (coding->direction == 1                                      \
 728             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 729           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 730         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 731       }                                                                 \
 732   } while (0)
 733
 734 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 735
 736 int
 737 decode_coding_iso2022 (coding, source, destination,
 738                        src_bytes, dst_bytes, consumed)
 739      struct coding_system *coding;
 740      unsigned char *source, *destination;
 741      int src_bytes, dst_bytes;
 742      int *consumed;
 743 {
 744   unsigned char *src = source;
 745   unsigned char *src_end = source + src_bytes;
 746   unsigned char *dst = destination;
 747   unsigned char *dst_end = destination + dst_bytes;
 748   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 749      from DST_END to assure that overflow checking is necessary only
 750      at the head of loop.  */
 751   unsigned char *adjusted_dst_end = dst_end - 6;
 752   int charset;
 753   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 754   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 755   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 756   Lisp_Object unification_table
 757       = coding->character_unification_table_for_decode;
 758
 759   if (!NILP (Venable_character_unification) && NILP (unification_table))
 760     unification_table = Vstandard_character_unification_table_for_decode;
 761
 762   while (src < src_end && dst < adjusted_dst_end)
 763     {
 764       /* SRC_BASE remembers the start position in source in each loop.
 765          The loop will be exited when there's not enough source text
 766          to analyze long escape sequence or 2-byte code (within macros
 767          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 768          to SRC_BASE before exiting.  */
 769       unsigned char *src_base = src;
 770       int c1 = *src++, c2;
 771
 772       switch (iso_code_class [c1])
 773         {
 774         case ISO_0x20_or_0x7F:
 775           if (!coding->composing
 776               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 777             {
 778               /* This is SPACE or DEL.  */
 779               *dst++ = c1;
 780               break;
 781             }
 782           /* This is a graphic character, we fall down ...  */
 783
 784         case ISO_graphic_plane_0:
 785           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 786             {
 787               /* This is a composition rule.  */
 788               *dst++ = c1 | 0x80;
 789               coding->composing = COMPOSING_WITH_RULE_TAIL;
 790             }
 791           else
 792             DECODE_ISO_CHARACTER (charset0, c1);
 793           break;
 794
 795         case ISO_0xA0_or_0xFF:
 796           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 797             {
 798               /* Invalid code.  */
 799               *dst++ = c1;
 800               break;
 801             }
 802           /* This is a graphic character, we fall down ... */
 803
 804         case ISO_graphic_plane_1:
 805           DECODE_ISO_CHARACTER (charset1, c1);
 806           break;
 807
 808         case ISO_control_code:
 809           /* All ISO2022 control characters in this class have the
 810              same representation in Emacs internal format.  */
 811           *dst++ = c1;
 812           break;
 813
 814         case ISO_carriage_return:
 815           if (coding->eol_type == CODING_EOL_CR)
 816             {
 817               *dst++ = '\n';
 818             }
 819           else if (coding->eol_type == CODING_EOL_CRLF)
 820             {
 821               ONE_MORE_BYTE (c1);
 822               if (c1 == ISO_CODE_LF)
 823                 *dst++ = '\n';
 824               else
 825                 {
 826                   src--;
 827                   *dst++ = c1;
 828                 }
 829             }
 830           else
 831             {
 832               *dst++ = c1;
 833             }
 834           break;
 835
 836         case ISO_shift_out:
 837           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 838             goto label_invalid_escape_sequence;
 839           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 840           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 841           break;
 842
 843         case ISO_shift_in:
 844           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 845           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 846           break;
 847
 848         case ISO_single_shift_2_7:
 849         case ISO_single_shift_2:
 850           /* SS2 is handled as an escape sequence of ESC 'N' */
 851           c1 = 'N';
 852           goto label_escape_sequence;
 853
 854         case ISO_single_shift_3:
 855           /* SS2 is handled as an escape sequence of ESC 'O' */
 856           c1 = 'O';
 857           goto label_escape_sequence;
 858
 859         case ISO_control_sequence_introducer:
 860           /* CSI is handled as an escape sequence of ESC '[' ...  */
 861           c1 = '[';
 862           goto label_escape_sequence;
 863
 864         case ISO_escape:
 865           ONE_MORE_BYTE (c1);
 866         label_escape_sequence:
 867           /* Escape sequences handled by Emacs are invocation,
 868              designation, direction specification, and character
 869              composition specification.  */
 870           switch (c1)
 871             {
 872             case '&':           /* revision of following character set */
 873               ONE_MORE_BYTE (c1);
 874               if (!(c1 >= '@' && c1 <= '~'))
 875                 goto label_invalid_escape_sequence;
 876               ONE_MORE_BYTE (c1);
 877               if (c1 != ISO_CODE_ESC)
 878                 goto label_invalid_escape_sequence;
 879               ONE_MORE_BYTE (c1);
 880               goto label_escape_sequence;
 881
 882             case '$':           /* designation of 2-byte character set */
 883               ONE_MORE_BYTE (c1);
 884               if (c1 >= '@' && c1 <= 'B')
 885                 {       /* designation of JISX0208.1978, GB2312.1980,
 886                                    or JISX0208.1980 */
 887                   DECODE_DESIGNATION (0, 2, 94, c1);
 888                 }
 889               else if (c1 >= 0x28 && c1 <= 0x2B)
 890                 {       /* designation of DIMENSION2_CHARS94 character set */
 891                   ONE_MORE_BYTE (c2);
 892                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 893                 }
 894               else if (c1 >= 0x2C && c1 <= 0x2F)
 895                 {       /* designation of DIMENSION2_CHARS96 character set */
 896                   ONE_MORE_BYTE (c2);
 897                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 898                 }
 899               else
 900                 goto label_invalid_escape_sequence;
 901               break;
 902
 903             case 'n':           /* invocation of locking-shift-2 */
 904               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 905                 goto label_invalid_escape_sequence;
 906               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 907               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 908               break;
 909
 910             case 'o':           /* invocation of locking-shift-3 */
 911               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 912                 goto label_invalid_escape_sequence;
 913               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 914               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 915               break;
 916
 917             case 'N':           /* invocation of single-shift-2 */
 918               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 919                 goto label_invalid_escape_sequence;
 920               ONE_MORE_BYTE (c1);
 921               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 922               DECODE_ISO_CHARACTER (charset, c1);
 923               break;
 924
 925             case 'O':           /* invocation of single-shift-3 */
 926               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 927                 goto label_invalid_escape_sequence;
 928               ONE_MORE_BYTE (c1);
 929               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 930               DECODE_ISO_CHARACTER (charset, c1);
 931               break;
 932
 933             case '0':           /* start composing without embeded rules */
 934               coding->composing = COMPOSING_NO_RULE_HEAD;
 935               break;
 936
 937             case '1':           /* end composing */
 938               coding->composing = COMPOSING_NO;
 939               break;
 940
 941             case '2':           /* start composing with embeded rules */
 942               coding->composing = COMPOSING_WITH_RULE_HEAD;
 943               break;
 944
 945             case '[':           /* specification of direction */
 946               /* For the moment, nested direction is not supported.
 947                  So, the value of `coding->direction' is 0 or 1: 0
 948                  means left-to-right, 1 means right-to-left.  */
 949               ONE_MORE_BYTE (c1);
 950               switch (c1)
 951                 {
 952                 case ']':       /* end of the current direction */
 953                   coding->direction = 0;
 954
 955                 case '0':       /* end of the current direction */
 956                 case '1':       /* start of left-to-right direction */
 957                   ONE_MORE_BYTE (c1);
 958                   if (c1 == ']')
 959                     coding->direction = 0;
 960                   else
 961                     goto label_invalid_escape_sequence;
 962                   break;
 963
 964                 case '2':       /* start of right-to-left direction */
 965                   ONE_MORE_BYTE (c1);
 966                   if (c1 == ']')
 967                     coding->direction= 1;
 968                   else
 969                     goto label_invalid_escape_sequence;
 970                   break;
 971
 972                 default:
 973                   goto label_invalid_escape_sequence;
 974                 }
 975               break;
 976
 977             default:
 978               if (c1 >= 0x28 && c1 <= 0x2B)
 979                 {       /* designation of DIMENSION1_CHARS94 character set */
 980                   ONE_MORE_BYTE (c2);
 981                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 982                 }
 983               else if (c1 >= 0x2C && c1 <= 0x2F)
 984                 {       /* designation of DIMENSION1_CHARS96 character set */
 985                   ONE_MORE_BYTE (c2);
 986                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 987                 }
 988               else
 989                 {
 990                   goto label_invalid_escape_sequence;
 991                 }
 992             }
 993           /* We must update these variables now.  */
 994           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 995           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 996           break;
 997
 998         label_invalid_escape_sequence:
 999           {
1000             int length = src - src_base;
1001
1002             bcopy (src_base, dst, length);
1003             dst += length;
1004           }
1005         }
1006       continue;
1007
1008     label_end_of_loop:
1009       coding->carryover_size = src - src_base;
1010       bcopy (src_base, coding->carryover, coding->carryover_size);
1011       src = src_base;
1012       break;
1013     }
1014
1015   /* If this is the last block of the text to be decoded, we had
1016      better just flush out all remaining codes in the text although
1017      they are not valid characters.  */
1018   if (coding->last_block)
1019     {
1020       bcopy (src, dst, src_end - src);
1021       dst += (src_end - src);
1022       src = src_end;
1023     }
1024   *consumed = src - source;
1025   return dst - destination;
1026 }
1027
1028 /* ISO2022 encoding stuff.  */
1029
1030 /*
1031    It is not enough to say just "ISO2022" on encoding, we have to
1032    specify more details.  In Emacs, each coding-system of ISO2022
1033    variant has the following specifications:
1034         1. Initial designation to G0 thru G3.
1035         2. Allows short-form designation?
1036         3. ASCII should be designated to G0 before control characters?
1037         4. ASCII should be designated to G0 at end of line?
1038         5. 7-bit environment or 8-bit environment?
1039         6. Use locking-shift?
1040         7. Use Single-shift?
1041    And the following two are only for Japanese:
1042         8. Use ASCII in place of JIS0201-1976-Roman?
1043         9. Use JISX0208-1983 in place of JISX0208-1978?
1044    These specifications are encoded in `coding->flags' as flag bits
1045    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1046    details.
1047 */
1048
1049 /* Produce codes (escape sequence) for designating CHARSET to graphic
1050    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1051    the coding system CODING allows, produce designation sequence of
1052    short-form.  */
1053
1054 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1055   do {                                                                  \
1056     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1057     char *intermediate_char_94 = "()*+";                                \
1058     char *intermediate_char_96 = ",-./";                                \
1059     Lisp_Object temp                                                    \
1060       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1061     if (! NILP (temp))                                                  \
1062         {                                                               \
1063         *dst++ = ISO_CODE_ESC;                                          \
1064         *dst++ = '&';                                                   \
1065         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1066       }                                                                 \
1067     *dst++ = ISO_CODE_ESC;                                              \
1068     if (CHARSET_DIMENSION (charset) == 1)                               \
1069       {                                                                 \
1070         if (CHARSET_CHARS (charset) == 94)                              \
1071           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1072         else                                                            \
1073           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1074       }                                                                 \
1075     else                                                                \
1076       {                                                                 \
1077         *dst++ = '$';                                                   \
1078         if (CHARSET_CHARS (charset) == 94)                              \
1079           {                                                             \
1080             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1081                 || reg != 0                                             \
1082                 || final_char < '@' || final_char > 'B')                \
1083               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1084           }                                                             \
1085         else                                                            \
1086           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1087       }                                                                 \
1088     *dst++ = final_char;                                                \
1089     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1090   } while (0)
1091
1092 /* The following two macros produce codes (control character or escape
1093    sequence) for ISO2022 single-shift functions (single-shift-2 and
1094    single-shift-3).  */
1095
1096 #define ENCODE_SINGLE_SHIFT_2                           \
1097   do {                                                  \
1098     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1099       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1100     else                                                \
1101       *dst++ = ISO_CODE_SS2;                            \
1102     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1103   } while (0)
1104
1105 #define ENCODE_SINGLE_SHIFT_3                           \
1106   do {                                                  \
1107     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1108       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1109     else                                                \
1110       *dst++ = ISO_CODE_SS3;                            \
1111     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1112   } while (0)
1113
1114 /* The following four macros produce codes (control character or
1115    escape sequence) for ISO2022 locking-shift functions (shift-in,
1116    shift-out, locking-shift-2, and locking-shift-3).  */
1117
1118 #define ENCODE_SHIFT_IN                         \
1119   do {                                          \
1120     *dst++ = ISO_CODE_SI;                       \
1121     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1122   } while (0)
1123
1124 #define ENCODE_SHIFT_OUT                        \
1125   do {                                          \
1126     *dst++ = ISO_CODE_SO;                       \
1127     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1128   } while (0)
1129
1130 #define ENCODE_LOCKING_SHIFT_2                  \
1131   do {                                          \
1132     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1133     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1134   } while (0)
1135
1136 #define ENCODE_LOCKING_SHIFT_3                  \
1137   do {                                          \
1138     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1139     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1140   } while (0)
1141
1142 /* Produce codes for a DIMENSION1 character whose character set is
1143    CHARSET and whose position-code is C1.  Designation and invocation
1144    sequences are also produced in advance if necessary.  */
1145
1146
1147 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1148   do {                                                                  \
1149     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1150       {                                                                 \
1151         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1152           *dst++ = c1 & 0x7F;                                           \
1153         else                                                            \
1154           *dst++ = c1 | 0x80;                                           \
1155         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1156         break;                                                          \
1157       }                                                                 \
1158     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1159       {                                                                 \
1160         *dst++ = c1 & 0x7F;                                             \
1161         break;                                                          \
1162       }                                                                 \
1163     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1164       {                                                                 \
1165         *dst++ = c1 | 0x80;                                             \
1166         break;                                                          \
1167       }                                                                 \
1168     else                                                                \
1169       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1170          must invoke it, or, at first, designate it to some graphic     \
1171          register.  Then repeat the loop to actually produce the        \
1172          character.  */                                                 \
1173       dst = encode_invocation_designation (charset, coding, dst);       \
1174   } while (1)
1175
1176 /* Produce codes for a DIMENSION2 character whose character set is
1177    CHARSET and whose position-codes are C1 and C2.  Designation and
1178    invocation codes are also produced in advance if necessary.  */
1179
1180 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1181   do {                                                                  \
1182     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1183       {                                                                 \
1184         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1185           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1186         else                                                            \
1187           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1188         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1189         break;                                                          \
1190       }                                                                 \
1191     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1192       {                                                                 \
1193         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1194         break;                                                          \
1195       }                                                                 \
1196     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1197       {                                                                 \
1198         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1199         break;                                                          \
1200       }                                                                 \
1201     else                                                                \
1202       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1203          must invoke it, or, at first, designate it to some graphic     \
1204          register.  Then repeat the loop to actually produce the        \
1205          character.  */                                                 \
1206       dst = encode_invocation_designation (charset, coding, dst);       \
1207   } while (1)
1208
1209 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1210   do {                                                                    \
1211     int c_alt, charset_alt;                                               \
1212     if (!NILP (unification_table)                                         \
1213         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1214             >= 0))                                                        \
1215       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1216     else                                                                  \
1217       charset_alt = charset;                                              \
1218     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1219       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1220     else                                                                  \
1221       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1222   } while (0)
1223
1224 /* Produce designation and invocation codes at a place pointed by DST
1225    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1226    Return new DST.  */
1227
1228 unsigned char *
1229 encode_invocation_designation (charset, coding, dst)
1230      int charset;
1231      struct coding_system *coding;
1232      unsigned char *dst;
1233 {
1234   int reg;                      /* graphic register number */
1235
1236   /* At first, check designations.  */
1237   for (reg = 0; reg < 4; reg++)
1238     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1239       break;
1240
1241   if (reg >= 4)
1242     {
1243       /* CHARSET is not yet designated to any graphic registers.  */
1244       /* At first check the requested designation.  */
1245       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1246       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1247         /* Since CHARSET requests no special designation, designate it
1248            to graphic register 0.  */
1249         reg = 0;
1250
1251       ENCODE_DESIGNATION (charset, reg, coding);
1252     }
1253
1254   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1255       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1256     {
1257       /* Since the graphic register REG is not invoked to any graphic
1258          planes, invoke it to graphic plane 0.  */
1259       switch (reg)
1260         {
1261         case 0:                 /* graphic register 0 */
1262           ENCODE_SHIFT_IN;
1263           break;
1264
1265         case 1:                 /* graphic register 1 */
1266           ENCODE_SHIFT_OUT;
1267           break;
1268
1269         case 2:                 /* graphic register 2 */
1270           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1271             ENCODE_SINGLE_SHIFT_2;
1272           else
1273             ENCODE_LOCKING_SHIFT_2;
1274           break;
1275
1276         case 3:                 /* graphic register 3 */
1277           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1278             ENCODE_SINGLE_SHIFT_3;
1279           else
1280             ENCODE_LOCKING_SHIFT_3;
1281           break;
1282         }
1283     }
1284   return dst;
1285 }
1286
1287 /* The following two macros produce codes for indicating composition.  */
1288 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1289 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1290 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1291
1292 /* The following three macros produce codes for indicating direction
1293    of text.  */
1294 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1295   do {                                                  \
1296     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1297       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1298     else                                                \
1299       *dst++ = ISO_CODE_CSI;                            \
1300   } while (0)
1301
1302 #define ENCODE_DIRECTION_R2L    \
1303   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1304
1305 #define ENCODE_DIRECTION_L2R    \
1306   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1307
1308 /* Produce codes for designation and invocation to reset the graphic
1309    planes and registers to initial state.  */
1310 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1311   do {                                                                      \
1312     int reg;                                                                \
1313     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1314       ENCODE_SHIFT_IN;                                                      \
1315     for (reg = 0; reg < 4; reg++)                                           \
1316       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1317           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1318               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1319         ENCODE_DESIGNATION                                                  \
1320           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1321   } while (0)
1322
1323 /* Produce designation sequences of charsets in the line started from
1324    *SRC to a place pointed by DSTP.
1325
1326    If the current block ends before any end-of-line, we may fail to
1327    find all the necessary *designations.  */
1328 encode_designation_at_bol (coding, table, src, src_end, dstp)
1329      struct coding_system *coding;
1330      Lisp_Object table;
1331      unsigned char *src, *src_end, **dstp;
1332 {
1333   int charset, c, found = 0, reg;
1334   /* Table of charsets to be designated to each graphic register.  */
1335   int r[4];
1336   unsigned char *dst = *dstp;
1337
1338   for (reg = 0; reg < 4; reg++)
1339     r[reg] = -1;
1340
1341   while (src < src_end && *src != '\n' && found < 4)
1342     {
1343       int bytes = BYTES_BY_CHAR_HEAD (*src);
1344
1345       if (NILP (table))
1346         charset = CHARSET_AT (src);
1347       else
1348         {
1349           int c_alt, c1, c2;
1350
1351           SPLIT_STRING(src, bytes, charset, c1, c2);
1352           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1353             charset = CHAR_CHARSET (c_alt);
1354         }
1355
1356       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1357       if (r[reg] == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1358         {
1359           found++;
1360           r[reg] = charset;
1361         }
1362
1363       src += bytes;
1364     }
1365
1366   if (found)
1367     {
1368       for (reg = 0; reg < 4; reg++)
1369         if (r[reg] >= 0
1370             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1371           ENCODE_DESIGNATION (r[reg], reg, coding);
1372       *dstp = dst;
1373     }
1374 }
1375
1376 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1377
1378 int
1379 encode_coding_iso2022 (coding, source, destination,
1380                        src_bytes, dst_bytes, consumed)
1381      struct coding_system *coding;
1382      unsigned char *source, *destination;
1383      int src_bytes, dst_bytes;
1384      int *consumed;
1385 {
1386   unsigned char *src = source;
1387   unsigned char *src_end = source + src_bytes;
1388   unsigned char *dst = destination;
1389   unsigned char *dst_end = destination + dst_bytes;
1390   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1391      from DST_END to assure overflow checking is necessary only at the
1392      head of loop.  */
1393   unsigned char *adjusted_dst_end = dst_end - 19;
1394   Lisp_Object unification_table
1395       = coding->character_unification_table_for_encode;
1396
1397   if (!NILP (Venable_character_unification) && NILP (unification_table))
1398     unification_table = Vstandard_character_unification_table_for_encode;
1399
1400   while (src < src_end && dst < adjusted_dst_end)
1401     {
1402       /* SRC_BASE remembers the start position in source in each loop.
1403          The loop will be exited when there's not enough source text
1404          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1405          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1406          reset to SRC_BASE before exiting.  */
1407       unsigned char *src_base = src;
1408       int charset, c1, c2, c3, c4;
1409
1410       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1411           && CODING_SPEC_ISO_BOL (coding))
1412         {
1413           /* We have to produce designation sequences if any now.  */
1414           encode_designation_at_bol (coding, unification_table,
1415                                      src, src_end, &dst);
1416           CODING_SPEC_ISO_BOL (coding) = 0;
1417         }
1418
1419       c1 = *src++;
1420       /* If we are seeing a component of a composite character, we are
1421          seeing a leading-code specially encoded for composition, or a
1422          composition rule if composing with rule.  We must set C1
1423          to a normal leading-code or an ASCII code.  If we are not at
1424          a composed character, we must reset the composition state.  */
1425       if (COMPOSING_P (coding->composing))
1426         {
1427           if (c1 < 0xA0)
1428             {
1429               /* We are not in a composite character any longer.  */
1430               coding->composing = COMPOSING_NO;
1431               ENCODE_COMPOSITION_END;
1432             }
1433           else
1434             {
1435               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1436                 {
1437                   *dst++ = c1 & 0x7F;
1438                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1439                   continue;
1440                 }
1441               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1442                 coding->composing = COMPOSING_WITH_RULE_RULE;
1443               if (c1 == 0xA0)
1444                 {
1445                   /* This is an ASCII component.  */
1446                   ONE_MORE_BYTE (c1);
1447                   c1 &= 0x7F;
1448                 }
1449               else
1450                 /* This is a leading-code of non ASCII component.  */
1451                 c1 -= 0x20;
1452             }
1453         }
1454
1455       /* Now encode one character.  C1 is a control character, an
1456          ASCII character, or a leading-code of multi-byte character.  */
1457       switch (emacs_code_class[c1])
1458         {
1459         case EMACS_ascii_code:
1460           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1461           break;
1462
1463         case EMACS_control_code:
1464           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1465             ENCODE_RESET_PLANE_AND_REGISTER;
1466           *dst++ = c1;
1467           break;
1468
1469         case EMACS_carriage_return_code:
1470           if (!coding->selective)
1471             {
1472               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1473                 ENCODE_RESET_PLANE_AND_REGISTER;
1474               *dst++ = c1;
1475               break;
1476             }
1477           /* fall down to treat '\r' as '\n' ...  */
1478
1479         case EMACS_linefeed_code:
1480           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1481             ENCODE_RESET_PLANE_AND_REGISTER;
1482           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1483             bcopy (coding->spec.iso2022.initial_designation,
1484                    coding->spec.iso2022.current_designation,
1485                    sizeof coding->spec.iso2022.initial_designation);
1486           if (coding->eol_type == CODING_EOL_LF
1487               || coding->eol_type == CODING_EOL_UNDECIDED)
1488             *dst++ = ISO_CODE_LF;
1489           else if (coding->eol_type == CODING_EOL_CRLF)
1490             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1491           else
1492             *dst++ = ISO_CODE_CR;
1493           CODING_SPEC_ISO_BOL (coding) = 1;
1494           break;
1495
1496         case EMACS_leading_code_2:
1497           ONE_MORE_BYTE (c2);
1498           if (c2 < 0xA0)
1499             {
1500               /* invalid sequence */
1501               *dst++ = c1;
1502               *dst++ = c2;
1503             }
1504           else
1505             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1506           break;
1507
1508         case EMACS_leading_code_3:
1509           TWO_MORE_BYTES (c2, c3);
1510           if (c2 < 0xA0 || c3 < 0xA0)
1511             {
1512               /* invalid sequence */
1513               *dst++ = c1;
1514               *dst++ = c2;
1515               *dst++ = c3;
1516             }
1517           else if (c1 < LEADING_CODE_PRIVATE_11)
1518             ENCODE_ISO_CHARACTER (c1, c2, c3);
1519           else
1520             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1521           break;
1522
1523         case EMACS_leading_code_4:
1524           THREE_MORE_BYTES (c2, c3, c4);
1525           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1526             {
1527               /* invalid sequence */
1528               *dst++ = c1;
1529               *dst++ = c2;
1530               *dst++ = c3;
1531               *dst++ = c4;
1532             }
1533           else
1534             ENCODE_ISO_CHARACTER (c2, c3, c4);
1535           break;
1536
1537         case EMACS_leading_code_composition:
1538           ONE_MORE_BYTE (c2);
1539           if (c2 < 0xA0)
1540             {
1541               /* invalid sequence */
1542               *dst++ = c1;
1543               *dst++ = c2;
1544             }
1545           else if (c2 == 0xFF)
1546             {
1547               coding->composing = COMPOSING_WITH_RULE_HEAD;
1548               ENCODE_COMPOSITION_WITH_RULE_START;
1549             }
1550           else
1551             {
1552               /* Rewind one byte because it is a character code of
1553                  composition elements.  */
1554               src--;
1555               coding->composing = COMPOSING_NO_RULE_HEAD;
1556               ENCODE_COMPOSITION_NO_RULE_START;
1557             }
1558           break;
1559
1560         case EMACS_invalid_code:
1561           *dst++ = c1;
1562           break;
1563         }
1564       continue;
1565     label_end_of_loop:
1566       /* We reach here because the source date ends not at character
1567          boundary.  */
1568       coding->carryover_size = src_end - src_base;
1569       bcopy (src_base, coding->carryover, coding->carryover_size);
1570       src = src_end;
1571       break;
1572     }
1573
1574   /* If this is the last block of the text to be encoded, we must
1575      reset graphic planes and registers to the initial state.  */
1576   if (src >= src_end && coding->last_block)
1577     {
1578       ENCODE_RESET_PLANE_AND_REGISTER;
1579       if (coding->carryover_size > 0
1580           && coding->carryover_size < (dst_end - dst))
1581         {
1582           bcopy (coding->carryover, dst, coding->carryover_size);
1583           dst += coding->carryover_size;
1584           coding->carryover_size = 0;
1585         }
1586     }
1587   *consumed = src - source;
1588   return dst - destination;
1589 }
1590
1591 \f
1592 /*** 4. SJIS and BIG5 handlers ***/
1593
1594 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1595    quite widely.  So, for the moment, Emacs supports them in the bare
1596    C code.  But, in the future, they may be supported only by CCL.  */
1597
1598 /* SJIS is a coding system encoding three character sets: ASCII, right
1599    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1600    as is.  A character of charset katakana-jisx0201 is encoded by
1601    "position-code + 0x80".  A character of charset japanese-jisx0208
1602    is encoded in 2-byte but two position-codes are divided and shifted
1603    so that it fit in the range below.
1604
1605    --- CODE RANGE of SJIS ---
1606    (character set)      (range)
1607    ASCII                0x00 .. 0x7F
1608    KATAKANA-JISX0201    0xA0 .. 0xDF
1609    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1610             (2nd byte)  0x40 .. 0xFF
1611    -------------------------------
1612
1613 */
1614
1615 /* BIG5 is a coding system encoding two character sets: ASCII and
1616    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1617    character set and is encoded in two-byte.
1618
1619    --- CODE RANGE of BIG5 ---
1620    (character set)      (range)
1621    ASCII                0x00 .. 0x7F
1622    Big5 (1st byte)      0xA1 .. 0xFE
1623         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1624    --------------------------
1625
1626    Since the number of characters in Big5 is larger than maximum
1627    characters in Emacs' charset (96x96), it can't be handled as one
1628    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1629    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1630    contains frequently used characters and the latter contains less
1631    frequently used characters.  */
1632
1633 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1634    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1635    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1636    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1637
1638 /* Number of Big5 characters which have the same code in 1st byte.  */
1639 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1640
1641 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1642   do {                                                                  \
1643     unsigned int temp                                                   \
1644       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1645     if (b1 < 0xC9)                                                      \
1646       charset = charset_big5_1;                                         \
1647     else                                                                \
1648       {                                                                 \
1649         charset = charset_big5_2;                                       \
1650         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1651       }                                                                 \
1652     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1653     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1654   } while (0)
1655
1656 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1657   do {                                                                  \
1658     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1659     if (charset == charset_big5_2)                                      \
1660       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1661     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1662     b2 = temp % BIG5_SAME_ROW;                                          \
1663     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1664   } while (0)
1665
1666 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1667   do {                                                                  \
1668     int c_alt, charset_alt = (charset);                                 \
1669     if (!NILP (unification_table)                                       \
1670         && ((c_alt = unify_char (unification_table,                     \
1671                                  -1, (charset), c1, c2)) >= 0))         \
1672           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
1673     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
1674       DECODE_CHARACTER_ASCII (c1);                                      \
1675     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
1676       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1677     else                                                                \
1678       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1679   } while (0)
1680
1681 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1682   do {                                                                    \
1683     int c_alt, charset_alt;                                               \
1684     if (!NILP (unification_table)                                         \
1685         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1686             >= 0))                                                        \
1687       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1688     else                                                                  \
1689       charset_alt = charset;                                              \
1690     if (charset_alt == charset_ascii)                                     \
1691       *dst++ = c1;                                                        \
1692     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1693       {                                                                   \
1694         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1695           *dst++ = c1;                                                    \
1696         else                                                              \
1697           *dst++ = charset_alt, *dst++ = c1;                              \
1698       }                                                                   \
1699     else                                                                  \
1700       {                                                                   \
1701         c1 &= 0x7F, c2 &= 0x7F;                                           \
1702         if (sjis_p && charset_alt == charset_jisx0208)                    \
1703           {                                                               \
1704             unsigned char s1, s2;                                         \
1705                                                                           \
1706             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1707             *dst++ = s1, *dst++ = s2;                                     \
1708           }                                                               \
1709         else if (!sjis_p                                                  \
1710                  && (charset_alt == charset_big5_1                        \
1711                      || charset_alt == charset_big5_2))                   \
1712           {                                                               \
1713             unsigned char b1, b2;                                         \
1714                                                                           \
1715             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1716             *dst++ = b1, *dst++ = b2;                                     \
1717           }                                                               \
1718         else                                                              \
1719           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1720       }                                                                   \
1721   } while (0);
1722
1723 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1724    Check if a text is encoded in SJIS.  If it is, return
1725    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1726
1727 int
1728 detect_coding_sjis (src, src_end)
1729      unsigned char *src, *src_end;
1730 {
1731   unsigned char c;
1732
1733   while (src < src_end)
1734     {
1735       c = *src++;
1736       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1737         return 0;
1738       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1739         {
1740           if (src < src_end && *src++ < 0x40)
1741             return 0;
1742         }
1743     }
1744   return CODING_CATEGORY_MASK_SJIS;
1745 }
1746
1747 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1748    Check if a text is encoded in BIG5.  If it is, return
1749    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1750
1751 int
1752 detect_coding_big5 (src, src_end)
1753      unsigned char *src, *src_end;
1754 {
1755   unsigned char c;
1756
1757   while (src < src_end)
1758     {
1759       c = *src++;
1760       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1761         return 0;
1762       if (c >= 0xA1)
1763         {
1764           if (src >= src_end)
1765             break;
1766           c = *src++;
1767           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1768             return 0;
1769         }
1770     }
1771   return CODING_CATEGORY_MASK_BIG5;
1772 }
1773
1774 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1775    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1776
1777 int
1778 decode_coding_sjis_big5 (coding, source, destination,
1779                          src_bytes, dst_bytes, consumed, sjis_p)
1780      struct coding_system *coding;
1781      unsigned char *source, *destination;
1782      int src_bytes, dst_bytes;
1783      int *consumed;
1784      int sjis_p;
1785 {
1786   unsigned char *src = source;
1787   unsigned char *src_end = source + src_bytes;
1788   unsigned char *dst = destination;
1789   unsigned char *dst_end = destination + dst_bytes;
1790   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1791      from DST_END to assure overflow checking is necessary only at the
1792      head of loop.  */
1793   unsigned char *adjusted_dst_end = dst_end - 3;
1794   Lisp_Object unification_table
1795       = coding->character_unification_table_for_decode;
1796
1797   if (!NILP (Venable_character_unification) && NILP (unification_table))
1798     unification_table = Vstandard_character_unification_table_for_decode;
1799
1800   while (src < src_end && dst < adjusted_dst_end)
1801     {
1802       /* SRC_BASE remembers the start position in source in each loop.
1803          The loop will be exited when there's not enough source text
1804          to analyze two-byte character (within macro ONE_MORE_BYTE).
1805          In that case, SRC is reset to SRC_BASE before exiting.  */
1806       unsigned char *src_base = src;
1807       unsigned char c1 = *src++, c2, c3, c4;
1808
1809       if (c1 == '\r')
1810         {
1811           if (coding->eol_type == CODING_EOL_CRLF)
1812             {
1813               ONE_MORE_BYTE (c2);
1814               if (c2 == '\n')
1815                 *dst++ = c2;
1816               else
1817                 /* To process C2 again, SRC is subtracted by 1.  */
1818                 *dst++ = c1, src--;
1819             }
1820           else
1821             *dst++ = c1;
1822         }
1823       else if (c1 < 0x20)
1824         *dst++ = c1;
1825       else if (c1 < 0x80)
1826         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1827       else if (c1 < 0xA0 || c1 >= 0xE0)
1828         {
1829           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1830           if (sjis_p)
1831             {
1832               ONE_MORE_BYTE (c2);
1833               DECODE_SJIS (c1, c2, c3, c4);
1834               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1835             }
1836           else if (c1 >= 0xE0 && c1 < 0xFF)
1837             {
1838               int charset;
1839
1840               ONE_MORE_BYTE (c2);
1841               DECODE_BIG5 (c1, c2, charset, c3, c4);
1842               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1843             }
1844           else                  /* Invalid code */
1845             *dst++ = c1;
1846         }
1847       else
1848         {
1849           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1850           if (sjis_p)
1851             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1852           else
1853             {
1854               int charset;
1855
1856               ONE_MORE_BYTE (c2);
1857               DECODE_BIG5 (c1, c2, charset, c3, c4);
1858               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1859             }
1860         }
1861       continue;
1862
1863     label_end_of_loop:
1864       coding->carryover_size = src - src_base;
1865       bcopy (src_base, coding->carryover, coding->carryover_size);
1866       src = src_base;
1867       break;
1868     }
1869
1870   *consumed = src - source;
1871   return dst - destination;
1872 }
1873
1874 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1875    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1876    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1877    sure that all these charsets are registered as official charset
1878    (i.e. do not have extended leading-codes).  Characters of other
1879    charsets are produced without any encoding.  If SJIS_P is 1, encode
1880    SJIS text, else encode BIG5 text.  */
1881
1882 int
1883 encode_coding_sjis_big5 (coding, source, destination,
1884                          src_bytes, dst_bytes, consumed, sjis_p)
1885      struct coding_system *coding;
1886      unsigned char *source, *destination;
1887      int src_bytes, dst_bytes;
1888      int *consumed;
1889      int sjis_p;
1890 {
1891   unsigned char *src = source;
1892   unsigned char *src_end = source + src_bytes;
1893   unsigned char *dst = destination;
1894   unsigned char *dst_end = destination + dst_bytes;
1895   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1896      from DST_END to assure overflow checking is necessary only at the
1897      head of loop.  */
1898   unsigned char *adjusted_dst_end = dst_end - 1;
1899   Lisp_Object unification_table
1900       = coding->character_unification_table_for_encode;
1901
1902   if (!NILP (Venable_character_unification) && NILP (unification_table))
1903     unification_table = Vstandard_character_unification_table_for_encode;
1904
1905   while (src < src_end && dst < adjusted_dst_end)
1906     {
1907       /* SRC_BASE remembers the start position in source in each loop.
1908          The loop will be exited when there's not enough source text
1909          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1910          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1911          before exiting.  */
1912       unsigned char *src_base = src;
1913       unsigned char c1 = *src++, c2, c3, c4;
1914
1915       if (coding->composing)
1916         {
1917           if (c1 == 0xA0)
1918             {
1919               ONE_MORE_BYTE (c1);
1920               c1 &= 0x7F;
1921             }
1922           else if (c1 >= 0xA0)
1923             c1 -= 0x20;
1924           else
1925             coding->composing = 0;
1926         }
1927
1928       switch (emacs_code_class[c1])
1929         {
1930         case EMACS_ascii_code:
1931           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1932           break;
1933
1934         case EMACS_control_code:
1935           *dst++ = c1;
1936           break;
1937
1938         case EMACS_carriage_return_code:
1939           if (!coding->selective)
1940             {
1941               *dst++ = c1;
1942               break;
1943             }
1944           /* fall down to treat '\r' as '\n' ...  */
1945
1946         case EMACS_linefeed_code:
1947           if (coding->eol_type == CODING_EOL_LF
1948               || coding->eol_type == CODING_EOL_UNDECIDED)
1949             *dst++ = '\n';
1950           else if (coding->eol_type == CODING_EOL_CRLF)
1951             *dst++ = '\r', *dst++ = '\n';
1952           else
1953             *dst++ = '\r';
1954           break;
1955
1956         case EMACS_leading_code_2:
1957           ONE_MORE_BYTE (c2);
1958           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1959           break;
1960
1961         case EMACS_leading_code_3:
1962           TWO_MORE_BYTES (c2, c3);
1963           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1964           break;
1965
1966         case EMACS_leading_code_4:
1967           THREE_MORE_BYTES (c2, c3, c4);
1968           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1969           break;
1970
1971         case EMACS_leading_code_composition:
1972           coding->composing = 1;
1973           break;
1974
1975         default:                /* i.e. case EMACS_invalid_code: */
1976           *dst++ = c1;
1977         }
1978       continue;
1979
1980     label_end_of_loop:
1981       coding->carryover_size = src_end - src_base;
1982       bcopy (src_base, coding->carryover, coding->carryover_size);
1983       src = src_end;
1984       break;
1985     }
1986
1987   *consumed = src - source;
1988   return dst - destination;
1989 }
1990
1991 \f
1992 /*** 5. End-of-line handlers ***/
1993
1994 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1995    This function is called only when `coding->eol_type' is
1996    CODING_EOL_CRLF or CODING_EOL_CR.  */
1997
1998 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1999      struct coding_system *coding;
2000      unsigned char *source, *destination;
2001      int src_bytes, dst_bytes;
2002      int *consumed;
2003 {
2004   unsigned char *src = source;
2005   unsigned char *src_end = source + src_bytes;
2006   unsigned char *dst = destination;
2007   unsigned char *dst_end = destination + dst_bytes;
2008   int produced;
2009
2010   switch (coding->eol_type)
2011     {
2012     case CODING_EOL_CRLF:
2013       {
2014         /* Since the maximum bytes produced by each loop is 2, we
2015            subtract 1 from DST_END to assure overflow checking is
2016            necessary only at the head of loop.  */
2017         unsigned char *adjusted_dst_end = dst_end - 1;
2018
2019         while (src < src_end && dst < adjusted_dst_end)
2020           {
2021             unsigned char *src_base = src;
2022             unsigned char c = *src++;
2023             if (c == '\r')
2024               {
2025                 ONE_MORE_BYTE (c);
2026                 if (c != '\n')
2027                   *dst++ = '\r';
2028                 *dst++ = c;
2029               }
2030             else
2031               *dst++ = c;
2032             continue;
2033
2034           label_end_of_loop:
2035             coding->carryover_size = src - src_base;
2036             bcopy (src_base, coding->carryover, coding->carryover_size);
2037             src = src_base;
2038             break;
2039           }
2040         *consumed = src - source;
2041         produced = dst - destination;
2042         break;
2043       }
2044
2045     case CODING_EOL_CR:
2046       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2047       bcopy (source, destination, produced);
2048       dst_end = destination + produced;
2049       while (dst < dst_end)
2050         if (*dst++ == '\r') dst[-1] = '\n';
2051       *consumed = produced;
2052       break;
2053
2054     default:                    /* i.e. case: CODING_EOL_LF */
2055       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2056       bcopy (source, destination, produced);
2057       *consumed = produced;
2058       break;
2059     }
2060
2061   return produced;
2062 }
2063
2064 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2065    format of end-of-line according to `coding->eol_type'.  If
2066    `coding->selective' is 1, code '\r' in source text also means
2067    end-of-line.  */
2068
2069 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2070      struct coding_system *coding;
2071      unsigned char *source, *destination;
2072      int src_bytes, dst_bytes;
2073      int *consumed;
2074 {
2075   unsigned char *src = source;
2076   unsigned char *dst = destination;
2077   int produced;
2078
2079   if (src_bytes <= 0)
2080     return 0;
2081
2082   switch (coding->eol_type)
2083     {
2084     case CODING_EOL_LF:
2085     case CODING_EOL_UNDECIDED:
2086       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2087       bcopy (source, destination, produced);
2088       if (coding->selective)
2089         {
2090           int i = produced;
2091           while (i--)
2092             if (*dst++ == '\r') dst[-1] = '\n';
2093         }
2094       *consumed = produced;
2095
2096     case CODING_EOL_CRLF:
2097       {
2098         unsigned char c;
2099         unsigned char *src_end = source + src_bytes;
2100         unsigned char *dst_end = destination + dst_bytes;
2101         /* Since the maximum bytes produced by each loop is 2, we
2102            subtract 1 from DST_END to assure overflow checking is
2103            necessary only at the head of loop.  */
2104         unsigned char *adjusted_dst_end = dst_end - 1;
2105
2106         while (src < src_end && dst < adjusted_dst_end)
2107           {
2108             c = *src++;
2109             if (c == '\n' || (c == '\r' && coding->selective))
2110               *dst++ = '\r', *dst++ = '\n';
2111             else
2112               *dst++ = c;
2113           }
2114         produced = dst - destination;
2115         *consumed = src - source;
2116         break;
2117       }
2118
2119     default:                    /* i.e. case CODING_EOL_CR: */
2120       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2121       bcopy (source, destination, produced);
2122       {
2123         int i = produced;
2124         while (i--)
2125           if (*dst++ == '\n') dst[-1] = '\r';
2126       }
2127       *consumed = produced;
2128     }
2129
2130   return produced;
2131 }
2132
2133 \f
2134 /*** 6. C library functions ***/
2135
2136 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2137    has a property `coding-system'.  The value of this property is a
2138    vector of length 5 (called as coding-vector).  Among elements of
2139    this vector, the first (element[0]) and the fifth (element[4])
2140    carry important information for decoding/encoding.  Before
2141    decoding/encoding, this information should be set in fields of a
2142    structure of type `coding_system'.
2143
2144    A value of property `coding-system' can be a symbol of another
2145    subsidiary coding-system.  In that case, Emacs gets coding-vector
2146    from that symbol.
2147
2148    `element[0]' contains information to be set in `coding->type'.  The
2149    value and its meaning is as follows:
2150
2151    0 -- coding_type_emacs_mule
2152    1 -- coding_type_sjis
2153    2 -- coding_type_iso2022
2154    3 -- coding_type_big5
2155    4 -- coding_type_ccl encoder/decoder written in CCL
2156    nil -- coding_type_no_conversion
2157    t -- coding_type_undecided (automatic conversion on decoding,
2158                                no-conversion on encoding)
2159
2160    `element[4]' contains information to be set in `coding->flags' and
2161    `coding->spec'.  The meaning varies by `coding->type'.
2162
2163    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2164    of length 32 (of which the first 13 sub-elements are used now).
2165    Meanings of these sub-elements are:
2166
2167    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2168         If the value is an integer of valid charset, the charset is
2169         assumed to be designated to graphic register N initially.
2170
2171         If the value is minus, it is a minus value of charset which
2172         reserves graphic register N, which means that the charset is
2173         not designated initially but should be designated to graphic
2174         register N just before encoding a character in that charset.
2175
2176         If the value is nil, graphic register N is never used on
2177         encoding.
2178
2179    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2180         Each value takes t or nil.  See the section ISO2022 of
2181         `coding.h' for more information.
2182
2183    If `coding->type' is `coding_type_big5', element[4] is t to denote
2184    BIG5-ETen or nil to denote BIG5-HKU.
2185
2186    If `coding->type' takes the other value, element[4] is ignored.
2187
2188    Emacs Lisp's coding system also carries information about format of
2189    end-of-line in a value of property `eol-type'.  If the value is
2190    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2191    means CODING_EOL_CR.  If it is not integer, it should be a vector
2192    of subsidiary coding systems of which property `eol-type' has one
2193    of above values.
2194
2195 */
2196
2197 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2198    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2199    is setup so that no conversion is necessary and return -1, else
2200    return 0.  */
2201
2202 int
2203 setup_coding_system (coding_system, coding)
2204      Lisp_Object coding_system;
2205      struct coding_system *coding;
2206 {
2207   Lisp_Object type, eol_type;
2208
2209   /* At first, set several fields to default values.  */
2210   coding->require_flushing = 0;
2211   coding->last_block = 0;
2212   coding->selective = 0;
2213   coding->composing = 0;
2214   coding->direction = 0;
2215   coding->carryover_size = 0;
2216   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2217   coding->character_unification_table_for_decode = Qnil;
2218   coding->character_unification_table_for_encode = Qnil;
2219
2220   Vlast_coding_system_used = coding->symbol = coding_system;
2221   eol_type = Qnil;
2222   /* Get value of property `coding-system' until we get a vector.
2223      While doing that, also get values of properties
2224      `post-read-conversion', `pre-write-conversion',
2225      `character-unification-table-for-decode',
2226      `character-unification-table-for-encode' and `eol-type'.  */
2227   while (!NILP (coding_system) && SYMBOLP (coding_system))
2228     {
2229       if (NILP (coding->post_read_conversion))
2230         coding->post_read_conversion = Fget (coding_system,
2231                                              Qpost_read_conversion);
2232       if (NILP (coding->pre_write_conversion))
2233         coding->pre_write_conversion = Fget (coding_system,
2234                                              Qpre_write_conversion);
2235       if (!inhibit_eol_conversion && NILP (eol_type))
2236         eol_type = Fget (coding_system, Qeol_type);
2237
2238       if (NILP (coding->character_unification_table_for_decode))
2239         coding->character_unification_table_for_decode
2240           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2241
2242       if (NILP (coding->character_unification_table_for_encode))
2243         coding->character_unification_table_for_encode
2244           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2245
2246       coding_system = Fget (coding_system, Qcoding_system);
2247     }
2248
2249   while (!NILP (coding->character_unification_table_for_decode)
2250          && SYMBOLP (coding->character_unification_table_for_decode))
2251         coding->character_unification_table_for_decode
2252           = Fget (coding->character_unification_table_for_decode,
2253                   Qcharacter_unification_table_for_decode);
2254   if (!NILP (coding->character_unification_table_for_decode)
2255       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2256       coding->character_unification_table_for_decode = Qnil;
2257
2258   while (!NILP (coding->character_unification_table_for_encode)
2259          && SYMBOLP (coding->character_unification_table_for_encode))
2260         coding->character_unification_table_for_encode
2261           = Fget (coding->character_unification_table_for_encode,
2262                   Qcharacter_unification_table_for_encode);
2263   if (!NILP (coding->character_unification_table_for_encode)
2264       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2265       coding->character_unification_table_for_encode = Qnil;
2266
2267   if (!VECTORP (coding_system)
2268       || XVECTOR (coding_system)->size != 5)
2269     goto label_invalid_coding_system;
2270
2271   if (VECTORP (eol_type))
2272     coding->eol_type = CODING_EOL_UNDECIDED;
2273   else if (XFASTINT (eol_type) == 1)
2274     coding->eol_type = CODING_EOL_CRLF;
2275   else if (XFASTINT (eol_type) == 2)
2276     coding->eol_type = CODING_EOL_CR;
2277   else
2278     coding->eol_type = CODING_EOL_LF;
2279
2280   type = XVECTOR (coding_system)->contents[0];
2281   switch (XFASTINT (type))
2282     {
2283     case 0:
2284       coding->type = coding_type_emacs_mule;
2285       break;
2286
2287     case 1:
2288       coding->type = coding_type_sjis;
2289       break;
2290
2291     case 2:
2292       coding->type = coding_type_iso2022;
2293       {
2294         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2295         Lisp_Object *flags;
2296         int i, charset, default_reg_bits = 0;
2297
2298         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2299           goto label_invalid_coding_system;
2300
2301         flags = XVECTOR (val)->contents;
2302         coding->flags
2303           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2304              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2305              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2306              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2307              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2308              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2309              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2310              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2311              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2312              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2313              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2314
2315         /* Invoke graphic register 0 to plane 0.  */
2316         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2317         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2318         CODING_SPEC_ISO_INVOCATION (coding, 1)
2319           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2320         /* Not single shifting at first.  */
2321         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2322         /* Beginning of buffer should also be regarded as bol. */
2323         CODING_SPEC_ISO_BOL(coding) = 1;
2324
2325         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2326            FLAGS[REG] can be one of below:
2327                 integer CHARSET: CHARSET occupies register I,
2328                 t: designate nothing to REG initially, but can be used
2329                   by any charsets,
2330                 list of integer, nil, or t: designate the first
2331                   element (if integer) to REG initially, the remaining
2332                   elements (if integer) is designated to REG on request,
2333                   if an element is t, REG can be used by any charset,
2334                 nil: REG is never used.  */
2335         for (charset = 0; charset <= MAX_CHARSET; charset++)
2336           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2337             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2338         for (i = 0; i < 4; i++)
2339           {
2340             if (INTEGERP (flags[i])
2341                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2342                 || (charset = get_charset_id (flags[i])) >= 0)
2343               {
2344                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2345                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2346               }
2347             else if (EQ (flags[i], Qt))
2348               {
2349                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2350                 default_reg_bits |= 1 << i;
2351               }
2352             else if (CONSP (flags[i]))
2353               {
2354                 Lisp_Object tail = flags[i];
2355
2356                 if (INTEGERP (XCONS (tail)->car)
2357                     && (charset = XINT (XCONS (tail)->car),
2358                         CHARSET_VALID_P (charset))
2359                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2360                   {
2361                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2362                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2363                   }
2364                 else
2365                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2366                 tail = XCONS (tail)->cdr;
2367                 while (CONSP (tail))
2368                   {
2369                     if (INTEGERP (XCONS (tail)->car)
2370                         && (charset = XINT (XCONS (tail)->car),
2371                             CHARSET_VALID_P (charset))
2372                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2373                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2374                         = i;
2375                     else if (EQ (XCONS (tail)->car, Qt))
2376                       default_reg_bits |= 1 << i;
2377                     tail = XCONS (tail)->cdr;
2378                   }
2379               }
2380             else
2381               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2382
2383             CODING_SPEC_ISO_DESIGNATION (coding, i)
2384               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2385           }
2386
2387         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2388           {
2389             /* REG 1 can be used only by locking shift in 7-bit env.  */
2390             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2391               default_reg_bits &= ~2;
2392             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2393               /* Without any shifting, only REG 0 and 1 can be used.  */
2394               default_reg_bits &= 3;
2395           }
2396
2397         for (charset = 0; charset <= MAX_CHARSET; charset++)
2398           if (CHARSET_VALID_P (charset)
2399               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2400                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2401             {
2402               /* We have not yet decided where to designate CHARSET.  */
2403               int reg_bits = default_reg_bits;
2404
2405               if (CHARSET_CHARS (charset) == 96)
2406                 /* A charset of CHARS96 can't be designated to REG 0.  */
2407                 reg_bits &= ~1;
2408
2409               if (reg_bits)
2410                 /* There exist some default graphic register.  */
2411                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2412                   = (reg_bits & 1
2413                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2414               else
2415                 /* We anyway have to designate CHARSET to somewhere.  */
2416                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2417                   = (CHARSET_CHARS (charset) == 94
2418                      ? 0
2419                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2420                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2421                         ? 1
2422                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2423                            ? 2 : 0)));
2424             }
2425       }
2426       coding->require_flushing = 1;
2427       break;
2428
2429     case 3:
2430       coding->type = coding_type_big5;
2431       coding->flags
2432         = (NILP (XVECTOR (coding_system)->contents[4])
2433            ? CODING_FLAG_BIG5_HKU
2434            : CODING_FLAG_BIG5_ETEN);
2435       break;
2436
2437     case 4:
2438       coding->type = coding_type_ccl;
2439       {
2440         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2441         if (CONSP  (val)
2442             && VECTORP (XCONS (val)->car)
2443             && VECTORP (XCONS (val)->cdr))
2444           {
2445             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2446             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2447           }
2448         else
2449           goto label_invalid_coding_system;
2450       }
2451       coding->require_flushing = 1;
2452       break;
2453
2454     default:
2455       if (EQ (type, Qt))
2456         coding->type = coding_type_undecided;
2457       else
2458         coding->type = coding_type_no_conversion;
2459       break;
2460     }
2461   return 0;
2462
2463  label_invalid_coding_system:
2464   coding->type = coding_type_no_conversion;
2465   coding->eol_type = CODING_EOL_LF;
2466   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2467     = Qnil;
2468   return -1;
2469 }
2470
2471 /* Emacs has a mechanism to automatically detect a coding system if it
2472    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2473    it's impossible to distinguish some coding systems accurately
2474    because they use the same range of codes.  So, at first, coding
2475    systems are categorized into 7, those are:
2476
2477    o coding-category-emacs-mule
2478
2479         The category for a coding system which has the same code range
2480         as Emacs' internal format.  Assigned the coding-system (Lisp
2481         symbol) `emacs-mule' by default.
2482
2483    o coding-category-sjis
2484
2485         The category for a coding system which has the same code range
2486         as SJIS.  Assigned the coding-system (Lisp
2487         symbol) `japanese-shift-jis' by default.
2488
2489    o coding-category-iso-7
2490
2491         The category for a coding system which has the same code range
2492         as ISO2022 of 7-bit environment.  This doesn't use any locking
2493         shift and single shift functions.  Assigned the coding-system
2494         (Lisp symbol) `iso-2022-7bit' by default.
2495
2496    o coding-category-iso-8-1
2497
2498         The category for a coding system which has the same code range
2499         as ISO2022 of 8-bit environment and graphic plane 1 used only
2500         for DIMENSION1 charset.  This doesn't use any locking shift
2501         and single shift functions.  Assigned the coding-system (Lisp
2502         symbol) `iso-latin-1' by default.
2503
2504    o coding-category-iso-8-2
2505
2506         The category for a coding system which has the same code range
2507         as ISO2022 of 8-bit environment and graphic plane 1 used only
2508         for DIMENSION2 charset.  This doesn't use any locking shift
2509         and single shift functions.  Assigned the coding-system (Lisp
2510         symbol) `japanese-iso-8bit' by default.
2511
2512    o coding-category-iso-7-else
2513
2514         The category for a coding system which has the same code range
2515         as ISO2022 of 7-bit environemnt but uses locking shift or
2516         single shift functions.  Assigned the coding-system (Lisp
2517         symbol) `iso-2022-7bit-lock' by default.
2518
2519    o coding-category-iso-8-else
2520
2521         The category for a coding system which has the same code range
2522         as ISO2022 of 8-bit environemnt but uses locking shift or
2523         single shift functions.  Assigned the coding-system (Lisp
2524         symbol) `iso-2022-8bit-ss2' by default.
2525
2526    o coding-category-big5
2527
2528         The category for a coding system which has the same code range
2529         as BIG5.  Assigned the coding-system (Lisp symbol)
2530         `cn-big5' by default.
2531
2532    o coding-category-binary
2533
2534         The category for a coding system not categorized in any of the
2535         above.  Assigned the coding-system (Lisp symbol)
2536         `no-conversion' by default.
2537
2538    Each of them is a Lisp symbol and the value is an actual
2539    `coding-system's (this is also a Lisp symbol) assigned by a user.
2540    What Emacs does actually is to detect a category of coding system.
2541    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2542    decide only one possible category, it selects a category of the
2543    highest priority.  Priorities of categories are also specified by a
2544    user in a Lisp variable `coding-category-list'.
2545
2546 */
2547
2548 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2549    If it detects possible coding systems, return an integer in which
2550    appropriate flag bits are set.  Flag bits are defined by macros
2551    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2552
2553 int
2554 detect_coding_mask (src, src_bytes)
2555      unsigned char *src;
2556      int src_bytes;
2557 {
2558   register unsigned char c;
2559   unsigned char *src_end = src + src_bytes;
2560   int mask;
2561
2562   /* At first, skip all ASCII characters and control characters except
2563      for three ISO2022 specific control characters.  */
2564  label_loop_detect_coding:
2565   while (src < src_end)
2566     {
2567       c = *src;
2568       if (c >= 0x80
2569           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2570         break;
2571       src++;
2572     }
2573
2574   if (src >= src_end)
2575     /* We found nothing other than ASCII.  There's nothing to do.  */
2576     return CODING_CATEGORY_MASK_ANY;
2577
2578   /* The text seems to be encoded in some multilingual coding system.
2579      Now, try to find in which coding system the text is encoded.  */
2580   if (c < 0x80)
2581     {
2582       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2583       /* C is an ISO2022 specific control code of C0.  */
2584       mask = detect_coding_iso2022 (src, src_end);
2585       src++;
2586       if (mask == CODING_CATEGORY_MASK_ANY)
2587         /* No valid ISO2022 code follows C.  Try again.  */
2588         goto label_loop_detect_coding;
2589     }
2590   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2591     /* C is an ISO2022 specific control code of C1,
2592        or the first byte of SJIS's 2-byte character code,
2593        or a leading code of Emacs.  */
2594     mask = (detect_coding_iso2022 (src, src_end)
2595             | detect_coding_sjis (src, src_end)
2596             | detect_coding_emacs_mule (src, src_end));
2597
2598   else if (c == ISO_CODE_CSI
2599            && (src < src_end
2600               && (*src == ']'
2601                   || (src + 1 < src_end
2602                       && src[1] == ']'
2603                       && (*src == '0' || *src == '1' || *src == '2')))))
2604     /* C is an ISO2022's control-sequence-introducer.  */
2605     mask = (detect_coding_iso2022 (src, src_end)
2606             | detect_coding_sjis (src, src_end)
2607             | detect_coding_emacs_mule (src, src_end));
2608
2609   else if (c < 0xA0)
2610     /* C is the first byte of SJIS character code,
2611        or a leading-code of Emacs.  */
2612     mask = (detect_coding_sjis (src, src_end)
2613             | detect_coding_emacs_mule (src, src_end));
2614
2615   else
2616     /* C is a character of ISO2022 in graphic plane right,
2617        or a SJIS's 1-byte character code (i.e. JISX0201),
2618        or the first byte of BIG5's 2-byte code.  */
2619     mask = (detect_coding_iso2022 (src, src_end)
2620             | detect_coding_sjis (src, src_end)
2621             | detect_coding_big5 (src, src_end));
2622
2623   return mask;
2624 }
2625
2626 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2627    The information of the detected coding system is set in CODING.  */
2628
2629 void
2630 detect_coding (coding, src, src_bytes)
2631      struct coding_system *coding;
2632      unsigned char *src;
2633      int src_bytes;
2634 {
2635   int mask = detect_coding_mask (src, src_bytes);
2636   int idx;
2637
2638   if (mask == CODING_CATEGORY_MASK_ANY)
2639     /* We found nothing other than ASCII.  There's nothing to do.  */
2640     return;
2641
2642   if (!mask)
2643     /* The source text seems to be encoded in unknown coding system.
2644        Emacs regards the category of such a kind of coding system as
2645        `coding-category-binary'.  We assume that a user has assigned
2646        an appropriate coding system for a `coding-category-binary'.  */
2647     idx = CODING_CATEGORY_IDX_BINARY;
2648   else
2649     {
2650       /* We found some plausible coding systems.  Let's use a coding
2651          system of the highest priority.  */
2652       Lisp_Object val = Vcoding_category_list;
2653
2654       if (CONSP (val))
2655         while (!NILP (val))
2656           {
2657             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2658             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2659               break;
2660             val = XCONS (val)->cdr;
2661           }
2662       else
2663         val = Qnil;
2664
2665       if (NILP (val))
2666         {
2667           /* For unknown reason, `Vcoding_category_list' contains none
2668              of found categories.  Let's use any of them.  */
2669           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2670             if (mask & (1 << idx))
2671               break;
2672         }
2673     }
2674   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2675 }
2676
2677 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2678    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2679    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2680
2681 int
2682 detect_eol_type (src, src_bytes)
2683      unsigned char *src;
2684      int src_bytes;
2685 {
2686   unsigned char *src_end = src + src_bytes;
2687   unsigned char c;
2688
2689   while (src < src_end)
2690     {
2691       c = *src++;
2692       if (c == '\n')
2693         return CODING_EOL_LF;
2694       else if (c == '\r')
2695         {
2696           if (src < src_end && *src == '\n')
2697             return CODING_EOL_CRLF;
2698           else
2699             return CODING_EOL_CR;
2700         }
2701     }
2702   return CODING_EOL_UNDECIDED;
2703 }
2704
2705 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2706    is encoded.  If it detects an appropriate format of end-of-line, it
2707    sets the information in *CODING.  */
2708
2709 void
2710 detect_eol (coding, src, src_bytes)
2711      struct coding_system *coding;
2712      unsigned char *src;
2713      int src_bytes;
2714 {
2715   Lisp_Object val;
2716   int eol_type = detect_eol_type (src, src_bytes);
2717
2718   if (eol_type == CODING_EOL_UNDECIDED)
2719     /*  We found no end-of-line in the source text.  */
2720     return;
2721
2722   val = Fget (coding->symbol, Qeol_type);
2723   if (VECTORP (val) && XVECTOR (val)->size == 3)
2724     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2725 }
2726
2727 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2728    decoding, it may detect coding system and format of end-of-line if
2729    those are not yet decided.  */
2730
2731 int
2732 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2733      struct coding_system *coding;
2734      unsigned char *source, *destination;
2735      int src_bytes, dst_bytes;
2736      int *consumed;
2737 {
2738   int produced;
2739
2740   if (src_bytes <= 0)
2741     {
2742       *consumed = 0;
2743       return 0;
2744     }
2745
2746   if (coding->type == coding_type_undecided)
2747     detect_coding (coding, source, src_bytes);
2748
2749   if (coding->eol_type == CODING_EOL_UNDECIDED)
2750     detect_eol (coding, source, src_bytes);
2751
2752   coding->carryover_size = 0;
2753   switch (coding->type)
2754     {
2755     case coding_type_no_conversion:
2756     label_no_conversion:
2757       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2758       bcopy (source, destination, produced);
2759       *consumed = produced;
2760       break;
2761
2762     case coding_type_emacs_mule:
2763     case coding_type_undecided:
2764       if (coding->eol_type == CODING_EOL_LF
2765           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2766         goto label_no_conversion;
2767       produced = decode_eol (coding, source, destination,
2768                              src_bytes, dst_bytes, consumed);
2769       break;
2770
2771     case coding_type_sjis:
2772       produced = decode_coding_sjis_big5 (coding, source, destination,
2773                                           src_bytes, dst_bytes, consumed,
2774                                           1);
2775       break;
2776
2777     case coding_type_iso2022:
2778       produced = decode_coding_iso2022 (coding, source, destination,
2779                                         src_bytes, dst_bytes, consumed);
2780       break;
2781
2782     case coding_type_big5:
2783       produced = decode_coding_sjis_big5 (coding, source, destination,
2784                                           src_bytes, dst_bytes, consumed,
2785                                           0);
2786       break;
2787
2788     case coding_type_ccl:
2789       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2790                              src_bytes, dst_bytes, consumed);
2791       break;
2792     }
2793
2794   return produced;
2795 }
2796
2797 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2798
2799 int
2800 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2801      struct coding_system *coding;
2802      unsigned char *source, *destination;
2803      int src_bytes, dst_bytes;
2804      int *consumed;
2805 {
2806   int produced;
2807
2808   switch (coding->type)
2809     {
2810     case coding_type_no_conversion:
2811     label_no_conversion:
2812       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2813       if (produced > 0)
2814         {
2815           bcopy (source, destination, produced);
2816           if (coding->selective)
2817             {
2818               unsigned char *p = destination, *pend = destination + produced;
2819               while (p < pend)
2820                 if (*p++ == '\015') p[-1] = '\n';
2821             }
2822         }
2823       *consumed = produced;
2824       break;
2825
2826     case coding_type_emacs_mule:
2827     case coding_type_undecided:
2828       if (coding->eol_type == CODING_EOL_LF
2829           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2830         goto label_no_conversion;
2831       produced = encode_eol (coding, source, destination,
2832                              src_bytes, dst_bytes, consumed);
2833       break;
2834
2835     case coding_type_sjis:
2836       produced = encode_coding_sjis_big5 (coding, source, destination,
2837                                           src_bytes, dst_bytes, consumed,
2838                                           1);
2839       break;
2840
2841     case coding_type_iso2022:
2842       produced = encode_coding_iso2022 (coding, source, destination,
2843                                         src_bytes, dst_bytes, consumed);
2844       break;
2845
2846     case coding_type_big5:
2847       produced = encode_coding_sjis_big5 (coding, source, destination,
2848                                           src_bytes, dst_bytes, consumed,
2849                                           0);
2850       break;
2851
2852     case coding_type_ccl:
2853       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2854                              src_bytes, dst_bytes, consumed);
2855       break;
2856     }
2857
2858   return produced;
2859 }
2860
2861 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2862
2863 /* Return maximum size (bytes) of a buffer enough for decoding
2864    SRC_BYTES of text encoded in CODING.  */
2865
2866 int
2867 decoding_buffer_size (coding, src_bytes)
2868      struct coding_system *coding;
2869      int src_bytes;
2870 {
2871   int magnification;
2872
2873   if (coding->type == coding_type_iso2022)
2874     magnification = 3;
2875   else if (coding->type == coding_type_ccl)
2876     magnification = coding->spec.ccl.decoder.buf_magnification;
2877   else
2878     magnification = 2;
2879
2880   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2881 }
2882
2883 /* Return maximum size (bytes) of a buffer enough for encoding
2884    SRC_BYTES of text to CODING.  */
2885
2886 int
2887 encoding_buffer_size (coding, src_bytes)
2888      struct coding_system *coding;
2889      int src_bytes;
2890 {
2891   int magnification;
2892
2893   if (coding->type == coding_type_ccl)
2894     magnification = coding->spec.ccl.encoder.buf_magnification;
2895   else
2896     magnification = 3;
2897
2898   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2899 }
2900
2901 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2902 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2903 #endif
2904
2905 char *conversion_buffer;
2906 int conversion_buffer_size;
2907
2908 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2909    or decoding.  Sufficient memory is allocated automatically.  If we
2910    run out of memory, return NULL.  */
2911
2912 char *
2913 get_conversion_buffer (size)
2914      int size;
2915 {
2916   if (size > conversion_buffer_size)
2917     {
2918       char *buf;
2919       int real_size = conversion_buffer_size * 2;
2920
2921       while (real_size < size) real_size *= 2;
2922       buf = (char *) xmalloc (real_size);
2923       xfree (conversion_buffer);
2924       conversion_buffer = buf;
2925       conversion_buffer_size = real_size;
2926     }
2927   return conversion_buffer;
2928 }
2929
2930 \f
2931 #ifdef emacs
2932 /*** 7. Emacs Lisp library functions ***/
2933
2934 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2935        1, 1, 0,
2936   "Return coding-spec of CODING-SYSTEM.\n\
2937 If CODING-SYSTEM is not a valid coding-system, return nil.")
2938   (obj)
2939      Lisp_Object obj;
2940 {
2941   while (SYMBOLP (obj) && !NILP (obj))
2942     obj = Fget (obj, Qcoding_system);
2943   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2944           ? Qnil : obj);
2945 }
2946
2947 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2948   "Return t if OBJECT is nil or a coding-system.\n\
2949 See document of make-coding-system for coding-system object.")
2950   (obj)
2951      Lisp_Object obj;
2952 {
2953   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2954 }
2955
2956 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2957        Sread_non_nil_coding_system, 1, 1, 0,
2958   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2959   (prompt)
2960      Lisp_Object prompt;
2961 {
2962   Lisp_Object val;
2963   do
2964     {
2965       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2966                               Qt, Qnil, Qnil, Qnil);
2967     }
2968   while (XSTRING (val)->size == 0);
2969   return (Fintern (val, Qnil));
2970 }
2971
2972 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2973   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2974   (prompt)
2975      Lisp_Object prompt;
2976 {
2977   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2978                                       Qt, Qnil, Qnil, Qnil);
2979   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2980 }
2981
2982 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2983        1, 1, 0,
2984   "Check validity of CODING-SYSTEM.\n\
2985 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2986 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2987 The value of property should be a vector of length 5.")
2988   (coding_system)
2989      Lisp_Object coding_system;
2990 {
2991   CHECK_SYMBOL (coding_system, 0);
2992   if (!NILP (Fcoding_system_p (coding_system)))
2993     return coding_system;
2994   while (1)
2995     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2996 }
2997
2998 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2999        2, 2, 0,
3000   "Detect coding-system of the text in the region between START and END.\n\
3001 Return a list of possible coding-systems ordered by priority.\n\
3002 If only ASCII characters are found, it returns `undecided'\n\
3003  or its subsidiary coding-system according to a detected end-of-line format.")
3004   (b, e)
3005      Lisp_Object b, e;
3006 {
3007   int coding_mask, eol_type;
3008   Lisp_Object val;
3009   int beg, end;
3010
3011   validate_region (&b, &e);
3012   beg = XINT (b), end = XINT (e);
3013   if (beg < GPT && end >= GPT) move_gap (end);
3014
3015   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3016   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3017
3018   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3019     {
3020       val = intern ("undecided");
3021       if (eol_type != CODING_EOL_UNDECIDED)
3022         {
3023           Lisp_Object val2 = Fget (val, Qeol_type);
3024           if (VECTORP (val2))
3025             val = XVECTOR (val2)->contents[eol_type];
3026         }
3027     }
3028   else
3029     {
3030       Lisp_Object val2;
3031
3032       /* At first, gather possible coding-systems in VAL in a reverse
3033          order.  */
3034       val = Qnil;
3035       for (val2 = Vcoding_category_list;
3036            !NILP (val2);
3037            val2 = XCONS (val2)->cdr)
3038         {
3039           int idx
3040             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3041           if (coding_mask & (1 << idx))
3042             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3043         }
3044
3045       /* Then, change the order of the list, while getting subsidiary
3046          coding-systems.  */
3047       val2 = val;
3048       val = Qnil;
3049       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3050         {
3051           if (eol_type == CODING_EOL_UNDECIDED)
3052             val = Fcons (XCONS (val2)->car, val);
3053           else
3054             {
3055               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3056               if (VECTORP (val3))
3057                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3058               else
3059                 val = Fcons (XCONS (val2)->car, val);
3060             }
3061         }
3062     }
3063
3064   return val;
3065 }
3066
3067 /* Scan text in the region between *BEGP and *ENDP, skip characters
3068    which we never have to encode to (iff ENCODEP is 1) or decode from
3069    coding system CODING at the head and tail, then set BEGP and ENDP
3070    to the addresses of start and end of the text we actually convert.  */
3071
3072 void
3073 shrink_conversion_area (begp, endp, coding, encodep)
3074      unsigned char **begp, **endp;
3075      struct coding_system *coding;
3076      int encodep;
3077 {
3078   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3079
3080   if (coding->eol_type != CODING_EOL_LF
3081       && coding->eol_type != CODING_EOL_UNDECIDED)
3082     /* Since we anyway have to convert end-of-line format, it is not
3083        worth skipping at most 100 bytes or so.  */
3084     return;
3085
3086   if (encodep)                  /* for encoding */
3087     {
3088       switch (coding->type)
3089         {
3090         case coding_type_no_conversion:
3091         case coding_type_emacs_mule:
3092         case coding_type_undecided:
3093           /* We need no conversion.  */
3094           *begp = *endp;
3095           return;
3096         case coding_type_ccl:
3097           /* We can't skip any data.  */
3098           return;
3099         case coding_type_iso2022:
3100           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3101             {
3102               unsigned char *bol = beg_addr;
3103               while (beg_addr < end_addr && *beg_addr < 0x80)
3104                 {
3105                   beg_addr++;
3106                   if (*(beg_addr - 1) == '\n')
3107                     bol = beg_addr;
3108                 }
3109               beg_addr = bol;
3110               goto label_skip_tail;
3111             }
3112           /* fall down ... */
3113         default:
3114           /* We can skip all ASCII characters at the head and tail.  */
3115           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3116         label_skip_tail:
3117           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3118           break;
3119         }
3120     }
3121   else                          /* for decoding */
3122     {
3123       switch (coding->type)
3124         {
3125         case coding_type_no_conversion:
3126           /* We need no conversion.  */
3127           *begp = *endp;
3128           return;
3129         case coding_type_emacs_mule:
3130           if (coding->eol_type == CODING_EOL_LF)
3131             {
3132               /* We need no conversion.  */
3133               *begp = *endp;
3134               return;
3135             }
3136           /* We can skip all but carriage-return.  */
3137           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3138           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3139           break;
3140         case coding_type_sjis:
3141         case coding_type_big5:
3142           /* We can skip all ASCII characters at the head.  */
3143           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3144           /* We can skip all ASCII characters at the tail except for
3145              the second byte of SJIS or BIG5 code.  */
3146           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3147           if (end_addr != *endp)
3148             end_addr++;
3149           break;
3150         case coding_type_ccl:
3151           /* We can't skip any data.  */
3152           return;
3153         default:                /* i.e. case coding_type_iso2022: */
3154           {
3155             unsigned char c;
3156
3157             /* We can skip all ASCII characters except for a few
3158                control codes at the head.  */
3159             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3160                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3161                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3162               beg_addr++;
3163           }
3164           break;
3165         }
3166     }
3167   *begp = beg_addr;
3168   *endp = end_addr;
3169   return;
3170 }
3171
3172 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3173    text between B and E.  B and E are buffer position.  */
3174
3175 Lisp_Object
3176 code_convert_region (b, e, coding, encodep)
3177      Lisp_Object b, e;
3178      struct coding_system *coding;
3179      int encodep;
3180 {
3181   int beg, end, len, consumed, produced;
3182   char *buf;
3183   unsigned char *begp, *endp;
3184   int pos = PT;
3185
3186   validate_region (&b, &e);
3187   beg = XINT (b), end = XINT (e);
3188   if (beg < GPT && end >= GPT)
3189     move_gap (end);
3190
3191   if (encodep && !NILP (coding->pre_write_conversion))
3192     {
3193       /* We must call a pre-conversion function which may put a new
3194          text to be converted in a new buffer.  */
3195       struct buffer *old = current_buffer, *new;
3196
3197       TEMP_SET_PT (beg);
3198       call2 (coding->pre_write_conversion, b, e);
3199       if (old != current_buffer)
3200         {
3201           /* Replace the original text by the text just generated.  */
3202           len = ZV - BEGV;
3203           new = current_buffer;
3204           set_buffer_internal (old);
3205           del_range (beg, end);
3206           insert_from_buffer (new, 1, len, 0);
3207           end = beg + len;
3208         }
3209     }
3210
3211   /* We may be able to shrink the conversion region.  */
3212   begp = POS_ADDR (beg); endp = begp + (end - beg);
3213   shrink_conversion_area (&begp, &endp, coding, encodep);
3214
3215   if (begp == endp)
3216     /* We need no conversion.  */
3217     len = end - beg;
3218   else
3219     {
3220       beg += begp - POS_ADDR (beg);
3221       end =  beg + (endp - begp);
3222
3223       if (encodep)
3224         len = encoding_buffer_size (coding, end - beg);
3225       else
3226         len = decoding_buffer_size (coding, end - beg);
3227       buf = get_conversion_buffer (len);
3228
3229       coding->last_block = 1;
3230       produced = (encodep
3231                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3232                                    &consumed)
3233                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3234                                    &consumed));
3235
3236       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3237
3238       TEMP_SET_PT (beg);
3239       insert (buf, produced);
3240       del_range (PT, PT + end - beg);
3241       if (pos >= end)
3242         pos = PT + (pos - end);
3243       else if (pos > beg)
3244         pos = beg;
3245       TEMP_SET_PT (pos);
3246   }
3247
3248   if (!encodep && !NILP (coding->post_read_conversion))
3249     {
3250       /* We must call a post-conversion function which may alter
3251          the text just converted.  */
3252       Lisp_Object insval;
3253
3254       beg = XINT (b);
3255       TEMP_SET_PT (beg);
3256       insval = call1 (coding->post_read_conversion, make_number (len));
3257       CHECK_NUMBER (insval, 0);
3258       len = XINT (insval);
3259     }
3260
3261   return make_number (len);
3262 }
3263
3264 Lisp_Object
3265 code_convert_string (str, coding, encodep, nocopy)
3266      Lisp_Object str, nocopy;
3267      struct coding_system *coding;
3268      int encodep;
3269 {
3270   int len, consumed, produced;
3271   char *buf;
3272   unsigned char *begp, *endp;
3273   int head_skip, tail_skip;
3274   struct gcpro gcpro1;
3275
3276   if (encodep && !NILP (coding->pre_write_conversion)
3277       || !encodep && !NILP (coding->post_read_conversion))
3278     {
3279       /* Since we have to call Lisp functions which assume target text
3280          is in a buffer, after setting a temporary buffer, call
3281          code_convert_region.  */
3282       int count = specpdl_ptr - specpdl;
3283       int len = XSTRING (str)->size;
3284       Lisp_Object result;
3285       struct buffer *old = current_buffer;
3286
3287       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3288       temp_output_buffer_setup (" *code-converting-work*");
3289       set_buffer_internal (XBUFFER (Vstandard_output));
3290       insert_from_string (str, 0, len, 0);
3291       code_convert_region (make_number (BEGV), make_number (ZV),
3292                            coding, encodep);
3293       result = make_buffer_string (BEGV, ZV, 0);
3294       set_buffer_internal (old);
3295       return unbind_to (count, result);
3296     }
3297
3298   /* We may be able to shrink the conversion region.  */
3299   begp = XSTRING (str)->data;
3300   endp = begp + XSTRING (str)->size;
3301   shrink_conversion_area (&begp, &endp, coding, encodep);
3302
3303   if (begp == endp)
3304     /* We need no conversion.  */
3305     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3306
3307   head_skip = begp - XSTRING (str)->data;
3308   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3309
3310   GCPRO1 (str);
3311
3312   if (encodep)
3313     len = encoding_buffer_size (coding, endp - begp);
3314   else
3315     len = decoding_buffer_size (coding, endp - begp);
3316   buf = get_conversion_buffer (len + head_skip + tail_skip);
3317
3318   bcopy (XSTRING (str)->data, buf, head_skip);
3319   coding->last_block = 1;
3320   produced = (encodep
3321               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3322                                buf + head_skip, endp - begp, len, &consumed)
3323               : decode_coding (coding, XSTRING (str)->data + head_skip,
3324                                buf + head_skip, endp - begp, len, &consumed));
3325   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3326          buf + head_skip + produced,
3327          tail_skip);
3328
3329   UNGCPRO;
3330
3331   return make_string (buf, head_skip + produced + tail_skip);
3332 }
3333
3334 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3335        3, 3, "r\nzCoding system: ",
3336   "Decode current region by specified coding system.\n\
3337 When called from a program, takes three arguments:\n\
3338 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3339 Return length of decoded text.")
3340   (b, e, coding_system)
3341      Lisp_Object b, e, coding_system;
3342 {
3343   struct coding_system coding;
3344
3345   CHECK_NUMBER_COERCE_MARKER (b, 0);
3346   CHECK_NUMBER_COERCE_MARKER (e, 1);
3347   CHECK_SYMBOL (coding_system, 2);
3348
3349   if (NILP (coding_system))
3350     return make_number (XFASTINT (e) - XFASTINT (b));
3351   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3352     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3353
3354   return code_convert_region (b, e, &coding, 0);
3355 }
3356
3357 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3358        3, 3, "r\nzCoding system: ",
3359   "Encode current region by specified coding system.\n\
3360 When called from a program, takes three arguments:\n\
3361 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3362 Return length of encoded text.")
3363   (b, e, coding_system)
3364      Lisp_Object b, e, coding_system;
3365 {
3366   struct coding_system coding;
3367
3368   CHECK_NUMBER_COERCE_MARKER (b, 0);
3369   CHECK_NUMBER_COERCE_MARKER (e, 1);
3370   CHECK_SYMBOL (coding_system, 2);
3371
3372   if (NILP (coding_system))
3373     return make_number (XFASTINT (e) - XFASTINT (b));
3374   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3375     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3376
3377   return code_convert_region (b, e, &coding, 1);
3378 }
3379
3380 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3381        2, 3, 0,
3382   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3383 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3384 of decoding.")
3385   (string, coding_system, nocopy)
3386      Lisp_Object string, coding_system, nocopy;
3387 {
3388   struct coding_system coding;
3389
3390   CHECK_STRING (string, 0);
3391   CHECK_SYMBOL (coding_system, 1);
3392
3393   if (NILP (coding_system))
3394     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3395   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3396     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3397
3398   return code_convert_string (string, &coding, 0, nocopy);
3399 }
3400
3401 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3402        2, 3, 0,
3403   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3404 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3405 of encoding.")
3406   (string, coding_system, nocopy)
3407      Lisp_Object string, coding_system, nocopy;
3408 {
3409   struct coding_system coding;
3410
3411   CHECK_STRING (string, 0);
3412   CHECK_SYMBOL (coding_system, 1);
3413
3414   if (NILP (coding_system))
3415     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3416   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3417     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3418
3419   return code_convert_string (string, &coding, 1, nocopy);
3420 }
3421
3422 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3423   "Decode a JISX0208 character of shift-jis encoding.\n\
3424 CODE is the character code in SJIS.\n\
3425 Return the corresponding character.")
3426   (code)
3427      Lisp_Object code;
3428 {
3429   unsigned char c1, c2, s1, s2;
3430   Lisp_Object val;
3431
3432   CHECK_NUMBER (code, 0);
3433   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3434   DECODE_SJIS (s1, s2, c1, c2);
3435   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3436   return val;
3437 }
3438
3439 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3440   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3441 Return the corresponding character code in SJIS.")
3442   (ch)
3443      Lisp_Object ch;
3444 {
3445   int charset, c1, c2, s1, s2;
3446   Lisp_Object val;
3447
3448   CHECK_NUMBER (ch, 0);
3449   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3450   if (charset == charset_jisx0208)
3451     {
3452       ENCODE_SJIS (c1, c2, s1, s2);
3453       XSETFASTINT (val, (s1 << 8) | s2);
3454     }
3455   else
3456     XSETFASTINT (val, 0);
3457   return val;
3458 }
3459
3460 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3461   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3462 CODE is the character code in BIG5.\n\
3463 Return the corresponding character.")
3464   (code)
3465      Lisp_Object code;
3466 {
3467   int charset;
3468   unsigned char b1, b2, c1, c2;
3469   Lisp_Object val;
3470
3471   CHECK_NUMBER (code, 0);
3472   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3473   DECODE_BIG5 (b1, b2, charset, c1, c2);
3474   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3475   return val;
3476 }
3477
3478 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3479   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3480 Return the corresponding character code in Big5.")
3481   (ch)
3482      Lisp_Object ch;
3483 {
3484   int charset, c1, c2, b1, b2;
3485   Lisp_Object val;
3486
3487   CHECK_NUMBER (ch, 0);
3488   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3489   if (charset == charset_big5_1 || charset == charset_big5_2)
3490     {
3491       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3492       XSETFASTINT (val, (b1 << 8) | b2);
3493     }
3494   else
3495     XSETFASTINT (val, 0);
3496   return val;
3497 }
3498
3499 DEFUN ("set-terminal-coding-system-internal",
3500        Fset_terminal_coding_system_internal,
3501        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3502   (coding_system)
3503      Lisp_Object coding_system;
3504 {
3505   CHECK_SYMBOL (coding_system, 0);
3506   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3507   return Qnil;
3508 }
3509
3510 DEFUN ("terminal-coding-system",
3511        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3512   "Return coding-system of your terminal.")
3513   ()
3514 {
3515   return terminal_coding.symbol;
3516 }
3517
3518 DEFUN ("set-keyboard-coding-system-internal",
3519        Fset_keyboard_coding_system_internal,
3520        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3521   (coding_system)
3522      Lisp_Object coding_system;
3523 {
3524   CHECK_SYMBOL (coding_system, 0);
3525   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3526   return Qnil;
3527 }
3528
3529 DEFUN ("keyboard-coding-system",
3530        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3531   "Return coding-system of what is sent from terminal keyboard.")
3532   ()
3533 {
3534   return keyboard_coding.symbol;
3535 }
3536
3537 \f
3538 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3539        Sfind_operation_coding_system,  1, MANY, 0,
3540   "Choose a coding system for an operation based on the target name.\n\
3541 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3542 DECODING-SYSTEM is the coding system to use for decoding\n\
3543 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3544 for encoding (in case OPERATION does encoding).\n\
3545 \n\
3546 The first argument OPERATION specifies an I/O primitive:\n\
3547   For file I/O, `insert-file-contents' or `write-region'.\n\
3548   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3549   For network I/O, `open-network-stream'.\n\
3550 \n\
3551 The remaining arguments should be the same arguments that were passed\n\
3552 to the primitive.  Depending on which primitive, one of those arguments\n\
3553 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3554 whichever argument specifies the file name is TARGET.\n\
3555 \n\
3556 TARGET has a meaning which depends on OPERATION:\n\
3557   For file I/O, TARGET is a file name.\n\
3558   For process I/O, TARGET is a process name.\n\
3559   For network I/O, TARGET is a service name or a port number\n\
3560 \n\
3561 This function looks up what specified for TARGET in,\n\
3562 `file-coding-system-alist', `process-coding-system-alist',\n\
3563 or `network-coding-system-alist' depending on OPERATION.\n\
3564 They may specify a coding system, a cons of coding systems,\n\
3565 or a function symbol to call.\n\
3566 In the last case, we call the function with one argument,\n\
3567 which is a list of all the arguments given to this function.")
3568   (nargs, args)
3569      int nargs;
3570      Lisp_Object *args;
3571 {
3572   Lisp_Object operation, target_idx, target, val;
3573   register Lisp_Object chain;
3574
3575   if (nargs < 2)
3576     error ("Too few arguments");
3577   operation = args[0];
3578   if (!SYMBOLP (operation)
3579       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3580     error ("Invalid first arguement");
3581   if (nargs < 1 + XINT (target_idx))
3582     error ("Too few arguments for operation: %s",
3583            XSYMBOL (operation)->name->data);
3584   target = args[XINT (target_idx) + 1];
3585   if (!(STRINGP (target)
3586         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3587     error ("Invalid %dth argument", XINT (target_idx) + 1);
3588
3589   chain = ((EQ (operation, Qinsert_file_contents)
3590             || EQ (operation, Qwrite_region))
3591            ? Vfile_coding_system_alist
3592            : (EQ (operation, Qopen_network_stream)
3593               ? Vnetwork_coding_system_alist
3594               : Vprocess_coding_system_alist));
3595   if (NILP (chain))
3596     return Qnil;
3597
3598   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3599     {
3600       Lisp_Object elt = XCONS (chain)->car;
3601
3602       if (CONSP (elt)
3603           && ((STRINGP (target)
3604                && STRINGP (XCONS (elt)->car)
3605                && fast_string_match (XCONS (elt)->car, target) >= 0)
3606               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3607         {
3608           val = XCONS (elt)->cdr;
3609           if (CONSP (val))
3610             return val;
3611           if (! SYMBOLP (val))
3612             return Qnil;
3613           if (! NILP (Fcoding_system_p (val)))
3614             return Fcons (val, val);
3615           if (!NILP (Ffboundp (val)))
3616             return call1 (val, Flist (nargs, args));
3617           return Qnil;
3618         }
3619     }
3620   return Qnil;
3621 }
3622
3623 #endif /* emacs */
3624
3625 \f
3626 /*** 8. Post-amble ***/
3627
3628 init_coding_once ()
3629 {
3630   int i;
3631
3632   /* Emacs' internal format specific initialize routine.  */
3633   for (i = 0; i <= 0x20; i++)
3634     emacs_code_class[i] = EMACS_control_code;
3635   emacs_code_class[0x0A] = EMACS_linefeed_code;
3636   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3637   for (i = 0x21 ; i < 0x7F; i++)
3638     emacs_code_class[i] = EMACS_ascii_code;
3639   emacs_code_class[0x7F] = EMACS_control_code;
3640   emacs_code_class[0x80] = EMACS_leading_code_composition;
3641   for (i = 0x81; i < 0xFF; i++)
3642     emacs_code_class[i] = EMACS_invalid_code;
3643   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3644   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3645   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3646   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3647
3648   /* ISO2022 specific initialize routine.  */
3649   for (i = 0; i < 0x20; i++)
3650     iso_code_class[i] = ISO_control_code;
3651   for (i = 0x21; i < 0x7F; i++)
3652     iso_code_class[i] = ISO_graphic_plane_0;
3653   for (i = 0x80; i < 0xA0; i++)
3654     iso_code_class[i] = ISO_control_code;
3655   for (i = 0xA1; i < 0xFF; i++)
3656     iso_code_class[i] = ISO_graphic_plane_1;
3657   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3658   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3659   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3660   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3661   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3662   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3663   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3664   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3665   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3666   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3667
3668   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3669   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3670
3671   setup_coding_system (Qnil, &keyboard_coding);
3672   setup_coding_system (Qnil, &terminal_coding);
3673
3674 #if defined (MSDOS) || defined (WINDOWSNT)
3675   system_eol_type = CODING_EOL_CRLF;
3676 #else
3677   system_eol_type = CODING_EOL_LF;
3678 #endif
3679 }
3680
3681 #ifdef emacs
3682
3683 syms_of_coding ()
3684 {
3685   Qtarget_idx = intern ("target-idx");
3686   staticpro (&Qtarget_idx);
3687
3688   /* Target FILENAME is the first argument.  */
3689   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3690   /* Target FILENAME is the third argument.  */
3691   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3692
3693   Qcall_process = intern ("call-process");
3694   staticpro (&Qcall_process);
3695   /* Target PROGRAM is the first argument.  */
3696   Fput (Qcall_process, Qtarget_idx, make_number (0));
3697
3698   Qcall_process_region = intern ("call-process-region");
3699   staticpro (&Qcall_process_region);
3700   /* Target PROGRAM is the third argument.  */
3701   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3702
3703   Qstart_process = intern ("start-process");
3704   staticpro (&Qstart_process);
3705   /* Target PROGRAM is the third argument.  */
3706   Fput (Qstart_process, Qtarget_idx, make_number (2));
3707
3708   Qopen_network_stream = intern ("open-network-stream");
3709   staticpro (&Qopen_network_stream);
3710   /* Target SERVICE is the fourth argument.  */
3711   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3712
3713   Qcoding_system = intern ("coding-system");
3714   staticpro (&Qcoding_system);
3715
3716   Qeol_type = intern ("eol-type");
3717   staticpro (&Qeol_type);
3718
3719   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3720   staticpro (&Qbuffer_file_coding_system);
3721
3722   Qpost_read_conversion = intern ("post-read-conversion");
3723   staticpro (&Qpost_read_conversion);
3724
3725   Qpre_write_conversion = intern ("pre-write-conversion");
3726   staticpro (&Qpre_write_conversion);
3727
3728   Qcoding_system_spec = intern ("coding-system-spec");
3729   staticpro (&Qcoding_system_spec);
3730
3731   Qcoding_system_p = intern ("coding-system-p");
3732   staticpro (&Qcoding_system_p);
3733
3734   Qcoding_system_error = intern ("coding-system-error");
3735   staticpro (&Qcoding_system_error);
3736
3737   Fput (Qcoding_system_error, Qerror_conditions,
3738         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3739   Fput (Qcoding_system_error, Qerror_message,
3740         build_string ("Invalid coding system"));
3741
3742   Qcoding_category_index = intern ("coding-category-index");
3743   staticpro (&Qcoding_category_index);
3744
3745   {
3746     int i;
3747     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3748       {
3749         coding_category_table[i] = intern (coding_category_name[i]);
3750         staticpro (&coding_category_table[i]);
3751         Fput (coding_category_table[i], Qcoding_category_index,
3752               make_number (i));
3753       }
3754   }
3755
3756   Qcharacter_unification_table = intern ("character-unification-table");
3757   staticpro (&Qcharacter_unification_table);
3758   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3759         make_number (0));
3760
3761   Qcharacter_unification_table_for_decode
3762     = intern ("character-unification-table-for-decode");
3763   staticpro (&Qcharacter_unification_table_for_decode);
3764
3765   Qcharacter_unification_table_for_encode
3766     = intern ("character-unification-table-for-encode");
3767   staticpro (&Qcharacter_unification_table_for_encode);
3768
3769   Qemacs_mule = intern ("emacs-mule");
3770   staticpro (&Qemacs_mule);
3771
3772   defsubr (&Scoding_system_spec);
3773   defsubr (&Scoding_system_p);
3774   defsubr (&Sread_coding_system);
3775   defsubr (&Sread_non_nil_coding_system);
3776   defsubr (&Scheck_coding_system);
3777   defsubr (&Sdetect_coding_region);
3778   defsubr (&Sdecode_coding_region);
3779   defsubr (&Sencode_coding_region);
3780   defsubr (&Sdecode_coding_string);
3781   defsubr (&Sencode_coding_string);
3782   defsubr (&Sdecode_sjis_char);
3783   defsubr (&Sencode_sjis_char);
3784   defsubr (&Sdecode_big5_char);
3785   defsubr (&Sencode_big5_char);
3786   defsubr (&Sset_terminal_coding_system_internal);
3787   defsubr (&Sterminal_coding_system);
3788   defsubr (&Sset_keyboard_coding_system_internal);
3789   defsubr (&Skeyboard_coding_system);
3790   defsubr (&Sfind_operation_coding_system);
3791
3792   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3793     "List of coding-categories (symbols) ordered by priority.");
3794   {
3795     int i;
3796
3797     Vcoding_category_list = Qnil;
3798     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3799       Vcoding_category_list
3800         = Fcons (coding_category_table[i], Vcoding_category_list);
3801   }
3802
3803   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3804     "A variable of internal use only.\n\
3805 If the value is a coding system, it is used for decoding on read operation.\n\
3806 If not, an appropriate element is used from one of the coding system alists:\n\
3807 There are three of such tables, `file-coding-system-alist',\n\
3808 `process-coding-system-alist', and `network-coding-system-alist'.");
3809   Vcoding_system_for_read = Qnil;
3810
3811   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3812     "A variable of internal use only.\n\
3813 If the value is a coding system, it is used for encoding on write operation.\n\
3814 If not, an appropriate element is used from one of the coding system alists:\n\
3815 There are three of such tables, `file-coding-system-alist',\n\
3816 `process-coding-system-alist', and `network-coding-system-alist'.");
3817   Vcoding_system_for_write = Qnil;
3818
3819   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3820     "Coding system used in the latest file or process I/O.");
3821   Vlast_coding_system_used = Qnil;
3822
3823   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3824     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3825   inhibit_eol_conversion = 0;
3826
3827   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3828     "Alist to decide a coding system to use for a file I/O operation.\n\
3829 The format is ((PATTERN . VAL) ...),\n\
3830 where PATTERN is a regular expression matching a file name,\n\
3831 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3832 If VAL is a coding system, it is used for both decoding and encoding\n\
3833 the file contents.\n\
3834 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3835 and the cdr part is used for encoding.\n\
3836 If VAL is a function symbol, the function must return a coding system\n\
3837 or a cons of coding systems which are used as above.\n\
3838 \n\
3839 See also the function `find-operation-coding-system'.");
3840   Vfile_coding_system_alist = Qnil;
3841
3842   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3843     "Alist to decide a coding system to use for a process I/O operation.\n\
3844 The format is ((PATTERN . VAL) ...),\n\
3845 where PATTERN is a regular expression matching a program name,\n\
3846 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3847 If VAL is a coding system, it is used for both decoding what received\n\
3848 from the program and encoding what sent to the program.\n\
3849 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3850 and the cdr part is used for encoding.\n\
3851 If VAL is a function symbol, the function must return a coding system\n\
3852 or a cons of coding systems which are used as above.\n\
3853 \n\
3854 See also the function `find-operation-coding-system'.");
3855   Vprocess_coding_system_alist = Qnil;
3856
3857   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3858     "Alist to decide a coding system to use for a network I/O operation.\n\
3859 The format is ((PATTERN . VAL) ...),\n\
3860 where PATTERN is a regular expression matching a network service name\n\
3861 or is a port number to connect to,\n\
3862 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3863 If VAL is a coding system, it is used for both decoding what received\n\
3864 from the network stream and encoding what sent to the network stream.\n\
3865 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3866 and the cdr part is used for encoding.\n\
3867 If VAL is a function symbol, the function must return a coding system\n\
3868 or a cons of coding systems which are used as above.\n\
3869 \n\
3870 See also the function `find-operation-coding-system'.");
3871   Vnetwork_coding_system_alist = Qnil;
3872
3873   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3874     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3875   eol_mnemonic_unix = ':';
3876
3877   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3878     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3879   eol_mnemonic_dos = '\\';
3880
3881   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3882     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3883   eol_mnemonic_mac = '/';
3884
3885   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3886     "Mnemonic character indicating end-of-line format is not yet decided.");
3887   eol_mnemonic_undecided = ':';
3888
3889   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3890     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3891   Venable_character_unification = Qt;
3892
3893   DEFVAR_LISP ("standard-character-unification-table-for-decode",
3894     &Vstandard_character_unification_table_for_decode,
3895     "Table for unifying characters when reading.");
3896   Vstandard_character_unification_table_for_decode = Qnil;
3897
3898   DEFVAR_LISP ("standard-character-unification-table-for-encode",
3899     &Vstandard_character_unification_table_for_encode,
3900     "Table for unifying characters when writing.");
3901   Vstandard_character_unification_table_for_encode = Qnil;
3902
3903   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3904     "Alist of charsets vs revision numbers.\n\
3905 While encoding, if a charset (car part of an element) is found,\n\
3906 designate it with the escape sequence identifing revision (cdr part of the element).");
3907   Vcharset_revision_alist = Qnil;
3908
3909   DEFVAR_LISP ("default-process-coding-system",
3910                &Vdefault_process_coding_system,
3911     "Cons of coding systems used for process I/O by default.\n\
3912 The car part is used for decoding a process output,\n\
3913 the cdr part is used for encoding a text to be sent to a process.");
3914   Vdefault_process_coding_system = Qnil;
3915 }
3916
3917 #endif /* emacs */