src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Other
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See section 6 for more details.
  81
  82 */
  83
  84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  85
  86   How end-of-line of a text is encoded depends on a system.  For
  87   instance, Unix's format is just one byte of `line-feed' code,
  88   whereas DOS's format is two-byte sequence of `carriage-return' and
  89   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  90
  91   Since text characters encoding and end-of-line encoding are
  92   independent, any coding system described above can take
  93   any format of end-of-line.  So, Emacs has information of format of
  94   end-of-line in each coding-system.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  99
 100   These functions check if a text between SRC and SRC_END is encoded
 101   in the coding system category XXX.  Each returns an integer value in
 102   which appropriate flag bits for the category XXX are set.  The flag
 103   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 104   template of these functions.  */
 105 #if 0
 106 int
 107 detect_coding_emacs_mule (src, src_end)
 108      unsigned char *src, *src_end;
 109 {
 110   ...
 111 }
 112 #endif
 113
 114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 115
 116   These functions decode SRC_BYTES length text at SOURCE encoded in
 117   CODING to Emacs' internal format (emacs-mule).  The resulting text
 118   goes to a place pointed to by DESTINATION, the length of which should
 119   not exceed DST_BYTES.  The number of bytes actually processed is
 120   returned as *CONSUMED.  The return value is the length of the decoded
 121   text.  Below is a template of these functions.  */
 122 #if 0
 123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 124      struct coding_system *coding;
 125      unsigned char *source, *destination;
 126      int src_bytes, dst_bytes;
 127      int *consumed;
 128 {
 129   ...
 130 }
 131 #endif
 132
 133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 134
 135   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 136   internal format (emacs-mule) to CODING.  The resulting text goes to
 137   a place pointed to by DESTINATION, the length of which should not
 138   exceed DST_BYTES.  The number of bytes actually processed is
 139   returned as *CONSUMED.  The return value is the length of the
 140   encoded text.  Below is a template of these functions.  */
 141 #if 0
 142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 143      struct coding_system *coding;
 144      unsigned char *source, *destination;
 145      int src_bytes, dst_bytes;
 146      int *consumed;
 147 {
 148   ...
 149 }
 150 #endif
 151
 152 /*** COMMONLY USED MACROS ***/
 153
 154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 155    THREE_MORE_BYTES safely get one, two, and three bytes from the
 156    source text respectively.  If there are not enough bytes in the
 157    source, they jump to `label_end_of_loop'.  The caller should set
 158    variables `src' and `src_end' to appropriate areas in advance.  */
 159
 160 #define ONE_MORE_BYTE(c1)       \
 161   do {                          \
 162     if (src < src_end)          \
 163       c1 = *src++;              \
 164     else                        \
 165       goto label_end_of_loop;   \
 166   } while (0)
 167
 168 #define TWO_MORE_BYTES(c1, c2)  \
 169   do {                          \
 170     if (src + 1 < src_end)      \
 171       c1 = *src++, c2 = *src++; \
 172     else                        \
 173       goto label_end_of_loop;   \
 174   } while (0)
 175
 176 #define THREE_MORE_BYTES(c1, c2, c3)            \
 177   do {                                          \
 178     if (src + 2 < src_end)                      \
 179       c1 = *src++, c2 = *src++, c3 = *src++;    \
 180     else                                        \
 181       goto label_end_of_loop;                   \
 182   } while (0)
 183
 184 /* The following three macros DECODE_CHARACTER_ASCII,
 185    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 186    the multi-byte form of a character of each class at the place
 187    pointed by `dst'.  The caller should set the variable `dst' to
 188    point to an appropriate area and the variable `coding' to point to
 189    the coding-system of the currently decoding text in advance.  */
 190
 191 /* Decode one ASCII character C.  */
 192
 193 #define DECODE_CHARACTER_ASCII(c)                               \
 194   do {                                                          \
 195     if (COMPOSING_P (coding->composing))                        \
 196       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 197     else                                                        \
 198       *dst++ = (c);                                             \
 199   } while (0)
 200
 201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 202    position-code is C.  */
 203
 204 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 205   do {                                                                  \
 206     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 207     if (COMPOSING_P (coding->composing))                                \
 208       *dst++ = leading_code + 0x20;                                     \
 209     else                                                                \
 210       *dst++ = leading_code;                                            \
 211     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 212       *dst++ = leading_code;                                            \
 213     *dst++ = (c) | 0x80;                                                \
 214   } while (0)
 215
 216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 217    position-codes are C1 and C2.  */
 218
 219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 220   do {                                                  \
 221     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 222     *dst++ = (c2) | 0x80;                               \
 223   } while (0)
 224
 225 \f
 226 /*** 1. Preamble ***/
 227
 228 #include <stdio.h>
 229
 230 #ifdef emacs
 231
 232 #include <config.h>
 233 #include "lisp.h"
 234 #include "buffer.h"
 235 #include "charset.h"
 236 #include "ccl.h"
 237 #include "coding.h"
 238 #include "window.h"
 239
 240 #else  /* not emacs */
 241
 242 #include "mulelib.h"
 243
 244 #endif /* not emacs */
 245
/* Lisp objects used by the coding-system machinery.
   NOTE(review): judging by the `Q' prefix these are presumably
   interned symbols (e.g. the property name `coding-system' mentioned
   in the general note above); their initialization is not visible in
   this part of the file -- confirm.  */
Lisp_Object Qcoding_system, Qeol_type;
Lisp_Object Qbuffer_file_coding_system;
Lisp_Object Qpost_read_conversion, Qpre_write_conversion;

/* Qinsert_file_contents and Qwrite_region are only declared here
   (they are defined in another file); the objects that follow are
   defined in this file.  */
extern Lisp_Object Qinsert_file_contents, Qwrite_region;
Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
Lisp_Object Qstart_process, Qopen_network_stream;
Lisp_Object Qtarget_idx;

/* Mnemonic character of each format of end-of-line.  */
int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
/* Mnemonic character to indicate format of end-of-line is not yet
   decided.  */
int eol_mnemonic_undecided;

/* Format of end-of-line decided by system.  This is CODING_EOL_LF on
   Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
int system_eol_type;
 264
 265 #ifdef emacs
 266
/* NOTE(review): presumably symbols used for validating coding-system
   objects and signaling errors; their use is not visible in this part
   of the file -- confirm.  */
Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;

/* Coding system emacs-mule is for converting only end-of-line format.  */
Lisp_Object Qemacs_mule;

/* Coding-systems are handed between Emacs Lisp programs and C internal
   routines by the following three variables.  */
/* Coding-system for reading files and receiving data from process.  */
Lisp_Object Vcoding_system_for_read;
/* Coding-system for writing files and sending data to process.  */
Lisp_Object Vcoding_system_for_write;
/* Coding-system actually used in the latest I/O.  */
Lisp_Object Vlast_coding_system_used;

/* Flag to inhibit code conversion of end-of-line format.  */
int inhibit_eol_conversion;

/* Coding-system that the terminal accepts for display.  */
struct coding_system terminal_coding;

/* Coding-system of what is sent from the terminal keyboard.  */
struct coding_system keyboard_coding;

/* NOTE(review): going by their names, alists consulted to choose a
   coding system for file, process, and network-stream I/O
   respectively; their format is not visible here -- confirm.  */
Lisp_Object Vfile_coding_system_alist;
Lisp_Object Vprocess_coding_system_alist;
Lisp_Object Vnetwork_coding_system_alist;
 291 Lisp_Object Vprocess_coding_system_alist;
 292 Lisp_Object Vnetwork_coding_system_alist;
 293
 294 #endif /* emacs */
 295
/* NOTE(review): presumably the symbol naming the property that holds
   a coding-category's index into the tables below -- confirm against
   the initialization code, which is not visible here.  */
Lisp_Object Qcoding_category_index;

/* List of symbols `coding-category-xxx' ordered by priority.  */
Lisp_Object Vcoding_category_list;

/* Table of coding-systems currently assigned to each coding-category.  */
Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];

/* Table of symbol names for each coding-category.  Eight names are
   listed; NOTE(review): their order presumably has to match the
   CODING_CATEGORY_IDX_XXX values defined elsewhere -- confirm.  */
char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  "coding-category-emacs-mule",
  "coding-category-sjis",
  "coding-category-iso-7",
  "coding-category-iso-8-1",
  "coding-category-iso-8-2",
  "coding-category-iso-else",
  "coding-category-big5",
  "coding-category-binary"
};

/* Flag to tell if we look up unification table on character code
   conversion.  */
Lisp_Object Venable_character_unification;
/* Standard unification table to look up on decoding (reading).  */
Lisp_Object Vstandard_character_unification_table_for_decode;
/* Standard unification table to look up on encoding (writing).  */
Lisp_Object Vstandard_character_unification_table_for_encode;

/* NOTE(review): presumably symbols naming the unification-table
   properties of a coding system -- confirm.  */
Lisp_Object Qcharacter_unification_table;
Lisp_Object Qcharacter_unification_table_for_decode;
Lisp_Object Qcharacter_unification_table_for_encode;

/* Alist of charsets vs revision number.  */
Lisp_Object Vcharset_revision_alist;

/* Default coding systems used for process I/O.  */
Lisp_Object Vdefault_process_coding_system;
 333
 334 \f
 335 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 336
 337 /* Emacs' internal format for encoding multiple character sets is a
 338    kind of multi-byte encoding, i.e. characters are encoded by
 339    variable-length sequences of one-byte codes.  ASCII characters
 340    and control characters (e.g. `tab', `newline') are represented by
 341    one-byte sequences which are their ASCII codes, in the range 0x00
 342    through 0x7F.  The other characters are represented by a sequence
 343    of `base leading-code', optional `extended leading-code', and one
 344    or two `position-code's.  The length of the sequence is determined
 345    by the base leading-code.  Leading-code takes the range 0x80
 346    through 0x9F, whereas extended leading-code and position-code take
 347    the range 0xA0 through 0xFF.  See `charset.h' for more details
 348    about leading-code and position-code.
 349
 350    There's one exception to this rule.  Special leading-code
 351    `leading-code-composition' denotes that the following several
 352    characters should be composed into one character.  Leading-codes of
 353    components (except for ASCII) are added 0x20.  An ASCII character
 354    component is represented by a 2-byte sequence of `0xA0' and
 355    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 356    details of composite character.  Hence, we can summarize the code
 357    range as follows:
 358
 359    --- CODE RANGE of Emacs' internal format ---
 360    (character set)      (range)
 361    ASCII                0x00 .. 0x7F
 362    ELSE (1st byte)      0x80 .. 0x9F
 363         (rest bytes)    0xA0 .. 0xFF
 364    ---------------------------------------------
 365
 366   */
 367
 368 enum emacs_code_class_type emacs_code_class[256];
 369
 370 /* Go to the next statement only if *SRC is accessible and the code is
 371    greater than 0xA0.  */
 372 #define CHECK_CODE_RANGE_A0_FF  \
 373   do {                          \
 374     if (src >= src_end)         \
 375       goto label_end_of_switch; \
 376     else if (*src++ < 0xA0)     \
 377       return 0;                 \
 378   } while (0)
 379
 380 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 381    Check if a text is encoded in Emacs' internal format.  If it is,
 382    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 383
 384 int
 385 detect_coding_emacs_mule (src, src_end)
 386      unsigned char *src, *src_end;
 387 {
 388   unsigned char c;
 389   int composing = 0;
 390
 391   while (src < src_end)
 392     {
 393       c = *src++;
 394
 395       if (composing)
 396         {
 397           if (c < 0xA0)
 398             composing = 0;
 399           else
 400             c -= 0x20;
 401         }
 402
 403       switch (emacs_code_class[c])
 404         {
 405         case EMACS_ascii_code:
 406         case EMACS_linefeed_code:
 407           break;
 408
 409         case EMACS_control_code:
 410           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 411             return 0;
 412           break;
 413
 414         case EMACS_invalid_code:
 415           return 0;
 416
 417         case EMACS_leading_code_composition: /* c == 0x80 */
 418           if (composing)
 419             CHECK_CODE_RANGE_A0_FF;
 420           else
 421             composing = 1;
 422           break;
 423
 424         case EMACS_leading_code_4:
 425           CHECK_CODE_RANGE_A0_FF;
 426           /* fall down to check it two more times ...  */
 427
 428         case EMACS_leading_code_3:
 429           CHECK_CODE_RANGE_A0_FF;
 430           /* fall down to check it one more time ...  */
 431
 432         case EMACS_leading_code_2:
 433           CHECK_CODE_RANGE_A0_FF;
 434           break;
 435
 436         default:
 437         label_end_of_switch:
 438           break;
 439         }
 440     }
 441   return CODING_CATEGORY_MASK_EMACS_MULE;
 442 }
 443
 444 \f
 445 /*** 3. ISO2022 handlers ***/
 446
 447 /* The following note describes the coding system ISO2022 briefly.
 448    Since the intention of this note is to help in understanding of
 449    the programs in this file, some parts are NOT ACCURATE or OVERLY
 450    SIMPLIFIED.  For the thorough understanding, please refer to the
 451    original document of ISO2022.
 452
 453    ISO2022 provides many mechanisms to encode several character sets
 454    in 7-bit and 8-bit environments.  If one chooses a 7-bit environment,
 455    all text is encoded by codes of less than 128.  This may make the
 456    encoded text a little bit longer, but the text gets more stability
 457    to pass through several gateways (some of them strip off the MSB).
 458
 459    There are two kinds of character set: control character set and
 460    graphic character set.  The former contains control characters such
 461    as `newline' and `escape' to provide control functions (control
 462    functions are provided also by escape sequences).  The latter
 463    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 464    two control character sets and many graphic character sets.
 465
 466    Graphic character sets are classified into one of the following
 467    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 468    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 469    bytes (DIMENSION) and the number of characters in one dimension
 470    (CHARS) of the set.  In addition, each character set is assigned an
 471    identification tag (called "final character" and denoted as <F>
 472    here after) which is unique in each class.  <F> of each character
 473    set is decided by ECMA(*) when it is registered in ISO.  Code range
 474    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 475
 476    Note (*): ECMA = European Computer Manufacturers Association
 477
 478    Here are examples of graphic character set [NAME(<F>)]:
 479         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 480         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 481         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 482         o DIMENSION2_CHARS96 -- none for the moment
 483
 484    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 485         C0 [0x00..0x1F] -- control character plane 0
 486         GL [0x20..0x7F] -- graphic character plane 0
 487         C1 [0x80..0x9F] -- control character plane 1
 488         GR [0xA0..0xFF] -- graphic character plane 1
 489
 490    A control character set is directly designated and invoked to C0 or
 491    C1 by an escape sequence.  The most common case is that ISO646's
 492    control character set is designated/invoked to C0 and ISO6429's
 493    control character set is designated/invoked to C1, and usually
 494    these designations/invocations are omitted in a coded text.  With
 495    7-bit environment, only C0 can be used, and a control character for
 496    C1 is encoded by an appropriate escape sequence to fit in the
 497    environment.  For every C1 control character, a corresponding
 498    escape sequence is defined.
 499
 500    A graphic character set is at first designated to one of four
 501    graphic registers (G0 through G3), then these graphic registers are
 502    invoked to GL or GR.  These designations and invocations can be
 503    done independently.  The most common case is that G0 is invoked to
 504    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 505    these invocations and designations are omitted in a coded text.
 506    With 7-bit environment, only GL can be used.
 507
 508    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 509    and 0x7F of GL area work as control characters SPACE and DEL
 510    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 511
 512    There are two ways of invocation: locking-shift and single-shift.
 513    With locking-shift, the invocation lasts until the next different
 514    invocation, whereas with single-shift, the invocation works only
 515    for the following character and doesn't affect locking-shift.
 516    Invocations are done by the following control characters or escape
 517    sequences.
 518
 519    ----------------------------------------------------------------------
 520    function             control char    escape sequence description
 521    ----------------------------------------------------------------------
 522    SI  (shift-in)               0x0F    none            invoke G0 to GL
 523    SO  (shift-out)              0x0E    none            invoke G1 to GL
 524    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 525    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 526    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 527    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 528    ----------------------------------------------------------------------
 529    The first four are for locking-shift.  Control characters for these
 530    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 531
 532    Designations are done by the following escape sequences.
 533    ----------------------------------------------------------------------
 534    escape sequence      description
 535    ----------------------------------------------------------------------
 536    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 537    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 538    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 539    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 540    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 541    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 542    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 543    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 544    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 545    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 546    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 547    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 548    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 549    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 550    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 551    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 552    ----------------------------------------------------------------------
 553
 554    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 555    of dimension 1, chars 94, and final character <F>, and so on.
 556
 557    Note (*): Although these designations are not allowed in ISO2022,
 558    Emacs accepts them on decoding, and produces them on encoding
 559    CHARS96 character set in a coding system which is characterized as
 560    7-bit environment, non-locking-shift, and non-single-shift.
 561
 562    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 563    '(' can be omitted.  We call this as "short-form" here after.
 564
 565    Now you may notice that there are a lot of ways for encoding the
 566    same multilingual text in ISO2022.  Actually, there exist many
 567    coding systems such as Compound Text (used in X's inter-client
 568    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 569    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 570    localized platforms), and all of these are variants of ISO2022.
 571
 572    In addition to the above, Emacs handles two more kinds of escape
 573    sequences: ISO6429's direction specification and Emacs' private
 574    sequence for specifying character composition.
 575
 576    ISO6429's direction specification takes the following format:
 577         o CSI ']'      -- end of the current direction
 578         o CSI '0' ']'  -- end of the current direction
 579         o CSI '1' ']'  -- start of left-to-right text
 580         o CSI '2' ']'  -- start of right-to-left text
 581    The control character CSI (0x9B: control sequence introducer) is
 582    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 583
 584    Character composition specification takes the following format:
 585         o ESC '0' -- start character composition
 586         o ESC '1' -- end character composition
 587    Since these are not standard escape sequences of any ISO, the use
 588    of them for these meaning is restricted to Emacs only.  */
 589
 590 enum iso_code_class_type iso_code_class[256];
 591
 592 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 593    Check if a text is encoded in ISO2022.  If it is, returns an
 594    integer in which appropriate flag bits any of:
 595         CODING_CATEGORY_MASK_ISO_7
 596         CODING_CATEGORY_MASK_ISO_8_1
 597         CODING_CATEGORY_MASK_ISO_8_2
 598         CODING_CATEGORY_MASK_ISO_ELSE
 599    are set.  If a code which should never appear in ISO2022 is found,
 600    returns 0.  */
 601
 602 int
 603 detect_coding_iso2022 (src, src_end)
 604      unsigned char *src, *src_end;
 605 {
 606   int mask = (CODING_CATEGORY_MASK_ISO_7
 607               | CODING_CATEGORY_MASK_ISO_8_1
 608               | CODING_CATEGORY_MASK_ISO_8_2
 609               | CODING_CATEGORY_MASK_ISO_ELSE);
 610   int g1 = 0;                   /* 1 iff designating to G1.  */
 611   int c, i;
 612
 613   while (src < src_end)
 614     {
 615       c = *src++;
 616       switch (c)
 617         {
 618         case ISO_CODE_ESC:
 619           if (src >= src_end)
 620             break;
 621           c = *src++;
 622           if (src < src_end
 623               && ((c >= '(' && c <= '/')
 624                   || c == '$' && ((*src >= '(' && *src <= '/')
 625                                   || (*src >= '@' && *src <= 'B'))))
 626             {
 627               /* Valid designation sequence.  */
 628               if (c == ')' || (c == '$' && *src == ')'))
 629                 {
 630                   g1 = 1;
 631                   mask &= ~CODING_CATEGORY_MASK_ISO_7;
 632                 }
 633               src++;
 634               break;
 635             }
 636           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 637             return CODING_CATEGORY_MASK_ISO_ELSE;
 638           break;
 639
 640         case ISO_CODE_SO:
 641           if (g1)
 642             return CODING_CATEGORY_MASK_ISO_ELSE;
 643           break;
 644
 645         case ISO_CODE_CSI:
 646         case ISO_CODE_SS2:
 647         case ISO_CODE_SS3:
 648           mask &= ~CODING_CATEGORY_MASK_ISO_7;
 649           break;
 650
 651         default:
 652           if (c < 0x80)
 653             break;
 654           else if (c < 0xA0)
 655             return 0;
 656           else
 657             {
 658               int count = 1;
 659
 660               mask &= ~CODING_CATEGORY_MASK_ISO_7;
 661               while (src < src_end && *src >= 0xA0)
 662                 count++, src++;
 663               if (count & 1 && src < src_end)
 664                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 665             }
 666           break;
 667         }
 668     }
 669
 670   return mask;
 671 }
 672
/* Decode a character of which charset is CHARSET and the 1st position
   code is C1.  If dimension of CHARSET is 2, the 2nd position code is
   fetched from SRC and set to C2.  If CHARSET is negative, it means
   that we are decoding ill formed text, and what we can do is just to
   read C1 as is.

   The expansion site must provide the variables `coding', `dst',
   `src', `src_end', `c2' and `unification_table'.  ONE_MORE_BYTE may
   jump to `label_end_of_loop' there when the 2nd position code is not
   available; note that the composition leading-code, if any, has
   already been written to `dst' at that point.  When
   `unification_table' is non-nil and unify_char returns a
   non-negative character, that character's charset and position
   codes replace CHARSET, C1 and C2 before the multi-byte form is
   written.  */

#define DECODE_ISO_CHARACTER(charset, c1)                               \
  do {                                                                  \
    int c_alt, charset_alt = (charset);                                 \
    if (COMPOSING_HEAD_P (coding->composing))                           \
      {                                                                 \
        *dst++ = LEADING_CODE_COMPOSITION;                              \
        if (COMPOSING_WITH_RULE_P (coding->composing))                  \
          /* To tell composition rules are embedded.  */                \
          *dst++ = 0xFF;                                                \
        coding->composing += 2;                                         \
      }                                                                 \
    if ((charset) >= 0)                                                 \
      {                                                                 \
        if (CHARSET_DIMENSION (charset) == 2)                           \
          ONE_MORE_BYTE (c2);                                           \
        if (!NILP (unification_table)                                   \
            && ((c_alt = unify_char (unification_table,                 \
                                     -1, (charset), c1, c2)) >= 0))     \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
    if (COMPOSING_WITH_RULE_P (coding->composing))                      \
      /* To tell a composition rule follows.  */                        \
      coding->composing = COMPOSING_WITH_RULE_RULE;                     \
  } while (0)
 709
 710 /* Set designation state into CODING.

        Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
        through ISO_CHARSET_TABLE.  When the lookup yields a non-negative
        charset, record it as the charset designated to graphic register
        REG of CODING; while `coding->direction' is 1 and the charset has
        a non-negative CHARSET_REVERSE_CHARSET, that reverse charset is
        recorded instead.  A negative lookup result leaves the
        designation untouched.

        The expansion refers to a variable named `coding' and declares a
        block-local `charset' that hides any outer variable of that
        name.  */
 711 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 712   do {                                                                  \
 713     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 714                                      make_number (chars),               \
 715                                      make_number (final_char));         \
 716     if (charset >= 0)                                                   \
 717       {                                                                 \
 718         if (coding->direction == 1                                      \
 719             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 720           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 721         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 722       }                                                                 \
 723   } while (0)
 724
 725 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SRC_BYTES bytes of ISO2022 text at SOURCE into
        DESTINATION, whose end is taken to be DESTINATION + DST_BYTES.
        The number of source bytes processed is stored in *CONSUMED and
        the number of bytes written to DESTINATION is returned.

        Designation, invocation, direction and composition state lives in
        CODING and is updated as the text is scanned.  When an escape
        sequence or multi-byte code is cut off by the end of SOURCE, the
        partial bytes are copied to `coding->carryover' and are not
        counted as consumed, unless `coding->last_block' is set, in
        which case whatever remains is copied to DESTINATION verbatim.  */
 726
 727 int
 728 decode_coding_iso2022 (coding, source, destination,
 729                        src_bytes, dst_bytes, consumed)
 730      struct coding_system *coding;
 731      unsigned char *source, *destination;
 732      int src_bytes, dst_bytes;
 733      int *consumed;
 734 {
 735   unsigned char *src = source;
 736   unsigned char *src_end = source + src_bytes;
 737   unsigned char *dst = destination;
 738   unsigned char *dst_end = destination + dst_bytes;
 739   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 740      from DST_END to assure that overflow checking is necessary only
 741      at the head of loop.  */
 742   unsigned char *adjusted_dst_end = dst_end - 6;
 743   int charset;
 744   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 745   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 746   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 747   Lisp_Object unification_table
 748       = coding->character_unification_table_for_decode;
 749
 750   if (!NILP (Venable_character_unification) && NILP (unification_table))
 751     unification_table = Vstandard_character_unification_table_for_decode;
 752
 753   while (src < src_end && dst < adjusted_dst_end)
 754     {
 755       /* SRC_BASE remembers the start position in source in each loop.
 756          The loop will be exited when there's not enough source text
 757          to analyze long escape sequence or 2-byte code (within macros
 758          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 759          to SRC_BASE before exiting.  */
 760       unsigned char *src_base = src;
 761       int c1 = *src++, c2;
 762
 763       switch (iso_code_class [c1])
 764         {
 765         case ISO_0x20_or_0x7F:
 766           if (!coding->composing
 767               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 768             {
 769               /* This is SPACE or DEL.  */
 770               *dst++ = c1;
 771               break;
 772             }
 773           /* This is a graphic character, we fall down ...  */
 774
 775         case ISO_graphic_plane_0:
 776           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 777             {
 778               /* This is a composition rule.  */
 779               *dst++ = c1 | 0x80;
 780               coding->composing = COMPOSING_WITH_RULE_TAIL;
 781             }
 782           else
 783             DECODE_ISO_CHARACTER (charset0, c1);
 784           break;
 785
 786         case ISO_0xA0_or_0xFF:
 787           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 788             {
 789               /* Invalid code.  */
 790               *dst++ = c1;
 791               break;
 792             }
 793           /* This is a graphic character, we fall down ... */
 794
 795         case ISO_graphic_plane_1:
 796           DECODE_ISO_CHARACTER (charset1, c1);
 797           break;
 798
 799         case ISO_control_code:
 800           /* All ISO2022 control characters in this class have the
 801              same representation in Emacs internal format.  */
 802           *dst++ = c1;
 803           break;
 804
 805         case ISO_carriage_return:
 806           if (coding->eol_type == CODING_EOL_CR)
 807             {
 808               *dst++ = '\n';
 809             }
 810           else if (coding->eol_type == CODING_EOL_CRLF)
 811             {
 812               ONE_MORE_BYTE (c1);
 813               if (c1 == ISO_CODE_LF)
 814                 *dst++ = '\n';
 815               else
 816                 {
                       /* A lone CR.  C1 now holds the byte that followed
                          it, so push that byte back to be decoded by the
                          next iteration and emit the CR itself.  */
 817                   src--;
 818                   *dst++ = '\r';
 819                 }
 820             }
 821           else
 822             {
 823               *dst++ = c1;
 824             }
 825           break;
 826
 827         case ISO_shift_out:
 828           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 829             goto label_invalid_escape_sequence;
 830           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 831           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 832           break;
 833
 834         case ISO_shift_in:
 835           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 836           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 837           break;
 838
 839         case ISO_single_shift_2_7:
 840         case ISO_single_shift_2:
 841           /* SS2 is handled as an escape sequence of ESC 'N' */
 842           c1 = 'N';
 843           goto label_escape_sequence;
 844
 845         case ISO_single_shift_3:
 846           /* SS3 is handled as an escape sequence of ESC 'O' */
 847           c1 = 'O';
 848           goto label_escape_sequence;
 849
 850         case ISO_control_sequence_introducer:
 851           /* CSI is handled as an escape sequence of ESC '[' ...  */
 852           c1 = '[';
 853           goto label_escape_sequence;
 854
 855         case ISO_escape:
 856           ONE_MORE_BYTE (c1);
 857         label_escape_sequence:
 858           /* Escape sequences handled by Emacs are invocation,
 859              designation, direction specification, and character
 860              composition specification.  */
 861           switch (c1)
 862             {
 863             case '&':           /* revision of following character set */
 864               ONE_MORE_BYTE (c1);
 865               if (!(c1 >= '@' && c1 <= '~'))
 866                 goto label_invalid_escape_sequence;
 867               ONE_MORE_BYTE (c1);
 868               if (c1 != ISO_CODE_ESC)
 869                 goto label_invalid_escape_sequence;
 870               ONE_MORE_BYTE (c1);
 871               goto label_escape_sequence;
 872
 873             case '$':           /* designation of 2-byte character set */
 874               ONE_MORE_BYTE (c1);
 875               if (c1 >= '@' && c1 <= 'B')
 876                 {       /* designation of JISX0208.1978, GB2312.1980,
 877                                    or JISX0208.1980 */
 878                   DECODE_DESIGNATION (0, 2, 94, c1);
 879                 }
 880               else if (c1 >= 0x28 && c1 <= 0x2B)
 881                 {       /* designation of DIMENSION2_CHARS94 character set */
 882                   ONE_MORE_BYTE (c2);
 883                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 884                 }
 885               else if (c1 >= 0x2C && c1 <= 0x2F)
 886                 {       /* designation of DIMENSION2_CHARS96 character set */
 887                   ONE_MORE_BYTE (c2);
 888                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 889                 }
 890               else
 891                 goto label_invalid_escape_sequence;
 892               break;
 893
 894             case 'n':           /* invocation of locking-shift-2 */
 895               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 896                 goto label_invalid_escape_sequence;
 897               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 898               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 899               break;
 900
 901             case 'o':           /* invocation of locking-shift-3 */
 902               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 903                 goto label_invalid_escape_sequence;
 904               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 905               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 906               break;
 907
 908             case 'N':           /* invocation of single-shift-2 */
 909               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 910                 goto label_invalid_escape_sequence;
 911               ONE_MORE_BYTE (c1);
 912               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 913               DECODE_ISO_CHARACTER (charset, c1);
 914               break;
 915
 916             case 'O':           /* invocation of single-shift-3 */
 917               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 918                 goto label_invalid_escape_sequence;
 919               ONE_MORE_BYTE (c1);
 920               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 921               DECODE_ISO_CHARACTER (charset, c1);
 922               break;
 923
 924             case '0':           /* start composing without embeded rules */
 925               coding->composing = COMPOSING_NO_RULE_HEAD;
 926               break;
 927
 928             case '1':           /* end composing */
 929               coding->composing = COMPOSING_NO;
 930               break;
 931
 932             case '2':           /* start composing with embeded rules */
 933               coding->composing = COMPOSING_WITH_RULE_HEAD;
 934               break;
 935
 936             case '[':           /* specification of direction */
 937               /* For the moment, nested direction is not supported.
 938                  So, the value of `coding->direction' is 0 or 1: 0
 939                  means left-to-right, 1 means right-to-left.  */
 940               ONE_MORE_BYTE (c1);
 941               switch (c1)
 942                 {
 943                 case ']':       /* end of the current direction */
 944                   coding->direction = 0;
                       /* The sequence is complete here.  Falling into
                          the parameterized forms below would consume one
                          more byte and demand a second ']'.  */
                       break;
 945
 946                 case '0':       /* end of the current direction */
 947                 case '1':       /* start of left-to-right direction */
 948                   ONE_MORE_BYTE (c1);
 949                   if (c1 == ']')
 950                     coding->direction = 0;
 951                   else
 952                     goto label_invalid_escape_sequence;
 953                   break;
 954
 955                 case '2':       /* start of right-to-left direction */
 956                   ONE_MORE_BYTE (c1);
 957                   if (c1 == ']')
 958                     coding->direction= 1;
 959                   else
 960                     goto label_invalid_escape_sequence;
 961                   break;
 962
 963                 default:
 964                   goto label_invalid_escape_sequence;
 965                 }
 966               break;
 967
 968             default:
 969               if (c1 >= 0x28 && c1 <= 0x2B)
 970                 {       /* designation of DIMENSION1_CHARS94 character set */
 971                   ONE_MORE_BYTE (c2);
 972                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 973                 }
 974               else if (c1 >= 0x2C && c1 <= 0x2F)
 975                 {       /* designation of DIMENSION1_CHARS96 character set */
 976                   ONE_MORE_BYTE (c2);
 977                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 978                 }
 979               else
 980                 {
 981                   goto label_invalid_escape_sequence;
 982                 }
 983             }
 984           /* We must update these variables now.  */
 985           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 986           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 987           break;
 988
 989         label_invalid_escape_sequence:
               /* Pass the bytes read so far in this iteration through
                  unchanged.  */
 990           {
 991             int length = src - src_base;
 992
 993             bcopy (src_base, dst, length);
 994             dst += length;
 995           }
 996         }
 997       continue;
 998
 999     label_end_of_loop:
           /* Not enough source to finish the current sequence: keep the
              partial bytes in the carryover area and report them as
              unconsumed.  */
1000       coding->carryover_size = src - src_base;
1001       bcopy (src_base, coding->carryover, coding->carryover_size);
1002       src = src_base;
1003       break;
1004     }
1005
1006   /* If this is the last block of the text to be decoded, we had
1007      better just flush out all remaining codes in the text although
1008      they are not valid characters.  */
       /* NOTE(review): this copy is not checked against DST_END; it
          presumably relies on the caller sizing DESTINATION generously
          -- confirm against the callers.  */
1009   if (coding->last_block)
1010     {
1011       bcopy (src, dst, src_end - src);
1012       dst += (src_end - src);
1013       src = src_end;
1014     }
1015   *consumed = src - source;
1016   return dst - destination;
1017 }
1018
1019 /* ISO2022 encoding stuff.  */
1020
1021 /*
1022    It is not enough to say just "ISO2022" on encoding, we have to
1023    specify more details.  In Emacs, each coding-system of ISO2022
1024    variant has the following specifications:
1025         1. Initial designation to G0 thru G3.
1026         2. Allows short-form designation?
1027         3. ASCII should be designated to G0 before control characters?
1028         4. ASCII should be designated to G0 at end of line?
1029         5. 7-bit environment or 8-bit environment?
1030         6. Use locking-shift?
1031         7. Use Single-shift?
1032    And the following two are only for Japanese:
1033         8. Use ASCII in place of JIS0201-1976-Roman?
1034         9. Use JISX0208-1983 in place of JISX0208-1978?
1035    These specifications are encoded in `coding->flags' as flag bits
1036    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1037    details.
1038 */
1039
1040 /* Produce codes (escape sequence) for designating CHARSET to graphic
1041    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1042    the coding system CODING allows, produce designation sequence of
1043    short-form.

        The sequence is written through a variable named `dst', which is
        advanced past it.  When CHARSET has an entry in
        `Vcharset_revision_alist', the designation is preceded by
        ESC '&' <revision>, where <revision> is the cdr of that entry
        plus '@'.  The short form (no intermediate character after
        ESC '$') is produced only for a 2-dimensional 94-character set
        going to register 0 while CODING_FLAG_ISO_SHORT_FORM is set in
        `coding->flags'.  Finally CHARSET is recorded as the current
        designation of REG in CODING.

        REG is used as an array index and CHARSET is expanded several
        times, so both should be side-effect-free expressions.  */
1044
1045 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1046   do {                                                                  \
1047     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1048     char *intermediate_char_94 = "()*+";                                \
1049     char *intermediate_char_96 = ",-./";                                \
1050     Lisp_Object temp                                                    \
1051       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1052     if (! NILP (temp))                                                  \
1053         {                                                               \
1054         *dst++ = ISO_CODE_ESC;                                          \
1055         *dst++ = '&';                                                   \
1056         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1057       }                                                                 \
1058     *dst++ = ISO_CODE_ESC;                                              \
1059     if (CHARSET_DIMENSION (charset) == 1)                               \
1060       {                                                                 \
1061         if (CHARSET_CHARS (charset) == 94)                              \
1062           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1063         else                                                            \
1064           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1065       }                                                                 \
1066     else                                                                \
1067       {                                                                 \
1068         *dst++ = '$';                                                   \
1069         if (CHARSET_CHARS (charset) == 94)                              \
1070           {                                                             \
1071             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1072                 || reg != 0                                             \
1073                 || final_char < '@' || final_char > 'B')                \
1074               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1075           }                                                             \
1076         else                                                            \
1077           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1078       }                                                                 \
1079     *dst++ = final_char;                                                \
1080     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1081   } while (0)
1082
1083 /* The following two macros produce codes (control character or escape
1084    sequence) for ISO2022 single-shift functions (single-shift-2 and
1085    single-shift-3).

        With CODING_FLAG_ISO_SEVEN_BITS set in `coding->flags' the
        two-byte escape form (ESC 'N' or ESC 'O') is written; otherwise
        the single control code ISO_CODE_SS2 or ISO_CODE_SS3 is written.
        Both macros write through a variable named `dst' and then set
        the single-shifting flag of CODING to 1.  */
1086
1087 #define ENCODE_SINGLE_SHIFT_2                           \
1088   do {                                                  \
1089     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1090       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1091     else                                                \
1092       *dst++ = ISO_CODE_SS2;                            \
1093     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1094   } while (0)
1095
1096 #define ENCODE_SINGLE_SHIFT_3                           \
1097   do {                                                  \
1098     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1099       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1100     else                                                \
1101       *dst++ = ISO_CODE_SS3;                            \
1102     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1103   } while (0)
1104
1105 /* The following four macros produce codes (control character or
1106    escape sequence) for ISO2022 locking-shift functions (shift-in,
1107    shift-out, locking-shift-2, and locking-shift-3).

        Each one writes its code through a variable named `dst' and then
        records in CODING the number of the graphic register (0, 1, 2 or
        3 respectively) now invoked to graphic plane 0.  Shift-in and
        shift-out are the single bytes ISO_CODE_SI and ISO_CODE_SO;
        locking-shift-2 and locking-shift-3 are ESC 'n' and ESC 'o'.  */
1108
1109 #define ENCODE_SHIFT_IN                         \
1110   do {                                          \
1111     *dst++ = ISO_CODE_SI;                       \
1112     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1113   } while (0)
1114
1115 #define ENCODE_SHIFT_OUT                        \
1116   do {                                          \
1117     *dst++ = ISO_CODE_SO;                       \
1118     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1119   } while (0)
1120
1121 #define ENCODE_LOCKING_SHIFT_2                  \
1122   do {                                          \
1123     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1124     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1125   } while (0)
1126
1127 #define ENCODE_LOCKING_SHIFT_3                  \
1128   do {                                          \
1129     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1130     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1131   } while (0)
1132
1133 /* Produce codes for a DIMENSION1 character whose character set is
1134    CHARSET and whose position-code is C1.  Designation and invocation
1135    sequences are also produced in advance if necessary.

        The body is a loop that runs until the character has been
        written through `dst':
          - while the single-shifting flag of CODING is set, C1 is
            written with the high bit cleared (7-bit coding systems) or
            set (otherwise) and the flag is cleared;
          - if CHARSET is invoked to graphic plane 0, C1 is written with
            the high bit cleared;
          - if CHARSET is invoked to graphic plane 1, C1 is written with
            the high bit set;
          - otherwise `encode_invocation_designation' is called to emit
            the needed designation/invocation and the tests are retried.

        CHARSET and C1 are expanded several times without surrounding
        parentheses, so they should be simple variables.  */
1136
1137
1138 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1139   do {                                                                  \
1140     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1141       {                                                                 \
1142         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1143           *dst++ = c1 & 0x7F;                                           \
1144         else                                                            \
1145           *dst++ = c1 | 0x80;                                           \
1146         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1147         break;                                                          \
1148       }                                                                 \
1149     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1150       {                                                                 \
1151         *dst++ = c1 & 0x7F;                                             \
1152         break;                                                          \
1153       }                                                                 \
1154     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1155       {                                                                 \
1156         *dst++ = c1 | 0x80;                                             \
1157         break;                                                          \
1158       }                                                                 \
1159     else                                                                \
1160       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1161          must invoke it, or, at first, designate it to some graphic     \
1162          register.  Then repeat the loop to actually produce the        \
1163          character.  */                                                 \
1164       dst = encode_invocation_designation (charset, coding, dst);       \
1165   } while (1)
1166
1167 /* Produce codes for a DIMENSION2 character whose character set is
1168    CHARSET and whose position-codes are C1 and C2.  Designation and
1169    invocation codes are also produced in advance if necessary.

        The body is a loop that runs until both bytes have been written
        through `dst'.  While the single-shifting flag of CODING is set,
        the bytes are written with the high bit cleared (7-bit coding
        systems) or set (otherwise) and the flag is cleared.  Otherwise
        the bytes are written with the high bit cleared when CHARSET is
        invoked to graphic plane 0, or set when it is invoked to graphic
        plane 1.  If CHARSET is on neither plane,
        `encode_invocation_designation' is called and the tests are
        retried.

        CHARSET, C1 and C2 are expanded several times without surrounding
        parentheses, so they should be simple variables.  */
1170
1171 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1172   do {                                                                  \
1173     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1174       {                                                                 \
1175         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1176           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1177         else                                                            \
1178           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1179         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1180         break;                                                          \
1181       }                                                                 \
1182     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1183       {                                                                 \
1184         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1185         break;                                                          \
1186       }                                                                 \
1187     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1188       {                                                                 \
1189         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1190         break;                                                          \
1191       }                                                                 \
1192     else                                                                \
1193       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1194          must invoke it, or, at first, designate it to some graphic     \
1195          register.  Then repeat the loop to actually produce the        \
1196          character.  */                                                 \
1197       dst = encode_invocation_designation (charset, coding, dst);       \
1198   } while (1)
1199
     /* Produce codes for the character of CHARSET whose position-codes
        are C1 and C2.

        When the variable `unification_table' in scope at the expansion
        site is non-nil and `unify_char' returns a non-negative character
        for (CHARSET, C1, C2), that character is split with SPLIT_CHAR
        and encoded in place of the original; SPLIT_CHAR stores into C1
        and C2, so both must be assignable variables.  The resulting
        charset is then dispatched on its dimension to
        ENCODE_ISO_CHARACTER_DIMENSION1 or
        ENCODE_ISO_CHARACTER_DIMENSION2.  */
1200 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1201   do {                                                                    \
1202     int c_alt, charset_alt;                                               \
1203     if (!NILP (unification_table)                                         \
1204         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1205             >= 0))                                                        \
1206       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1207     else                                                                  \
1208       charset_alt = charset;                                              \
1209     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1210       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1211     else                                                                  \
1212       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1213   } while (0)
1214
1215 /* Produce designation and invocation codes at a place pointed by DST
1216    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1217    Return new DST.

        If CHARSET is not designated to any of the four graphic
        registers, it is first designated to the register it requests, or
        to register 0 when it requests none.  Then, unless that register
        is already invoked to graphic plane 0 or 1, it is invoked: by
        shift-in or shift-out for registers 0 and 1, and for registers 2
        and 3 by a single-shift when CODING_FLAG_ISO_SINGLE_SHIFT is set
        in `coding->flags' or by a locking-shift otherwise.  */
1218
1219 unsigned char *
1220 encode_invocation_designation (charset, coding, dst)
1221      int charset;
1222      struct coding_system *coding;
1223      unsigned char *dst;
1224 {
1225   int reg;                      /* graphic register number */
1226
1227   /* At first, check designations.  */
1228   for (reg = 0; reg < 4; reg++)
1229     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1230       break;
1231
       /* REG is 4 here exactly when the search above found nothing.  */
1232   if (reg >= 4)
1233     {
1234       /* CHARSET is not yet designated to any graphic registers.  */
1235       /* At first check the requested designation.  */
1236       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1237       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1238         /* Since CHARSET requests no special designation, designate it
1239            to graphic register 0.  */
1240         reg = 0;
1241
1242       ENCODE_DESIGNATION (charset, reg, coding);
1243     }
1244
1245   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1246       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1247     {
1248       /* Since the graphic register REG is not invoked to any graphic
1249          planes, invoke it to graphic plane 0.  */
1250       switch (reg)
1251         {
1252         case 0:                 /* graphic register 0 */
1253           ENCODE_SHIFT_IN;
1254           break;
1255
1256         case 1:                 /* graphic register 1 */
1257           ENCODE_SHIFT_OUT;
1258           break;
1259
1260         case 2:                 /* graphic register 2 */
1261           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1262             ENCODE_SINGLE_SHIFT_2;
1263           else
1264             ENCODE_LOCKING_SHIFT_2;
1265           break;
1266
1267         case 3:                 /* graphic register 3 */
1268           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1269             ENCODE_SINGLE_SHIFT_3;
1270           else
1271             ENCODE_LOCKING_SHIFT_3;
1272           break;
1273         }
1274     }
1275   return dst;
1276 }
1277
1278 /* The following three macros produce codes for indicating composition:
        ESC '0' to start a composition without embedded rules, ESC '2'
        to start a composition with embedded rules, and ESC '1' to end a
        composition.  Each expands to a comma expression that writes two
        bytes through a variable named `dst'.  */
1279 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1280 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1281 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1282
1283 /* The following three macros produce codes for indicating direction
1284    of text.  Each expands to a complete statement that writes through
        a variable named `dst'.

        ENCODE_CONTROL_SEQUENCE_INTRODUCER writes ESC '[' when
        CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags' and the
        single byte ISO_CODE_CSI otherwise.  The flag is tested with `&',
        as everywhere else in this file, because `coding->flags' carries
        other flag bits as well.

        ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R follow the
        introducer with '2' ']' and '0' ']' respectively.  They are
        wrapped in their own `do ... while (0)' because the introducer is
        a statement and cannot be the left operand of a comma.  */
1285 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1286   do {                                                  \
1287     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1288       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1289     else                                                \
1290       *dst++ = ISO_CODE_CSI;                            \
1291   } while (0)
1292
1293 #define ENCODE_DIRECTION_R2L                            \
1294   do {                                                  \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
         *dst++ = '2', *dst++ = ']';                         \
       } while (0)
1295
1296 #define ENCODE_DIRECTION_L2R                            \
1297   do {                                                  \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
         *dst++ = '0', *dst++ = ']';                         \
       } while (0)
1298
1299 /* Produce codes for designation and invocation to reset the graphic
1300    planes and registers to initial state.

        A shift-in is written when graphic plane 0 is not currently
        invoking register 0.  Then, for every register whose initial
        designation is non-negative and differs from the current one, a
        designation sequence for the initial charset is written.  The
        expansion writes through `dst', refers to `coding', and declares
        a block-local `reg'.  */
1301 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1302   do {                                                                      \
1303     int reg;                                                                \
1304     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1305       ENCODE_SHIFT_IN;                                                      \
1306     for (reg = 0; reg < 4; reg++)                                           \
1307       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1308           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1309               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1310         ENCODE_DESIGNATION                                                  \
1311           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1312   } while (0)
1313
1314 /* Produce designation sequences of charsets in the line started from
1315    *SRC to a place pointed by DSTP.
1316
1317    If the current block ends before any end-of-line, we may fail to
1318    find all the necessary *designations.

        The text from SRC up to SRC_END or the first newline is scanned
        one character at a time.  If TABLE is non-nil each character is
        first mapped through `unify_char' and the charset of the mapped
        character is used.  For every graphic register, the first charset
        found that requests that register is remembered; the scan stops
        early once all four registers have a candidate.  Afterwards a
        designation sequence is written for each remembered charset that
        is not already designated to its register, and *DSTP is advanced
        past what was written.  *DSTP is left alone when no charset in
        the line requests a register.

        The function has no declared return type and returns no value.  */
1319 encode_designation_at_bol (coding, table, src, src_end, dstp)
1320      struct coding_system *coding;
1321      Lisp_Object table;
1322      unsigned char *src, *src_end, **dstp;
1323 {
1324   int charset, c, found = 0, reg;
1325   /* Table of charsets to be designated to each graphic register.  */
1326   int r[4];
1327   unsigned char *dst = *dstp;
1328
1329   for (reg = 0; reg < 4; reg++)
1330     r[reg] = -1;
1331
1332   while (src < src_end && *src != '\n' && found < 4)
1333     {
1334       int bytes = BYTES_BY_CHAR_HEAD (*src);
1335
1336       if (NILP (table))
1337         charset = CHARSET_AT (src);
1338       else
1339         {
1340           int c_alt, c1, c2;
1341
1342           SPLIT_STRING(src, bytes, charset, c1, c2);
1343           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1344             charset = CHAR_CHARSET (c_alt);
1345         }
1346
1347       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
           /* Only a charset that requests a real register may claim a
              slot, and only while that slot is still empty (-1).  REG
              must be checked before it is used as an index, because the
              "no request" value is not a register number.  */
1348       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1349         {
1350           found++;
1351           r[reg] = charset;
1352         }
1353
1354       src += bytes;
1355     }
1356
1357   if (found)
1358     {
1359       for (reg = 0; reg < 4; reg++)
1360         if (r[reg] >= 0
1361             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1362           ENCODE_DESIGNATION (r[reg], reg, coding);
1363       *dstp = dst;
1364     }
1365 }
1366
1367 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1368
1369 int
1370 encode_coding_iso2022 (coding, source, destination,
1371                        src_bytes, dst_bytes, consumed)
1372      struct coding_system *coding;
1373      unsigned char *source, *destination;
1374      int src_bytes, dst_bytes;
1375      int *consumed;
1376 {
1377   unsigned char *src = source;
1378   unsigned char *src_end = source + src_bytes;
1379   unsigned char *dst = destination;
1380   unsigned char *dst_end = destination + dst_bytes;
1381   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1382      from DST_END to assure overflow checking is necessary only at the
1383      head of loop.  */
1384   unsigned char *adjusted_dst_end = dst_end - 19;
1385   Lisp_Object unification_table
1386       = coding->character_unification_table_for_encode;
1387
1388   if (!NILP (Venable_character_unification) && NILP (unification_table))
1389     unification_table = Vstandard_character_unification_table_for_encode;
1390
1391   while (src < src_end && dst < adjusted_dst_end)
1392     {
1393       /* SRC_BASE remembers the start position in source in each loop.
1394          The loop will be exited when there's not enough source text
1395          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1396          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1397          reset to SRC_BASE before exiting.  */
1398       unsigned char *src_base = src;
1399       int charset, c1, c2, c3, c4;
1400
1401       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1402           && CODING_SPEC_ISO_BOL (coding))
1403         {
1404           /* We have to produce designation sequences if any now.  */
1405           encode_designation_at_bol (coding, unification_table,
1406                                      src, src_end, &dst);
1407           CODING_SPEC_ISO_BOL (coding) = 0;
1408         }
1409
1410       c1 = *src++;
1411       /* If we are seeing a component of a composite character, we are
1412          seeing a leading-code specially encoded for composition, or a
1413          composition rule if composing with rule.  We must set C1
1414          to a normal leading-code or an ASCII code.  If we are not at
1415          a composed character, we must reset the composition state.  */
1416       if (COMPOSING_P (coding->composing))
1417         {
1418           if (c1 < 0xA0)
1419             {
1420               /* We are not in a composite character any longer.  */
1421               coding->composing = COMPOSING_NO;
1422               ENCODE_COMPOSITION_END;
1423             }
1424           else
1425             {
1426               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1427                 {
1428                   *dst++ = c1 & 0x7F;
1429                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1430                   continue;
1431                 }
1432               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1433                 coding->composing = COMPOSING_WITH_RULE_RULE;
1434               if (c1 == 0xA0)
1435                 {
1436                   /* This is an ASCII component.  */
1437                   ONE_MORE_BYTE (c1);
1438                   c1 &= 0x7F;
1439                 }
1440               else
1441                 /* This is a leading-code of non ASCII component.  */
1442                 c1 -= 0x20;
1443             }
1444         }
1445
1446       /* Now encode one character.  C1 is a control character, an
1447          ASCII character, or a leading-code of multi-byte character.  */
1448       switch (emacs_code_class[c1])
1449         {
1450         case EMACS_ascii_code:
1451           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1452           break;
1453
1454         case EMACS_control_code:
1455           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1456             ENCODE_RESET_PLANE_AND_REGISTER;
1457           *dst++ = c1;
1458           break;
1459
1460         case EMACS_carriage_return_code:
1461           if (!coding->selective)
1462             {
1463               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1464                 ENCODE_RESET_PLANE_AND_REGISTER;
1465               *dst++ = c1;
1466               break;
1467             }
1468           /* fall down to treat '\r' as '\n' ...  */
1469
1470         case EMACS_linefeed_code:
1471           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1472             ENCODE_RESET_PLANE_AND_REGISTER;
1473           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1474             bcopy (coding->spec.iso2022.initial_designation,
1475                    coding->spec.iso2022.current_designation,
1476                    sizeof coding->spec.iso2022.initial_designation);
1477           if (coding->eol_type == CODING_EOL_LF
1478               || coding->eol_type == CODING_EOL_UNDECIDED)
1479             *dst++ = ISO_CODE_LF;
1480           else if (coding->eol_type == CODING_EOL_CRLF)
1481             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1482           else
1483             *dst++ = ISO_CODE_CR;
1484           CODING_SPEC_ISO_BOL (coding) = 1;
1485           break;
1486
1487         case EMACS_leading_code_2:
1488           ONE_MORE_BYTE (c2);
1489           ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1490           break;
1491
1492         case EMACS_leading_code_3:
1493           TWO_MORE_BYTES (c2, c3);
1494           if (c1 < LEADING_CODE_PRIVATE_11)
1495             ENCODE_ISO_CHARACTER (c1, c2, c3);
1496           else
1497             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1498           break;
1499
1500         case EMACS_leading_code_4:
1501           THREE_MORE_BYTES (c2, c3, c4);
1502           ENCODE_ISO_CHARACTER (c2, c3, c4);
1503           break;
1504
1505         case EMACS_leading_code_composition:
1506           ONE_MORE_BYTE (c1);
1507           if (c1 == 0xFF)
1508             {
1509               coding->composing = COMPOSING_WITH_RULE_HEAD;
1510               ENCODE_COMPOSITION_WITH_RULE_START;
1511             }
1512           else
1513             {
1514               /* Rewind one byte because it is a character code of
1515                  composition elements.  */
1516               src--;
1517               coding->composing = COMPOSING_NO_RULE_HEAD;
1518               ENCODE_COMPOSITION_NO_RULE_START;
1519             }
1520           break;
1521
1522         case EMACS_invalid_code:
1523           *dst++ = c1;
1524           break;
1525         }
1526       continue;
1527     label_end_of_loop:
1528       coding->carryover_size = src - src_base;
1529       bcopy (src_base, coding->carryover, coding->carryover_size);
1530       break;
1531     }
1532
1533   /* If this is the last block of the text to be encoded, we must
1534      reset graphic planes and registers to the initial state.  */
1535   if (src >= src_end && coding->last_block)
1536     {
1537       ENCODE_RESET_PLANE_AND_REGISTER;
1538       if (coding->carryover_size > 0
1539           && coding->carryover_size < (dst_end - dst))
1540         {
1541           bcopy (coding->carryover, dst, coding->carryover_size);
1542           dst += coding->carryover_size;
1543           coding->carryover_size = 0;
1544         }
1545     }
1546   *consumed = src - source;
1547   return dst - destination;
1548 }
1549
1550 \f
1551 /*** 4. SJIS and BIG5 handlers ***/
1552
1553 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1554    quite widely.  So, for the moment, Emacs supports them in the bare
1555    C code.  But, in the future, they may be supported only by CCL.  */
1556
1557 /* SJIS is a coding system encoding three character sets: ASCII, right
1558    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1559    as is.  A character of charset katakana-jisx0201 is encoded by
1560    "position-code + 0x80".  A character of charset japanese-jisx0208
1561    is encoded in 2-byte but two position-codes are divided and shifted
1562    so that it fit in the range below.
1563
1564    --- CODE RANGE of SJIS ---
1565    (character set)      (range)
1566    ASCII                0x00 .. 0x7F
1567    KATAKANA-JISX0201    0xA0 .. 0xDF
1568    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1569             (2nd byte)  0x40 .. 0xFF
1570    -------------------------------
1571
1572 */
1573
1574 /* BIG5 is a coding system encoding two character sets: ASCII and
1575    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1576    character set and is encoded in two-byte.
1577
1578    --- CODE RANGE of BIG5 ---
1579    (character set)      (range)
1580    ASCII                0x00 .. 0x7F
1581    Big5 (1st byte)      0xA1 .. 0xFE
1582         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1583    --------------------------
1584
1585    Since the number of characters in Big5 is larger than maximum
1586    characters in Emacs' charset (96x96), it can't be handled as one
1587    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1588    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1589    contains frequently used characters and the latter contains less
1590    frequently used characters.  */
1591
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1596
/* Number of Big5 characters sharing one 1st byte: the 0xFF - 0xA1
   (94) codes of the upper 2nd-byte range plus the 0x7F - 0x40 (63)
   codes of the lower one, i.e. 157.  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))
1599
/* Convert the BIG5 byte pair B1, B2 into Emacs' representation.
   CHARSET, C1 and C2 must be lvalues: CHARSET is set to
   `charset_big5_1' when B1 is below 0xC9 and to `charset_big5_2'
   otherwise, and C1 and C2 receive the two position-codes (each
   0x21 plus a value below 0xFF - 0xA1).  B1 and B2 are evaluated
   more than once, so they must not have side effects.  Every
   argument is parenthesized so that an expression such as
   `x | 0x80' can be passed safely.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character within Big5.  */                   \
    unsigned int big5_idx_                                              \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the start of the 2nd set.  */     \
        big5_idx_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                     \
      }                                                                 \
    (c1) = big5_idx_ / (0xFF - 0xA1) + 0x21;                            \
    (c2) = big5_idx_ % (0xFF - 0xA1) + 0x21;                            \
  } while (0)
1614
/* Convert a character of CHARSET (`charset_big5_1' or
   `charset_big5_2') with position-codes C1 and C2 (each counted from
   0x21) into the BIG5 byte pair B1, B2.  B1 and B2 must be lvalues.
   This is the inverse of DECODE_BIG5.  Every argument is
   parenthesized so that an expression such as `c & 0x7F' can be
   passed safely.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    /* Linear index of the character within its charset.  */            \
    unsigned int big5_idx_                                              \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      /* The 2nd set starts at 1st byte 0xC9.  */                       \
      big5_idx_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                       \
    (b1) = big5_idx_ / BIG5_SAME_ROW + 0xA1;                            \
    (b2) = big5_idx_ % BIG5_SAME_ROW;                                   \
    /* The first 0x3F codes of a row go to 0x40.., the rest to 0xA1.. */ \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1624
/* Produce one decoded character of CHARSET with position-codes C1
   and C2.  When the enclosing function's `unification_table' is
   non-nil and unify_char returns a non-negative character for it,
   that character is split by SPLIT_CHAR into a charset and
   position-codes (C1 and C2 are handed to SPLIT_CHAR, so they should
   be lvalues).  The result is then emitted with
   DECODE_CHARACTER_ASCII if its charset is CHARSET_ASCII or
   negative, with DECODE_CHARACTER_DIMENSION1 if CHARSET_DIMENSION is
   1, and with DECODE_CHARACTER_DIMENSION2 otherwise.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int c_alt;                                                          \
    int charset_alt = (charset);                                        \
                                                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        c_alt = unify_char (unification_table, -1, (charset), c1, c2);  \
        if (c_alt >= 0)                                                 \
          SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
      }                                                                 \
    if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
      DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
    else                                                                \
      DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
  } while (0)
1639
/* Encode one character of CHARSET with position-codes C1 and C2 and
   store the result at DST, advancing DST.  The macro uses the
   enclosing function's `dst', `unification_table' and `sjis_p'.  C1
   and C2 must be lvalues; they are modified.

   When `unification_table' is non-nil and unify_char returns a
   non-negative character, that character replaces the original one.
   Then:
     - an ASCII character is stored as is (1 byte);
     - a one-dimensional character is stored as C1 alone if we are
       encoding SJIS and the charset is katakana-jisx0201, otherwise
       as charset + C1 (2 bytes);
     - a two-dimensional character is stored as two SJIS bytes
       (jisx0208 when encoding SJIS) or two BIG5 bytes (big5-1 or
       big5-2 when encoding BIG5); any other charset is passed
       through as charset + C1 + C2 (3 bytes).

   The expansion deliberately does not end in a semicolon, so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and
   may be followed by `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = (charset);                                            \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1681
1682 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1683    Check if a text is encoded in SJIS.  If it is, return
1684    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1685
1686 int
1687 detect_coding_sjis (src, src_end)
1688      unsigned char *src, *src_end;
1689 {
1690   unsigned char c;
1691
1692   while (src < src_end)
1693     {
1694       c = *src++;
1695       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1696         return 0;
1697       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1698         {
1699           if (src < src_end && *src++ < 0x40)
1700             return 0;
1701         }
1702     }
1703   return CODING_CATEGORY_MASK_SJIS;
1704 }
1705
1706 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1707    Check if a text is encoded in BIG5.  If it is, return
1708    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1709
1710 int
1711 detect_coding_big5 (src, src_end)
1712      unsigned char *src, *src_end;
1713 {
1714   unsigned char c;
1715
1716   while (src < src_end)
1717     {
1718       c = *src++;
1719       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1720         return 0;
1721       if (c >= 0xA1)
1722         {
1723           if (src >= src_end)
1724             break;
1725           c = *src++;
1726           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1727             return 0;
1728         }
1729     }
1730   return CODING_CATEGORY_MASK_BIG5;
1731 }
1732
1733 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1734    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1735
1736 int
1737 decode_coding_sjis_big5 (coding, source, destination,
1738                          src_bytes, dst_bytes, consumed, sjis_p)
1739      struct coding_system *coding;
1740      unsigned char *source, *destination;
1741      int src_bytes, dst_bytes;
1742      int *consumed;
1743      int sjis_p;
1744 {
1745   unsigned char *src = source;
1746   unsigned char *src_end = source + src_bytes;
1747   unsigned char *dst = destination;
1748   unsigned char *dst_end = destination + dst_bytes;
1749   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1750      from DST_END to assure overflow checking is necessary only at the
1751      head of loop.  */
1752   unsigned char *adjusted_dst_end = dst_end - 3;
1753   Lisp_Object unification_table
1754       = coding->character_unification_table_for_decode;
1755
1756   if (!NILP (Venable_character_unification) && NILP (unification_table))
1757     unification_table = Vstandard_character_unification_table_for_decode;
1758
1759   while (src < src_end && dst < adjusted_dst_end)
1760     {
1761       /* SRC_BASE remembers the start position in source in each loop.
1762          The loop will be exited when there's not enough source text
1763          to analyze two-byte character (within macro ONE_MORE_BYTE).
1764          In that case, SRC is reset to SRC_BASE before exiting.  */
1765       unsigned char *src_base = src;
1766       unsigned char c1 = *src++, c2, c3, c4;
1767
1768       if (c1 == '\r')
1769         {
1770           if (coding->eol_type == CODING_EOL_CRLF)
1771             {
1772               ONE_MORE_BYTE (c2);
1773               if (c2 == '\n')
1774                 *dst++ = c2;
1775               else
1776                 /* To process C2 again, SRC is subtracted by 1.  */
1777                 *dst++ = c1, src--;
1778             }
1779           else
1780             *dst++ = c1;
1781         }
1782       else if (c1 < 0x20)
1783         *dst++ = c1;
1784       else if (c1 < 0x80)
1785         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1786       else if (c1 < 0xA0 || c1 >= 0xE0)
1787         {
1788           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1789           if (sjis_p)
1790             {
1791               ONE_MORE_BYTE (c2);
1792               DECODE_SJIS (c1, c2, c3, c4);
1793               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1794             }
1795           else if (c1 >= 0xE0 && c1 < 0xFF)
1796             {
1797               int charset;
1798
1799               ONE_MORE_BYTE (c2);
1800               DECODE_BIG5 (c1, c2, charset, c3, c4);
1801               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1802             }
1803           else                  /* Invalid code */
1804             *dst++ = c1;
1805         }
1806       else
1807         {
1808           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1809           if (sjis_p)
1810             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1811           else
1812             {
1813               int charset;
1814
1815               ONE_MORE_BYTE (c2);
1816               DECODE_BIG5 (c1, c2, charset, c3, c4);
1817               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1818             }
1819         }
1820       continue;
1821
1822     label_end_of_loop:
1823       coding->carryover_size = src - src_base;
1824       bcopy (src_base, coding->carryover, coding->carryover_size);
1825       src = src_base;
1826       break;
1827     }
1828
1829   *consumed = src - source;
1830   return dst - destination;
1831 }
1832
1833 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1834    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1835    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1836    sure that all these charsets are registered as official charset
1837    (i.e. do not have extended leading-codes).  Characters of other
1838    charsets are produced without any encoding.  If SJIS_P is 1, encode
1839    SJIS text, else encode BIG5 text.  */
1840
1841 int
1842 encode_coding_sjis_big5 (coding, source, destination,
1843                          src_bytes, dst_bytes, consumed, sjis_p)
1844      struct coding_system *coding;
1845      unsigned char *source, *destination;
1846      int src_bytes, dst_bytes;
1847      int *consumed;
1848      int sjis_p;
1849 {
1850   unsigned char *src = source;
1851   unsigned char *src_end = source + src_bytes;
1852   unsigned char *dst = destination;
1853   unsigned char *dst_end = destination + dst_bytes;
1854   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1855      from DST_END to assure overflow checking is necessary only at the
1856      head of loop.  */
1857   unsigned char *adjusted_dst_end = dst_end - 1;
1858   Lisp_Object unification_table
1859       = coding->character_unification_table_for_encode;
1860
1861   if (!NILP (Venable_character_unification) && NILP (unification_table))
1862     unification_table = Vstandard_character_unification_table_for_encode;
1863
1864   while (src < src_end && dst < adjusted_dst_end)
1865     {
1866       /* SRC_BASE remembers the start position in source in each loop.
1867          The loop will be exited when there's not enough source text
1868          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1869          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1870          before exiting.  */
1871       unsigned char *src_base = src;
1872       unsigned char c1 = *src++, c2, c3, c4;
1873
1874       if (coding->composing)
1875         {
1876           if (c1 == 0xA0)
1877             {
1878               ONE_MORE_BYTE (c1);
1879               c1 &= 0x7F;
1880             }
1881           else if (c1 >= 0xA0)
1882             c1 -= 0x20;
1883           else
1884             coding->composing = 0;
1885         }
1886
1887       switch (emacs_code_class[c1])
1888         {
1889         case EMACS_ascii_code:
1890           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1891           break;
1892
1893         case EMACS_control_code:
1894           *dst++ = c1;
1895           break;
1896
1897         case EMACS_carriage_return_code:
1898           if (!coding->selective)
1899             {
1900               *dst++ = c1;
1901               break;
1902             }
1903           /* fall down to treat '\r' as '\n' ...  */
1904
1905         case EMACS_linefeed_code:
1906           if (coding->eol_type == CODING_EOL_LF
1907               || coding->eol_type == CODING_EOL_UNDECIDED)
1908             *dst++ = '\n';
1909           else if (coding->eol_type == CODING_EOL_CRLF)
1910             *dst++ = '\r', *dst++ = '\n';
1911           else
1912             *dst++ = '\r';
1913           break;
1914
1915         case EMACS_leading_code_2:
1916           ONE_MORE_BYTE (c2);
1917           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1918           break;
1919
1920         case EMACS_leading_code_3:
1921           TWO_MORE_BYTES (c2, c3);
1922           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1923           break;
1924
1925         case EMACS_leading_code_4:
1926           THREE_MORE_BYTES (c2, c3, c4);
1927           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1928           break;
1929
1930         case EMACS_leading_code_composition:
1931           coding->composing = 1;
1932           break;
1933
1934         default:                /* i.e. case EMACS_invalid_code: */
1935           *dst++ = c1;
1936         }
1937       continue;
1938
1939     label_end_of_loop:
1940       coding->carryover_size = src - src_base;
1941       bcopy (src_base, coding->carryover, coding->carryover_size);
1942       src = src_base;
1943       break;
1944     }
1945
1946   *consumed = src - source;
1947   return dst - destination;
1948 }
1949
1950 \f
1951 /*** 5. End-of-line handlers ***/
1952
1953 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1954    This function is called only when `coding->eol_type' is
1955    CODING_EOL_CRLF or CODING_EOL_CR.  */
1956
1957 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1958      struct coding_system *coding;
1959      unsigned char *source, *destination;
1960      int src_bytes, dst_bytes;
1961      int *consumed;
1962 {
1963   unsigned char *src = source;
1964   unsigned char *src_end = source + src_bytes;
1965   unsigned char *dst = destination;
1966   unsigned char *dst_end = destination + dst_bytes;
1967   int produced;
1968
1969   switch (coding->eol_type)
1970     {
1971     case CODING_EOL_CRLF:
1972       {
1973         /* Since the maximum bytes produced by each loop is 2, we
1974            subtract 1 from DST_END to assure overflow checking is
1975            necessary only at the head of loop.  */
1976         unsigned char *adjusted_dst_end = dst_end - 1;
1977
1978         while (src < src_end && dst < adjusted_dst_end)
1979           {
1980             unsigned char *src_base = src;
1981             unsigned char c = *src++;
1982             if (c == '\r')
1983               {
1984                 ONE_MORE_BYTE (c);
1985                 if (c != '\n')
1986                   *dst++ = '\r';
1987                 *dst++ = c;
1988               }
1989             else
1990               *dst++ = c;
1991             continue;
1992
1993           label_end_of_loop:
1994             coding->carryover_size = src - src_base;
1995             bcopy (src_base, coding->carryover, coding->carryover_size);
1996             src = src_base;
1997             break;
1998           }
1999         *consumed = src - source;
2000         produced = dst - destination;
2001         break;
2002       }
2003
2004     case CODING_EOL_CR:
2005       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2006       bcopy (source, destination, produced);
2007       dst_end = destination + produced;
2008       while (dst < dst_end)
2009         if (*dst++ == '\r') dst[-1] = '\n';
2010       *consumed = produced;
2011       break;
2012
2013     default:                    /* i.e. case: CODING_EOL_LF */
2014       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2015       bcopy (source, destination, produced);
2016       *consumed = produced;
2017       break;
2018     }
2019
2020   return produced;
2021 }
2022
2023 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2024    format of end-of-line according to `coding->eol_type'.  If
2025    `coding->selective' is 1, code '\r' in source text also means
2026    end-of-line.  */
2027
2028 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2029      struct coding_system *coding;
2030      unsigned char *source, *destination;
2031      int src_bytes, dst_bytes;
2032      int *consumed;
2033 {
2034   unsigned char *src = source;
2035   unsigned char *dst = destination;
2036   int produced;
2037
2038   if (src_bytes <= 0)
2039     return 0;
2040
2041   switch (coding->eol_type)
2042     {
2043     case CODING_EOL_LF:
2044     case CODING_EOL_UNDECIDED:
2045       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2046       bcopy (source, destination, produced);
2047       if (coding->selective)
2048         {
2049           int i = produced;
2050           while (i--)
2051             if (*dst++ == '\r') dst[-1] = '\n';
2052         }
2053       *consumed = produced;
2054
2055     case CODING_EOL_CRLF:
2056       {
2057         unsigned char c;
2058         unsigned char *src_end = source + src_bytes;
2059         unsigned char *dst_end = destination + dst_bytes;
2060         /* Since the maximum bytes produced by each loop is 2, we
2061            subtract 1 from DST_END to assure overflow checking is
2062            necessary only at the head of loop.  */
2063         unsigned char *adjusted_dst_end = dst_end - 1;
2064
2065         while (src < src_end && dst < adjusted_dst_end)
2066           {
2067             c = *src++;
2068             if (c == '\n' || (c == '\r' && coding->selective))
2069               *dst++ = '\r', *dst++ = '\n';
2070             else
2071               *dst++ = c;
2072           }
2073         produced = dst - destination;
2074         *consumed = src - source;
2075         break;
2076       }
2077
2078     default:                    /* i.e. case CODING_EOL_CR: */
2079       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2080       bcopy (source, destination, produced);
2081       {
2082         int i = produced;
2083         while (i--)
2084           if (*dst++ == '\n') dst[-1] = '\r';
2085       }
2086       *consumed = produced;
2087     }
2088
2089   return produced;
2090 }
2091
2092 \f
2093 /*** 6. C library functions ***/
2094
2095 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2096    has a property `coding-system'.  The value of this property is a
2097    vector of length 5 (called as coding-vector).  Among elements of
2098    this vector, the first (element[0]) and the fifth (element[4])
2099    carry important information for decoding/encoding.  Before
2100    decoding/encoding, this information should be set in fields of a
2101    structure of type `coding_system'.
2102
2103    A value of property `coding-system' can be a symbol of another
2104    subsidiary coding-system.  In that case, Emacs gets coding-vector
2105    from that symbol.
2106
2107    `element[0]' contains information to be set in `coding->type'.  The
2108    value and its meaning is as follows:
2109
2110    0 -- coding_type_emacs_mule
2111    1 -- coding_type_sjis
2112    2 -- coding_type_iso2022
2113    3 -- coding_type_big5
2114    4 -- coding_type_ccl encoder/decoder written in CCL
2115    nil -- coding_type_no_conversion
2116    t -- coding_type_undecided (automatic conversion on decoding,
2117                                no-conversion on encoding)
2118
2119    `element[4]' contains information to be set in `coding->flags' and
2120    `coding->spec'.  The meaning varies by `coding->type'.
2121
2122    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2123    of length 32 (of which the first 13 sub-elements are used now).
2124    Meanings of these sub-elements are:
2125
2126    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2127         If the value is an integer of valid charset, the charset is
2128         assumed to be designated to graphic register N initially.
2129
        If the value is negative, it is the negated value of a charset
        which reserves graphic register N: the charset is not
        designated initially but should be designated to graphic
        register N just before encoding a character in that charset.
2134
2135         If the value is nil, graphic register N is never used on
2136         encoding.
2137
2138    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2139         Each value takes t or nil.  See the section ISO2022 of
2140         `coding.h' for more information.
2141
2142    If `coding->type' is `coding_type_big5', element[4] is t to denote
2143    BIG5-ETen or nil to denote BIG5-HKU.
2144
2145    If `coding->type' takes the other value, element[4] is ignored.
2146
2147    Emacs Lisp's coding system also carries information about format of
2148    end-of-line in a value of property `eol-type'.  If the value is
2149    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2150    means CODING_EOL_CR.  If it is not integer, it should be a vector
2151    of subsidiary coding systems of which property `eol-type' has one
2152    of above values.
2153
2154 */
2155
2156 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2157    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2158    is setup so that no conversion is necessary and return -1, else
2159    return 0.  */
2160
2161 int
2162 setup_coding_system (coding_system, coding)
2163      Lisp_Object coding_system;
2164      struct coding_system *coding;
2165 {
2166   Lisp_Object type, eol_type;
2167
2168   /* At first, set several fields to default values.  */
2169   coding->require_flushing = 0;
2170   coding->last_block = 0;
2171   coding->selective = 0;
2172   coding->composing = 0;
2173   coding->direction = 0;
2174   coding->carryover_size = 0;
2175   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2176   coding->character_unification_table_for_decode = Qnil;
2177   coding->character_unification_table_for_encode = Qnil;
2178
2179   Vlast_coding_system_used = coding->symbol = coding_system;
2180   eol_type = Qnil;
2181   /* Get value of property `coding-system' until we get a vector.
2182      While doing that, also get values of properties
2183      `post-read-conversion', `pre-write-conversion',
2184      `character-unification-table-for-decode',
2185      `character-unification-table-for-encode' and `eol-type'.  */
2186   while (!NILP (coding_system) && SYMBOLP (coding_system))
2187     {
2188       if (NILP (coding->post_read_conversion))
2189         coding->post_read_conversion = Fget (coding_system,
2190                                              Qpost_read_conversion);
2191       if (NILP (coding->pre_write_conversion))
2192         coding->pre_write_conversion = Fget (coding_system,
2193                                              Qpre_write_conversion);
2194       if (!inhibit_eol_conversion && NILP (eol_type))
2195         eol_type = Fget (coding_system, Qeol_type);
2196
2197       if (NILP (coding->character_unification_table_for_decode))
2198         coding->character_unification_table_for_decode
2199           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2200
2201       if (NILP (coding->character_unification_table_for_encode))
2202         coding->character_unification_table_for_encode
2203           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2204
2205       coding_system = Fget (coding_system, Qcoding_system);
2206     }
2207
2208   while (!NILP (coding->character_unification_table_for_decode)
2209          && SYMBOLP (coding->character_unification_table_for_decode))
2210         coding->character_unification_table_for_decode
2211           = Fget (coding->character_unification_table_for_decode,
2212                   Qcharacter_unification_table_for_decode);
2213   if (!NILP (coding->character_unification_table_for_decode)
2214       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2215       coding->character_unification_table_for_decode = Qnil;
2216
2217   while (!NILP (coding->character_unification_table_for_encode)
2218          && SYMBOLP (coding->character_unification_table_for_encode))
2219         coding->character_unification_table_for_encode
2220           = Fget (coding->character_unification_table_for_encode,
2221                   Qcharacter_unification_table_for_encode);
2222   if (!NILP (coding->character_unification_table_for_encode)
2223       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2224       coding->character_unification_table_for_encode = Qnil;
2225
2226   if (!VECTORP (coding_system)
2227       || XVECTOR (coding_system)->size != 5)
2228     goto label_invalid_coding_system;
2229
2230   if (VECTORP (eol_type))
2231     coding->eol_type = CODING_EOL_UNDECIDED;
2232   else if (XFASTINT (eol_type) == 1)
2233     coding->eol_type = CODING_EOL_CRLF;
2234   else if (XFASTINT (eol_type) == 2)
2235     coding->eol_type = CODING_EOL_CR;
2236   else
2237     coding->eol_type = CODING_EOL_LF;
2238
2239   type = XVECTOR (coding_system)->contents[0];
2240   switch (XFASTINT (type))
2241     {
2242     case 0:
2243       coding->type = coding_type_emacs_mule;
2244       break;
2245
2246     case 1:
2247       coding->type = coding_type_sjis;
2248       break;
2249
2250     case 2:
2251       coding->type = coding_type_iso2022;
2252       {
2253         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2254         Lisp_Object *flags;
2255         int i, charset, default_reg_bits = 0;
2256
2257         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2258           goto label_invalid_coding_system;
2259
2260         flags = XVECTOR (val)->contents;
2261         coding->flags
2262           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2263              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2264              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2265              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2266              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2267              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2268              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2269              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2270              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2271              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2272              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2273
2274         /* Invoke graphic register 0 to plane 0.  */
2275         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2276         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2277         CODING_SPEC_ISO_INVOCATION (coding, 1)
2278           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2279         /* Not single shifting at first.  */
2280         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2281         /* Beginning of buffer should also be regarded as bol. */
2282         CODING_SPEC_ISO_BOL(coding) = 1;
2283
2284         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2285            FLAGS[REG] can be one of below:
2286                 integer CHARSET: CHARSET occupies register I,
2287                 t: designate nothing to REG initially, but can be used
2288                   by any charsets,
2289                 list of integer, nil, or t: designate the first
2290                   element (if integer) to REG initially, the remaining
2291                   elements (if integer) is designated to REG on request,
2292                   if an element is t, REG can be used by any charset,
2293                 nil: REG is never used.  */
2294         for (charset = 0; charset <= MAX_CHARSET; charset++)
2295           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2296             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2297         for (i = 0; i < 4; i++)
2298           {
2299             if (INTEGERP (flags[i])
2300                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2301                 || (charset = get_charset_id (flags[i])) >= 0)
2302               {
2303                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2304                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2305               }
2306             else if (EQ (flags[i], Qt))
2307               {
2308                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2309                 default_reg_bits |= 1 << i;
2310               }
2311             else if (CONSP (flags[i]))
2312               {
2313                 Lisp_Object tail = flags[i];
2314
2315                 if (INTEGERP (XCONS (tail)->car)
2316                     && (charset = XINT (XCONS (tail)->car),
2317                         CHARSET_VALID_P (charset))
2318                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2319                   {
2320                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2321                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2322                   }
2323                 else
2324                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2325                 tail = XCONS (tail)->cdr;
2326                 while (CONSP (tail))
2327                   {
2328                     if (INTEGERP (XCONS (tail)->car)
2329                         && (charset = XINT (XCONS (tail)->car),
2330                             CHARSET_VALID_P (charset))
2331                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2332                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2333                         = i;
2334                     else if (EQ (XCONS (tail)->car, Qt))
2335                       default_reg_bits |= 1 << i;
2336                     tail = XCONS (tail)->cdr;
2337                   }
2338               }
2339             else
2340               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2341
2342             CODING_SPEC_ISO_DESIGNATION (coding, i)
2343               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2344           }
2345
2346         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2347           {
2348             /* REG 1 can be used only by locking shift in 7-bit env.  */
2349             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2350               default_reg_bits &= ~2;
2351             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2352               /* Without any shifting, only REG 0 and 1 can be used.  */
2353               default_reg_bits &= 3;
2354           }
2355
2356         for (charset = 0; charset <= MAX_CHARSET; charset++)
2357           if (CHARSET_VALID_P (charset)
2358               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2359                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2360             {
2361               /* We have not yet decided where to designate CHARSET.  */
2362               int reg_bits = default_reg_bits;
2363
2364               if (CHARSET_CHARS (charset) == 96)
2365                 /* A charset of CHARS96 can't be designated to REG 0.  */
2366                 reg_bits &= ~1;
2367
2368               if (reg_bits)
2369                 /* There exist some default graphic register.  */
2370                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2371                   = (reg_bits & 1
2372                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2373               else
2374                 /* We anyway have to designate CHARSET to somewhere.  */
2375                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2376                   = (CHARSET_CHARS (charset) == 94
2377                      ? 0
2378                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2379                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2380                         ? 1
2381                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2382                            ? 2 : 0)));
2383             }
2384       }
2385       coding->require_flushing = 1;
2386       break;
2387
2388     case 3:
2389       coding->type = coding_type_big5;
2390       coding->flags
2391         = (NILP (XVECTOR (coding_system)->contents[4])
2392            ? CODING_FLAG_BIG5_HKU
2393            : CODING_FLAG_BIG5_ETEN);
2394       break;
2395
2396     case 4:
2397       coding->type = coding_type_ccl;
2398       {
2399         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2400         if (CONSP  (val)
2401             && VECTORP (XCONS (val)->car)
2402             && VECTORP (XCONS (val)->cdr))
2403           {
2404             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2405             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2406           }
2407         else
2408           goto label_invalid_coding_system;
2409       }
2410       coding->require_flushing = 1;
2411       break;
2412
2413     default:
2414       if (EQ (type, Qt))
2415         coding->type = coding_type_undecided;
2416       else
2417         coding->type = coding_type_no_conversion;
2418       break;
2419     }
2420   return 0;
2421
2422  label_invalid_coding_system:
2423   coding->type = coding_type_no_conversion;
2424   coding->eol_type = CODING_EOL_LF;
2425   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2426     = Qnil;
2427   return -1;
2428 }
2429
2430 /* Emacs has a mechanism to automatically detect a coding system if it
2431    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2432    it's impossible to distinguish some coding systems accurately
2433    because they use the same range of codes.  So, at first, coding
2434    systems are categorized into 8, those are:
2435
2436    o coding-category-emacs-mule
2437
2438         The category for a coding system which has the same code range
2439         as Emacs' internal format.  Assigned the coding-system (Lisp
2440         symbol) `emacs-mule' by default.
2441
2442    o coding-category-sjis
2443
2444         The category for a coding system which has the same code range
2445         as SJIS.  Assigned the coding-system (Lisp
2446         symbol) `shift-jis' by default.
2447
2448    o coding-category-iso-7
2449
2450         The category for a coding system which has the same code range
2451         as ISO2022 of 7-bit environment.  Assigned the coding-system
2452         (Lisp symbol) `iso-2022-7' by default.
2453
2454    o coding-category-iso-8-1
2455
2456         The category for a coding system which has the same code range
2457         as ISO2022 of 8-bit environment and graphic plane 1 used only
2458         for DIMENSION1 charset.  Assigned the coding-system (Lisp
2459         symbol) `iso-8859-1' by default.
2460
2461    o coding-category-iso-8-2
2462
2463         The category for a coding system which has the same code range
2464         as ISO2022 of 8-bit environment and graphic plane 1 used only
2465         for DIMENSION2 charset.  Assigned the coding-system (Lisp
2466         symbol) `euc-japan' by default.
2467
2468    o coding-category-iso-else
2469
2470         The category for a coding system which has the same code range
2471         as ISO2022 but does not belong to any of the above three
2472         categories.  Assigned the coding-system (Lisp symbol)
2473         `iso-2022-ss2-7' by default.
2474
2475    o coding-category-big5
2476
2477         The category for a coding system which has the same code range
2478         as BIG5.  Assigned the coding-system (Lisp symbol)
2479         `cn-big5' by default.
2480
2481    o coding-category-binary
2482
2483         The category for a coding system not categorized in any of the
2484         above.  Assigned the coding-system (Lisp symbol)
2485         `no-conversion' by default.
2486
2487    Each of them is a Lisp symbol and the value is an actual
2488    `coding-system's (this is also a Lisp symbol) assigned by a user.
2489    What Emacs does actually is to detect a category of coding system.
2490    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2491    decide only one possible category, it selects a category of the
2492    highest priority.  Priorities of categories are also specified by a
2493    user in a Lisp variable `coding-category-list'.
2494
2495 */
2496
2497 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2498    If it detects possible coding systems, return an integer in which
2499    appropriate flag bits are set.  Flag bits are defined by macros
2500    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2501
2502 int
2503 detect_coding_mask (src, src_bytes)
2504      unsigned char *src;
2505      int src_bytes;
2506 {
2507   register unsigned char c;
2508   unsigned char *src_end = src + src_bytes;
2509   int mask;
2510
2511   /* At first, skip all ASCII characters and control characters except
2512      for three ISO2022 specific control characters.  */
2513  label_loop_detect_coding:
2514   while (src < src_end)
2515     {
2516       c = *src;
2517       if (c >= 0x80
2518           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2519         break;
2520       src++;
2521     }
2522
2523   if (src >= src_end)
2524     /* We found nothing other than ASCII.  There's nothing to do.  */
2525     return CODING_CATEGORY_MASK_ANY;
2526
2527   /* The text seems to be encoded in some multilingual coding system.
2528      Now, try to find in which coding system the text is encoded.  */
2529   if (c < 0x80)
2530     {
2531       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2532       /* C is an ISO2022 specific control code of C0.  */
2533       mask = detect_coding_iso2022 (src, src_end);
2534       src++;
2535       if (mask == CODING_CATEGORY_MASK_ANY)
2536         /* No valid ISO2022 code follows C.  Try again.  */
2537         goto label_loop_detect_coding;
2538     }
2539   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2540     /* C is an ISO2022 specific control code of C1,
2541        or the first byte of SJIS's 2-byte character code,
2542        or a leading code of Emacs.  */
2543     mask = (detect_coding_iso2022 (src, src_end)
2544             | detect_coding_sjis (src, src_end)
2545             | detect_coding_emacs_mule (src, src_end));
2546
2547   else if (c < 0xA0)
2548     /* C is the first byte of SJIS character code,
2549        or a leading-code of Emacs.  */
2550     mask = (detect_coding_sjis (src, src_end)
2551             | detect_coding_emacs_mule (src, src_end));
2552
2553   else
2554     /* C is a character of ISO2022 in graphic plane right,
2555        or a SJIS's 1-byte character code (i.e. JISX0201),
2556        or the first byte of BIG5's 2-byte code.  */
2557     mask = (detect_coding_iso2022 (src, src_end)
2558             | detect_coding_sjis (src, src_end)
2559             | detect_coding_big5 (src, src_end));
2560
2561   return mask;
2562 }
2563
2564 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2565    The information of the detected coding system is set in CODING.  */
2566
2567 void
2568 detect_coding (coding, src, src_bytes)
2569      struct coding_system *coding;
2570      unsigned char *src;
2571      int src_bytes;
2572 {
2573   int mask = detect_coding_mask (src, src_bytes);
2574   int idx;
2575
2576   if (mask == CODING_CATEGORY_MASK_ANY)
2577     /* We found nothing other than ASCII.  There's nothing to do.  */
2578     return;
2579
2580   if (!mask)
2581     /* The source text seems to be encoded in unknown coding system.
2582        Emacs regards the category of such a kind of coding system as
2583        `coding-category-binary'.  We assume that a user has assigned
2584        an appropriate coding system for a `coding-category-binary'.  */
2585     idx = CODING_CATEGORY_IDX_BINARY;
2586   else
2587     {
2588       /* We found some plausible coding systems.  Let's use a coding
2589          system of the highest priority.  */
2590       Lisp_Object val = Vcoding_category_list;
2591
2592       if (CONSP (val))
2593         while (!NILP (val))
2594           {
2595             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2596             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2597               break;
2598             val = XCONS (val)->cdr;
2599           }
2600       else
2601         val = Qnil;
2602
2603       if (NILP (val))
2604         {
2605           /* For unknown reason, `Vcoding_category_list' contains none
2606              of found categories.  Let's use any of them.  */
2607           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2608             if (mask & (1 << idx))
2609               break;
2610         }
2611     }
2612   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2613 }
2614
2615 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2616    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2617    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2618
2619 int
2620 detect_eol_type (src, src_bytes)
2621      unsigned char *src;
2622      int src_bytes;
2623 {
2624   unsigned char *src_end = src + src_bytes;
2625   unsigned char c;
2626
2627   while (src < src_end)
2628     {
2629       c = *src++;
2630       if (c == '\n')
2631         return CODING_EOL_LF;
2632       else if (c == '\r')
2633         {
2634           if (src < src_end && *src == '\n')
2635             return CODING_EOL_CRLF;
2636           else
2637             return CODING_EOL_CR;
2638         }
2639     }
2640   return CODING_EOL_UNDECIDED;
2641 }
2642
2643 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2644    is encoded.  If it detects an appropriate format of end-of-line, it
2645    sets the information in *CODING.  */
2646
2647 void
2648 detect_eol (coding, src, src_bytes)
2649      struct coding_system *coding;
2650      unsigned char *src;
2651      int src_bytes;
2652 {
2653   Lisp_Object val;
2654   int eol_type = detect_eol_type (src, src_bytes);
2655
2656   if (eol_type == CODING_EOL_UNDECIDED)
2657     /*  We found no end-of-line in the source text.  */
2658     return;
2659
2660   val = Fget (coding->symbol, Qeol_type);
2661   if (VECTORP (val) && XVECTOR (val)->size == 3)
2662     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2663 }
2664
2665 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2666    decoding, it may detect coding system and format of end-of-line if
2667    those are not yet decided.  */
2668
2669 int
2670 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2671      struct coding_system *coding;
2672      unsigned char *source, *destination;
2673      int src_bytes, dst_bytes;
2674      int *consumed;
2675 {
2676   int produced;
2677
2678   if (src_bytes <= 0)
2679     {
2680       *consumed = 0;
2681       return 0;
2682     }
2683
2684   if (coding->type == coding_type_undecided)
2685     detect_coding (coding, source, src_bytes);
2686
2687   if (coding->eol_type == CODING_EOL_UNDECIDED)
2688     detect_eol (coding, source, src_bytes);
2689
2690   coding->carryover_size = 0;
2691   switch (coding->type)
2692     {
2693     case coding_type_no_conversion:
2694     label_no_conversion:
2695       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2696       bcopy (source, destination, produced);
2697       *consumed = produced;
2698       break;
2699
2700     case coding_type_emacs_mule:
2701     case coding_type_undecided:
2702       if (coding->eol_type == CODING_EOL_LF
2703           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2704         goto label_no_conversion;
2705       produced = decode_eol (coding, source, destination,
2706                              src_bytes, dst_bytes, consumed);
2707       break;
2708
2709     case coding_type_sjis:
2710       produced = decode_coding_sjis_big5 (coding, source, destination,
2711                                           src_bytes, dst_bytes, consumed,
2712                                           1);
2713       break;
2714
2715     case coding_type_iso2022:
2716       produced = decode_coding_iso2022 (coding, source, destination,
2717                                         src_bytes, dst_bytes, consumed);
2718       break;
2719
2720     case coding_type_big5:
2721       produced = decode_coding_sjis_big5 (coding, source, destination,
2722                                           src_bytes, dst_bytes, consumed,
2723                                           0);
2724       break;
2725
2726     case coding_type_ccl:
2727       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2728                              src_bytes, dst_bytes, consumed);
2729       break;
2730     }
2731
2732   return produced;
2733 }
2734
2735 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2736
2737 int
2738 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2739      struct coding_system *coding;
2740      unsigned char *source, *destination;
2741      int src_bytes, dst_bytes;
2742      int *consumed;
2743 {
2744   int produced;
2745
2746   coding->carryover_size = 0;
2747   switch (coding->type)
2748     {
2749     case coding_type_no_conversion:
2750     label_no_conversion:
2751       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2752       if (produced > 0)
2753         {
2754           bcopy (source, destination, produced);
2755           if (coding->selective)
2756             {
2757               unsigned char *p = destination, *pend = destination + produced;
2758               while (p < pend)
2759                 if (*p++ == '\015') p[-1] = '\n';
2760             }
2761         }
2762       *consumed = produced;
2763       break;
2764
2765     case coding_type_emacs_mule:
2766     case coding_type_undecided:
2767       if (coding->eol_type == CODING_EOL_LF
2768           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2769         goto label_no_conversion;
2770       produced = encode_eol (coding, source, destination,
2771                              src_bytes, dst_bytes, consumed);
2772       break;
2773
2774     case coding_type_sjis:
2775       produced = encode_coding_sjis_big5 (coding, source, destination,
2776                                           src_bytes, dst_bytes, consumed,
2777                                           1);
2778       break;
2779
2780     case coding_type_iso2022:
2781       produced = encode_coding_iso2022 (coding, source, destination,
2782                                         src_bytes, dst_bytes, consumed);
2783       break;
2784
2785     case coding_type_big5:
2786       produced = encode_coding_sjis_big5 (coding, source, destination,
2787                                           src_bytes, dst_bytes, consumed,
2788                                           0);
2789       break;
2790
2791     case coding_type_ccl:
2792       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2793                              src_bytes, dst_bytes, consumed);
2794       break;
2795     }
2796
2797   return produced;
2798 }
2799
2800 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2801
2802 /* Return maximum size (bytes) of a buffer enough for decoding
2803    SRC_BYTES of text encoded in CODING.  */
2804
2805 int
2806 decoding_buffer_size (coding, src_bytes)
2807      struct coding_system *coding;
2808      int src_bytes;
2809 {
2810   int magnification;
2811
2812   if (coding->type == coding_type_iso2022)
2813     magnification = 3;
2814   else if (coding->type == coding_type_ccl)
2815     magnification = coding->spec.ccl.decoder.buf_magnification;
2816   else
2817     magnification = 2;
2818
2819   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2820 }
2821
2822 /* Return maximum size (bytes) of a buffer enough for encoding
2823    SRC_BYTES of text to CODING.  */
2824
2825 int
2826 encoding_buffer_size (coding, src_bytes)
2827      struct coding_system *coding;
2828      int src_bytes;
2829 {
2830   int magnification;
2831
2832   if (coding->type == coding_type_ccl)
2833     magnification = coding->spec.ccl.encoder.buf_magnification;
2834   else
2835     magnification = 3;
2836
2837   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2838 }
2839
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Work buffer handed out by get_conversion_buffer, and the number of
   bytes currently allocated for it.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a SIZE bytes of buffer to be used for encoding
   or decoding.  Sufficient memory is allocated automatically: when
   the current buffer is too small, it is replaced by a new one whose
   size is the current size doubled until it is at least SIZE.  The
   old contents are not copied into the new buffer.

   The value is whatever xmalloc returned, so it is NULL only if
   xmalloc itself can return NULL -- NOTE(review): confirm against
   xmalloc's out-of-memory behavior.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Doubling a zero size never grows, which would make the loop
         below spin forever if the buffer has not been set up yet.  */
      if (real_size <= 0)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size) real_size *= 2;
      /* Allocate the new buffer before releasing the old one.  */
      buf = (char *) xmalloc (real_size);
      xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2868
2869 \f
2870 #ifdef emacs
2871 /*** 7. Emacs Lisp library functions ***/
2872
2873 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2874        1, 1, 0,
2875   "Return coding-spec of CODING-SYSTEM.\n\
2876 If CODING-SYSTEM is not a valid coding-system, return nil.")
2877   (obj)
2878      Lisp_Object obj;
2879 {
2880   while (SYMBOLP (obj) && !NILP (obj))
2881     obj = Fget (obj, Qcoding_system);
2882   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2883           ? Qnil : obj);
2884 }
2885
2886 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2887   "Return t if OBJECT is nil or a coding-system.\n\
2888 See document of make-coding-system for coding-system object.")
2889   (obj)
2890      Lisp_Object obj;
2891 {
2892   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2893 }
2894
2895 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2896        Sread_non_nil_coding_system, 1, 1, 0,
2897   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2898   (prompt)
2899      Lisp_Object prompt;
2900 {
2901   Lisp_Object val;
2902   do
2903     {
2904       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2905                               Qt, Qnil, Qnil, Qnil);
2906     }
2907   while (XSTRING (val)->size == 0);
2908   return (Fintern (val, Qnil));
2909 }
2910
2911 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2912   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2913   (prompt)
2914      Lisp_Object prompt;
2915 {
2916   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2917                                       Qt, Qnil, Qnil, Qnil);
2918   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2919 }
2920
2921 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2922        1, 1, 0,
2923   "Check validity of CODING-SYSTEM.\n\
2924 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2925 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2926 The value of property should be a vector of length 5.")
2927   (coding_system)
2928      Lisp_Object coding_system;
2929 {
2930   CHECK_SYMBOL (coding_system, 0);
2931   if (!NILP (Fcoding_system_p (coding_system)))
2932     return coding_system;
2933   while (1)
2934     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2935 }
2936
2937 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2938        2, 2, 0,
2939   "Detect coding-system of the text in the region between START and END.\n\
2940 Return a list of possible coding-systems ordered by priority.\n\
2941 If only ASCII characters are found, it returns `undecided'\n\
2942  or its subsidiary coding-system according to a detected end-of-line format.")
2943   (b, e)
2944      Lisp_Object b, e;
2945 {
2946   int coding_mask, eol_type;
2947   Lisp_Object val;
2948   int beg, end;
2949
2950   validate_region (&b, &e);
2951   beg = XINT (b), end = XINT (e);
2952   if (beg < GPT && end >= GPT) move_gap (end);
2953
2954   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2955   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2956
2957   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2958     {
2959       val = intern ("undecided");
2960       if (eol_type != CODING_EOL_UNDECIDED)
2961         {
2962           Lisp_Object val2 = Fget (val, Qeol_type);
2963           if (VECTORP (val2))
2964             val = XVECTOR (val2)->contents[eol_type];
2965         }
2966     }
2967   else
2968     {
2969       Lisp_Object val2;
2970
2971       /* At first, gather possible coding-systems in VAL in a reverse
2972          order.  */
2973       val = Qnil;
2974       for (val2 = Vcoding_category_list;
2975            !NILP (val2);
2976            val2 = XCONS (val2)->cdr)
2977         {
2978           int idx
2979             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2980           if (coding_mask & (1 << idx))
2981             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2982         }
2983
2984       /* Then, change the order of the list, while getting subsidiary
2985          coding-systems.  */
2986       val2 = val;
2987       val = Qnil;
2988       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2989         {
2990           if (eol_type == CODING_EOL_UNDECIDED)
2991             val = Fcons (XCONS (val2)->car, val);
2992           else
2993             {
2994               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2995               if (VECTORP (val3))
2996                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2997               else
2998                 val = Fcons (XCONS (val2)->car, val);
2999             }
3000         }
3001     }
3002
3003   return val;
3004 }
3005
3006 /* Scan text in the region between *BEGP and *ENDP, skip characters
3007    which we never have to encode to (iff ENCODEP is 1) or decode from
3008    coding system CODING at the head and tail, then set BEGP and ENDP
3009    to the addresses of start and end of the text we actually convert.  */
3010
3011 void
3012 shrink_conversion_area (begp, endp, coding, encodep)
3013      unsigned char **begp, **endp;
3014      struct coding_system *coding;
3015      int encodep;
3016 {
3017   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3018
3019   if (coding->eol_type != CODING_EOL_LF
3020       && coding->eol_type != CODING_EOL_UNDECIDED)
3021     /* Since we anyway have to convert end-of-line format, it is not
3022        worth skipping at most 100 bytes or so.  */
3023     return;
3024
3025   if (encodep)                  /* for encoding */
3026     {
3027       switch (coding->type)
3028         {
3029         case coding_type_no_conversion:
3030         case coding_type_emacs_mule:
3031         case coding_type_undecided:
3032           /* We need no conversion.  */
3033           *begp = *endp;
3034           return;
3035         case coding_type_ccl:
3036           /* We can't skip any data.  */
3037           return;
3038         case coding_type_iso2022:
3039           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3040             {
3041               unsigned char *bol = beg_addr;
3042               while (beg_addr < end_addr && *beg_addr < 0x80)
3043                 {
3044                   beg_addr++;
3045                   if (*(beg_addr - 1) == '\n')
3046                     bol = beg_addr;
3047                 }
3048               beg_addr = bol;
3049               goto label_skip_tail;
3050             }
3051           /* fall down ... */
3052         default:
3053           /* We can skip all ASCII characters at the head and tail.  */
3054           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3055         label_skip_tail:
3056           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3057           break;
3058         }
3059     }
3060   else                          /* for decoding */
3061     {
3062       switch (coding->type)
3063         {
3064         case coding_type_no_conversion:
3065           /* We need no conversion.  */
3066           *begp = *endp;
3067           return;
3068         case coding_type_emacs_mule:
3069           if (coding->eol_type == CODING_EOL_LF)
3070             {
3071               /* We need no conversion.  */
3072               *begp = *endp;
3073               return;
3074             }
3075           /* We can skip all but carriage-return.  */
3076           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3077           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3078           break;
3079         case coding_type_sjis:
3080         case coding_type_big5:
3081           /* We can skip all ASCII characters at the head.  */
3082           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3083           /* We can skip all ASCII characters at the tail except for
3084              the second byte of SJIS or BIG5 code.  */
3085           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3086           if (end_addr != *endp)
3087             end_addr++;
3088           break;
3089         case coding_type_ccl:
3090           /* We can't skip any data.  */
3091           return;
3092         default:                /* i.e. case coding_type_iso2022: */
3093           {
3094             unsigned char c;
3095
3096             /* We can skip all ASCII characters except for a few
3097                control codes at the head.  */
3098             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3099                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3100                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3101               beg_addr++;
3102           }
3103           break;
3104         }
3105     }
3106   *begp = beg_addr;
3107   *endp = end_addr;
3108   return;
3109 }
3110
3111 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3112    text between B and E.  B and E are buffer position.  */
3113
3114 Lisp_Object
3115 code_convert_region (b, e, coding, encodep)
3116      Lisp_Object b, e;
3117      struct coding_system *coding;
3118      int encodep;
3119 {
3120   int beg, end, len, consumed, produced;
3121   char *buf;
3122   unsigned char *begp, *endp;
3123   int pos = PT;
3124
3125   validate_region (&b, &e);
3126   beg = XINT (b), end = XINT (e);
3127   if (beg < GPT && end >= GPT)
3128     move_gap (end);
3129
3130   if (encodep && !NILP (coding->pre_write_conversion))
3131     {
3132       /* We must call a pre-conversion function which may put a new
3133          text to be converted in a new buffer.  */
3134       struct buffer *old = current_buffer, *new;
3135
3136       TEMP_SET_PT (beg);
3137       call2 (coding->pre_write_conversion, b, e);
3138       if (old != current_buffer)
3139         {
3140           /* Replace the original text by the text just generated.  */
3141           len = ZV - BEGV;
3142           new = current_buffer;
3143           set_buffer_internal (old);
3144           del_range (beg, end);
3145           insert_from_buffer (new, 1, len, 0);
3146           end = beg + len;
3147         }
3148     }
3149
3150   /* We may be able to shrink the conversion region.  */
3151   begp = POS_ADDR (beg); endp = begp + (end - beg);
3152   shrink_conversion_area (&begp, &endp, coding, encodep);
3153
3154   if (begp == endp)
3155     /* We need no conversion.  */
3156     len = end - beg;
3157   else
3158     {
3159       beg += begp - POS_ADDR (beg);
3160       end =  beg + (endp - begp);
3161
3162       if (encodep)
3163         len = encoding_buffer_size (coding, end - beg);
3164       else
3165         len = decoding_buffer_size (coding, end - beg);
3166       buf = get_conversion_buffer (len);
3167
3168       coding->last_block = 1;
3169       produced = (encodep
3170                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3171                                    &consumed)
3172                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3173                                    &consumed));
3174
3175       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3176
3177       TEMP_SET_PT (beg);
3178       insert (buf, produced);
3179       del_range (PT, PT + end - beg);
3180       if (pos >= end)
3181         pos = PT + (pos - end);
3182       else if (pos > beg)
3183         pos = beg;
3184       TEMP_SET_PT (pos);
3185   }
3186
3187   if (!encodep && !NILP (coding->post_read_conversion))
3188     {
3189       /* We must call a post-conversion function which may alter
3190          the text just converted.  */
3191       Lisp_Object insval;
3192
3193       beg = XINT (b);
3194       TEMP_SET_PT (beg);
3195       insval = call1 (coding->post_read_conversion, make_number (len));
3196       CHECK_NUMBER (insval, 0);
3197       len = XINT (insval);
3198     }
3199
3200   return make_number (len);
3201 }
3202
3203 Lisp_Object
3204 code_convert_string (str, coding, encodep, nocopy)
3205      Lisp_Object str, nocopy;
3206      struct coding_system *coding;
3207      int encodep;
3208 {
3209   int len, consumed, produced;
3210   char *buf;
3211   unsigned char *begp, *endp;
3212   int head_skip, tail_skip;
3213   struct gcpro gcpro1;
3214
3215   if (encodep && !NILP (coding->pre_write_conversion)
3216       || !encodep && !NILP (coding->post_read_conversion))
3217     {
3218       /* Since we have to call Lisp functions which assume target text
3219          is in a buffer, after setting a temporary buffer, call
3220          code_convert_region.  */
3221       int count = specpdl_ptr - specpdl;
3222       int len = XSTRING (str)->size;
3223       Lisp_Object result;
3224       struct buffer *old = current_buffer;
3225
3226       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3227       temp_output_buffer_setup (" *code-converting-work*");
3228       set_buffer_internal (XBUFFER (Vstandard_output));
3229       insert_from_string (str, 0, len, 0);
3230       code_convert_region (make_number (BEGV), make_number (ZV),
3231                            coding, encodep);
3232       result = make_buffer_string (BEGV, ZV, 0);
3233       set_buffer_internal (old);
3234       return unbind_to (count, result);
3235     }
3236
3237   /* We may be able to shrink the conversion region.  */
3238   begp = XSTRING (str)->data;
3239   endp = begp + XSTRING (str)->size;
3240   shrink_conversion_area (&begp, &endp, coding, encodep);
3241
3242   if (begp == endp)
3243     /* We need no conversion.  */
3244     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3245
3246   head_skip = begp - XSTRING (str)->data;
3247   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3248
3249   GCPRO1 (str);
3250
3251   if (encodep)
3252     len = encoding_buffer_size (coding, endp - begp);
3253   else
3254     len = decoding_buffer_size (coding, endp - begp);
3255   buf = get_conversion_buffer (len + head_skip + tail_skip);
3256
3257   bcopy (XSTRING (str)->data, buf, head_skip);
3258   coding->last_block = 1;
3259   produced = (encodep
3260               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3261                                buf + head_skip, endp - begp, len, &consumed)
3262               : decode_coding (coding, XSTRING (str)->data + head_skip,
3263                                buf + head_skip, endp - begp, len, &consumed));
3264   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3265          buf + head_skip + produced,
3266          tail_skip);
3267
3268   UNGCPRO;
3269
3270   return make_string (buf, head_skip + produced + tail_skip);
3271 }
3272
3273 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3274        3, 3, "r\nzCoding system: ",
3275   "Decode current region by specified coding system.\n\
3276 When called from a program, takes three arguments:\n\
3277 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3278 Return length of decoded text.")
3279   (b, e, coding_system)
3280      Lisp_Object b, e, coding_system;
3281 {
3282   struct coding_system coding;
3283
3284   CHECK_NUMBER_COERCE_MARKER (b, 0);
3285   CHECK_NUMBER_COERCE_MARKER (e, 1);
3286   CHECK_SYMBOL (coding_system, 2);
3287
3288   if (NILP (coding_system))
3289     return make_number (XFASTINT (e) - XFASTINT (b));
3290   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3291     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3292
3293   return code_convert_region (b, e, &coding, 0);
3294 }
3295
3296 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3297        3, 3, "r\nzCoding system: ",
3298   "Encode current region by specified coding system.\n\
3299 When called from a program, takes three arguments:\n\
3300 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3301 Return length of encoded text.")
3302   (b, e, coding_system)
3303      Lisp_Object b, e, coding_system;
3304 {
3305   struct coding_system coding;
3306
3307   CHECK_NUMBER_COERCE_MARKER (b, 0);
3308   CHECK_NUMBER_COERCE_MARKER (e, 1);
3309   CHECK_SYMBOL (coding_system, 2);
3310
3311   if (NILP (coding_system))
3312     return make_number (XFASTINT (e) - XFASTINT (b));
3313   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3314     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3315
3316   return code_convert_region (b, e, &coding, 1);
3317 }
3318
3319 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3320        2, 3, 0,
3321   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3322 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3323 of decoding.")
3324   (string, coding_system, nocopy)
3325      Lisp_Object string, coding_system, nocopy;
3326 {
3327   struct coding_system coding;
3328
3329   CHECK_STRING (string, 0);
3330   CHECK_SYMBOL (coding_system, 1);
3331
3332   if (NILP (coding_system))
3333     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3334   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3335     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3336
3337   return code_convert_string (string, &coding, 0, nocopy);
3338 }
3339
3340 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3341        2, 3, 0,
3342   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3343 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3344 of encoding.")
3345   (string, coding_system, nocopy)
3346      Lisp_Object string, coding_system, nocopy;
3347 {
3348   struct coding_system coding;
3349
3350   CHECK_STRING (string, 0);
3351   CHECK_SYMBOL (coding_system, 1);
3352
3353   if (NILP (coding_system))
3354     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3355   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3356     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3357
3358   return code_convert_string (string, &coding, 1, nocopy);
3359 }
3360
3361 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3362   "Decode a JISX0208 character of shift-jis encoding.\n\
3363 CODE is the character code in SJIS.\n\
3364 Return the corresponding character.")
3365   (code)
3366      Lisp_Object code;
3367 {
3368   unsigned char c1, c2, s1, s2;
3369   Lisp_Object val;
3370
3371   CHECK_NUMBER (code, 0);
3372   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3373   DECODE_SJIS (s1, s2, c1, c2);
3374   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3375   return val;
3376 }
3377
3378 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3379   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3380 Return the corresponding character code in SJIS.")
3381   (ch)
3382      Lisp_Object ch;
3383 {
3384   int charset, c1, c2, s1, s2;
3385   Lisp_Object val;
3386
3387   CHECK_NUMBER (ch, 0);
3388   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3389   if (charset == charset_jisx0208)
3390     {
3391       ENCODE_SJIS (c1, c2, s1, s2);
3392       XSETFASTINT (val, (s1 << 8) | s2);
3393     }
3394   else
3395     XSETFASTINT (val, 0);
3396   return val;
3397 }
3398
3399 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3400   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3401 CODE is the character code in BIG5.\n\
3402 Return the corresponding character.")
3403   (code)
3404      Lisp_Object code;
3405 {
3406   int charset;
3407   unsigned char b1, b2, c1, c2;
3408   Lisp_Object val;
3409
3410   CHECK_NUMBER (code, 0);
3411   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3412   DECODE_BIG5 (b1, b2, charset, c1, c2);
3413   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3414   return val;
3415 }
3416
3417 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3418   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3419 Return the corresponding character code in Big5.")
3420   (ch)
3421      Lisp_Object ch;
3422 {
3423   int charset, c1, c2, b1, b2;
3424   Lisp_Object val;
3425
3426   CHECK_NUMBER (ch, 0);
3427   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3428   if (charset == charset_big5_1 || charset == charset_big5_2)
3429     {
3430       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3431       XSETFASTINT (val, (b1 << 8) | b2);
3432     }
3433   else
3434     XSETFASTINT (val, 0);
3435   return val;
3436 }
3437
3438 DEFUN ("set-terminal-coding-system-internal",
3439        Fset_terminal_coding_system_internal,
3440        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3441   (coding_system)
3442      Lisp_Object coding_system;
3443 {
3444   CHECK_SYMBOL (coding_system, 0);
3445   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3446   return Qnil;
3447 }
3448
3449 DEFUN ("terminal-coding-system",
3450        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3451   "Return coding-system of your terminal.")
3452   ()
3453 {
3454   return terminal_coding.symbol;
3455 }
3456
3457 DEFUN ("set-keyboard-coding-system-internal",
3458        Fset_keyboard_coding_system_internal,
3459        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3460   (coding_system)
3461      Lisp_Object coding_system;
3462 {
3463   CHECK_SYMBOL (coding_system, 0);
3464   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3465   return Qnil;
3466 }
3467
3468 DEFUN ("keyboard-coding-system",
3469        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3470   "Return coding-system of what is sent from terminal keyboard.")
3471   ()
3472 {
3473   return keyboard_coding.symbol;
3474 }
3475
3476 \f
3477 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3478        Sfind_operation_coding_system,  1, MANY, 0,
3479   "Choose a coding system for an operation based on the target name.\n\
3480 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3481 DECODING-SYSTEM is the coding system to use for decoding\n\
3482 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3483 for encoding (in case OPERATION does encoding).\n\
3484 \n\
3485 The first argument OPERATION specifies an I/O primitive:\n\
3486   For file I/O, `insert-file-contents' or `write-region'.\n\
3487   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3488   For network I/O, `open-network-stream'.\n\
3489 \n\
3490 The remaining arguments should be the same arguments that were passed\n\
3491 to the primitive.  Depending on which primitive, one of those arguments\n\
3492 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3493 whichever argument specifies the file name is TARGET.\n\
3494 \n\
3495 TARGET has a meaning which depends on OPERATION:\n\
3496   For file I/O, TARGET is a file name.\n\
3497   For process I/O, TARGET is a process name.\n\
3498   For network I/O, TARGET is a service name or a port number\n\
3499 \n\
3500 This function looks up what specified for TARGET in,\n\
3501 `file-coding-system-alist', `process-coding-system-alist',\n\
3502 or `network-coding-system-alist' depending on OPERATION.\n\
3503 They may specify a coding system, a cons of coding systems,\n\
3504 or a function symbol to call.\n\
3505 In the last case, we call the function with one argument,\n\
3506 which is a list of all the arguments given to this function.")
3507   (nargs, args)
3508      int nargs;
3509      Lisp_Object *args;
3510 {
3511   Lisp_Object operation, target_idx, target, val;
3512   register Lisp_Object chain;
3513
3514   if (nargs < 2)
3515     error ("Too few arguments");
3516   operation = args[0];
3517   if (!SYMBOLP (operation)
3518       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3519     error ("Invalid first arguement");
3520   if (nargs < 1 + XINT (target_idx))
3521     error ("Too few arguments for operation: %s",
3522            XSYMBOL (operation)->name->data);
3523   target = args[XINT (target_idx) + 1];
3524   if (!(STRINGP (target)
3525         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3526     error ("Invalid %dth argument", XINT (target_idx) + 1);
3527
3528   chain = ((EQ (operation, Qinsert_file_contents)
3529             || EQ (operation, Qwrite_region))
3530            ? Vfile_coding_system_alist
3531            : (EQ (operation, Qopen_network_stream)
3532               ? Vnetwork_coding_system_alist
3533               : Vprocess_coding_system_alist));
3534   if (NILP (chain))
3535     return Qnil;
3536
3537   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3538     {
3539       Lisp_Object elt = XCONS (chain)->car;
3540
3541       if (CONSP (elt)
3542           && ((STRINGP (target)
3543                && STRINGP (XCONS (elt)->car)
3544                && fast_string_match (XCONS (elt)->car, target) >= 0)
3545               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3546         {
3547           val = XCONS (elt)->cdr;
3548           if (CONSP (val))
3549             return val;
3550           if (! SYMBOLP (val))
3551             return Qnil;
3552           if (! NILP (Fcoding_system_p (val)))
3553             return Fcons (val, val);
3554           if (!NILP (Fboundp (val)))
3555             return call1 (val, Flist (nargs, args));
3556           return Qnil;
3557         }
3558     }
3559   return Qnil;
3560 }
3561
3562 #endif /* emacs */
3563
3564 \f
3565 /*** 8. Post-amble ***/
3566
3567 init_coding_once ()
3568 {
3569   int i;
3570
3571   /* Emacs' internal format specific initialize routine.  */
3572   for (i = 0; i <= 0x20; i++)
3573     emacs_code_class[i] = EMACS_control_code;
3574   emacs_code_class[0x0A] = EMACS_linefeed_code;
3575   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3576   for (i = 0x21 ; i < 0x7F; i++)
3577     emacs_code_class[i] = EMACS_ascii_code;
3578   emacs_code_class[0x7F] = EMACS_control_code;
3579   emacs_code_class[0x80] = EMACS_leading_code_composition;
3580   for (i = 0x81; i < 0xFF; i++)
3581     emacs_code_class[i] = EMACS_invalid_code;
3582   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3583   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3584   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3585   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3586
3587   /* ISO2022 specific initialize routine.  */
3588   for (i = 0; i < 0x20; i++)
3589     iso_code_class[i] = ISO_control_code;
3590   for (i = 0x21; i < 0x7F; i++)
3591     iso_code_class[i] = ISO_graphic_plane_0;
3592   for (i = 0x80; i < 0xA0; i++)
3593     iso_code_class[i] = ISO_control_code;
3594   for (i = 0xA1; i < 0xFF; i++)
3595     iso_code_class[i] = ISO_graphic_plane_1;
3596   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3597   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3598   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3599   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3600   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3601   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3602   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3603   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3604   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3605   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3606
3607   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3608   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3609
3610   setup_coding_system (Qnil, &keyboard_coding);
3611   setup_coding_system (Qnil, &terminal_coding);
3612
3613 #if defined (MSDOS) || defined (WINDOWSNT)
3614   system_eol_type = CODING_EOL_CRLF;
3615 #else
3616   system_eol_type = CODING_EOL_LF;
3617 #endif
3618 }
3619
3620 #ifdef emacs
3621
3622 syms_of_coding ()
3623 {
3624   Qtarget_idx = intern ("target-idx");
3625   staticpro (&Qtarget_idx);
3626
3627   /* Target FILENAME is the first argument.  */
3628   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3629   /* Target FILENAME is the third argument.  */
3630   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3631
3632   Qcall_process = intern ("call-process");
3633   staticpro (&Qcall_process);
3634   /* Target PROGRAM is the first argument.  */
3635   Fput (Qcall_process, Qtarget_idx, make_number (0));
3636
3637   Qcall_process_region = intern ("call-process-region");
3638   staticpro (&Qcall_process_region);
3639   /* Target PROGRAM is the third argument.  */
3640   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3641
3642   Qstart_process = intern ("start-process");
3643   staticpro (&Qstart_process);
3644   /* Target PROGRAM is the third argument.  */
3645   Fput (Qstart_process, Qtarget_idx, make_number (2));
3646
3647   Qopen_network_stream = intern ("open-network-stream");
3648   staticpro (&Qopen_network_stream);
3649   /* Target SERVICE is the fourth argument.  */
3650   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3651
3652   Qcoding_system = intern ("coding-system");
3653   staticpro (&Qcoding_system);
3654
3655   Qeol_type = intern ("eol-type");
3656   staticpro (&Qeol_type);
3657
3658   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3659   staticpro (&Qbuffer_file_coding_system);
3660
3661   Qpost_read_conversion = intern ("post-read-conversion");
3662   staticpro (&Qpost_read_conversion);
3663
3664   Qpre_write_conversion = intern ("pre-write-conversion");
3665   staticpro (&Qpre_write_conversion);
3666
3667   Qcoding_system_spec = intern ("coding-system-spec");
3668   staticpro (&Qcoding_system_spec);
3669
3670   Qcoding_system_p = intern ("coding-system-p");
3671   staticpro (&Qcoding_system_p);
3672
3673   Qcoding_system_error = intern ("coding-system-error");
3674   staticpro (&Qcoding_system_error);
3675
3676   Fput (Qcoding_system_error, Qerror_conditions,
3677         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3678   Fput (Qcoding_system_error, Qerror_message,
3679         build_string ("Invalid coding system"));
3680
3681   Qcoding_category_index = intern ("coding-category-index");
3682   staticpro (&Qcoding_category_index);
3683
3684   {
3685     int i;
3686     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3687       {
3688         coding_category_table[i] = intern (coding_category_name[i]);
3689         staticpro (&coding_category_table[i]);
3690         Fput (coding_category_table[i], Qcoding_category_index,
3691               make_number (i));
3692       }
3693   }
3694
3695   Qcharacter_unification_table = intern ("character-unification-table");
3696   staticpro (&Qcharacter_unification_table);
3697   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3698         make_number (0));
3699
3700   Qcharacter_unification_table_for_decode
3701     = intern ("character-unification-table-for-decode");
3702   staticpro (&Qcharacter_unification_table_for_decode);
3703
3704   Qcharacter_unification_table_for_encode
3705     = intern ("character-unification-table-for-encode");
3706   staticpro (&Qcharacter_unification_table_for_encode);
3707
3708   Qemacs_mule = intern ("emacs-mule");
3709   staticpro (&Qemacs_mule);
3710
3711   defsubr (&Scoding_system_spec);
3712   defsubr (&Scoding_system_p);
3713   defsubr (&Sread_coding_system);
3714   defsubr (&Sread_non_nil_coding_system);
3715   defsubr (&Scheck_coding_system);
3716   defsubr (&Sdetect_coding_region);
3717   defsubr (&Sdecode_coding_region);
3718   defsubr (&Sencode_coding_region);
3719   defsubr (&Sdecode_coding_string);
3720   defsubr (&Sencode_coding_string);
3721   defsubr (&Sdecode_sjis_char);
3722   defsubr (&Sencode_sjis_char);
3723   defsubr (&Sdecode_big5_char);
3724   defsubr (&Sencode_big5_char);
3725   defsubr (&Sset_terminal_coding_system_internal);
3726   defsubr (&Sterminal_coding_system);
3727   defsubr (&Sset_keyboard_coding_system_internal);
3728   defsubr (&Skeyboard_coding_system);
3729   defsubr (&Sfind_operation_coding_system);
3730
3731   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3732     "List of coding-categories (symbols) ordered by priority.");
3733   {
3734     int i;
3735
3736     Vcoding_category_list = Qnil;
3737     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3738       Vcoding_category_list
3739         = Fcons (coding_category_table[i], Vcoding_category_list);
3740   }
3741
3742   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3743     "A variable of internal use only.\n\
3744 If the value is a coding system, it is used for decoding on read operation.\n\
3745 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3746   Vcoding_system_for_read = Qnil;
3747
3748   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3749     "A variable of internal use only.\n\
3750 If the value is a coding system, it is used for encoding on write operation.\n\
3751 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3752   Vcoding_system_for_write = Qnil;
3753
3754   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3755     "Coding-system used in the latest file or process I/O.");
3756   Vlast_coding_system_used = Qnil;
3757
3758   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3759     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3760   inhibit_eol_conversion = 0;
3761
3762   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3763     "Alist to decide a coding system to use for a file I/O operation.\n\
3764 The format is ((PATTERN . VAL) ...),\n\
3765 where PATTERN is a regular expression matching a file name,\n\
3766 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3767 If VAL is a coding system, it is used for both decoding and encoding\n\
3768 the file contents.\n\
3769 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3770 and the cdr part is used for encoding.\n\
3771 If VAL is a function symbol, the function must return a coding system\n\
3772 or a cons of coding systems which are used as above.\n\
3773 \n\
3774 See also the function `find-operation-coding-system'.");
3775   Vfile_coding_system_alist = Qnil;
3776
3777   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3778     "Alist to decide a coding system to use for a process I/O operation.\n\
3779 The format is ((PATTERN . VAL) ...),\n\
3780 where PATTERN is a regular expression matching a program name,\n\
3781 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3782 If VAL is a coding system, it is used for both decoding what received\n\
3783 from the program and encoding what sent to the program.\n\
3784 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3785 and the cdr part is used for encoding.\n\
3786 If VAL is a function symbol, the function must return a coding system\n\
3787 or a cons of coding systems which are used as above.\n\
3788 \n\
3789 See also the function `find-operation-coding-system'.");
3790   Vprocess_coding_system_alist = Qnil;
3791
3792   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3793     "Alist to decide a coding system to use for a network I/O operation.\n\
3794 The format is ((PATTERN . VAL) ...),\n\
3795 where PATTERN is a regular expression matching a network service name\n\
3796 or is a port number to connect to,\n\
3797 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3798 If VAL is a coding system, it is used for both decoding what received\n\
3799 from the network stream and encoding what sent to the network stream.\n\
3800 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3801 and the cdr part is used for encoding.\n\
3802 If VAL is a function symbol, the function must return a coding system\n\
3803 or a cons of coding systems which are used as above.\n\
3804 \n\
3805 See also the function `find-operation-coding-system'.");
3806   Vnetwork_coding_system_alist = Qnil;
3807
3808   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3809     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3810   eol_mnemonic_unix = ':';
3811
3812   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3813     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3814   eol_mnemonic_dos = '\\';
3815
3816   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3817     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3818   eol_mnemonic_mac = '/';
3819
3820   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3821     "Mnemonic character indicating end-of-line format is not yet decided.");
3822   eol_mnemonic_undecided = ':';
3823
3824   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3825     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3826   Venable_character_unification = Qt;
3827
3828   DEFVAR_LISP ("standard-character-unification-table-for-decode",
3829     &Vstandard_character_unification_table_for_decode,
3830     "Table for unifying characters when reading.");
3831   Vstandard_character_unification_table_for_decode = Qnil;
3832
3833   DEFVAR_LISP ("standard-character-unification-table-for-encode",
3834     &Vstandard_character_unification_table_for_encode,
3835     "Table for unifying characters when writing.");
3836   Vstandard_character_unification_table_for_encode = Qnil;
3837
3838   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3839     "Alist of charsets vs revision numbers.\n\
3840 While encoding, if a charset (car part of an element) is found,\n\
3841 designate it with the escape sequence identifing revision (cdr part of the element).");
3842   Vcharset_revision_alist = Qnil;
3843
3844   DEFVAR_LISP ("default-process-coding-system",
3845                &Vdefault_process_coding_system,
3846     "Cons of coding systems used for process I/O by default.\n\
3847 The car part is used for decoding a process output,\n\
3848 the cdr part is used for encoding a text to be sent to a process.");
3849   Vdefault_process_coding_system = Qnil;
3850 }
3851
3852 #endif /* emacs */