src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Ver.1.0.
   3    Copyright (C) 1995 Free Software Foundation, Inc.
   4    Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   1. Preamble
  26   2. Emacs' internal format handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. End-of-line handlers
  30   6. C library functions
  31   7. Emacs Lisp library functions
  32   8. Post-amble
  33
  34 */
  35
  36 /*** GENERAL NOTE on CODING SYSTEM ***
  37
  38   Coding system is an encoding mechanism of one or more character
  39   sets.  Here's a list of coding systems which Emacs can handle.  When
  40   we say "decode", it means converting some other coding system to
  41   Emacs' internal format, and when we say "encode", it means
  42   converting Emacs' internal format to some other coding system.
  43
  44   0. Emacs' internal format
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in the section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and such coding
  53   systems used in Internet communication as ISO-2022-JP are all
  54   variants of ISO2022.  Details are described in the section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   the section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in the section 4.  In this file, when written as "BIG5"
  67   (all uppercase), it means the coding system, and when written as
  68   "Big5" (capitalized), it means the character set.
  69
  70   4. Else
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See the section 6 for more
  81   detail.
  82
  83 */
  84
  85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  86
  87   How end-of-line of a text is encoded depends on a system.  For
  88   instance, Unix's format is just one byte of `line-feed' code,
  89   whereas DOS's format is two bytes sequence of `carriage-return' and
  90   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  91
  92   Since how characters in a text are encoded and how end-of-line is
  93   encoded are independent, any coding system described above can take
  94   any format of end-of-line.  So, Emacs has information of format of
  95   end-of-line in each coding-system.  See the section 6 for more
  96   detail.
  97
  98 */
  99
 100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 101
 102   These functions check if a text between SRC and SRC_END is encoded
 103   in the coding system category XXX.  Each returns an integer value in
 104   which appropriate flag bits for the category XXX are set.  The flag
 105   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 106   template of these functions.  */
 107 #if 0
 108 int
 109 detect_coding_internal (src, src_end)
 110      unsigned char *src, *src_end;
 111 {
 112   ...
 113 }
 114 #endif
 115
 116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 117
 118   These functions decode SRC_BYTES length text at SOURCE encoded in
 119   CODING to Emacs' internal format.  The resulting text goes to a
 120   place pointed by DESTINATION, the length of which should not exceed
 121   DST_BYTES.  The number of bytes actually processed is returned as *CONSUMED.
 122   The return value is the length of the decoded text.  Below is a
 123   template of these functions.  */
 124 #if 0
 125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 126      struct coding_system *coding;
 127      unsigned char *source, *destination;
 128      int src_bytes, dst_bytes;
 129      int *consumed;
 130 {
 131   ...
 132 }
 133 #endif
 134
 135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 136
 137   These functions encode SRC_BYTES length text at SOURCE of Emacs
 138   internal format to CODING.  The resulting text goes to a place
 139   pointed by DESTINATION, the length of which should not exceed
 140   DST_BYTES.  The number of bytes actually processed is returned as *CONSUMED.
 141   The return value is the length of the encoded text.  Below is a
 142   template of these functions.  */
 143 #if 0
 144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148      int *consumed;
 149 {
 150   ...
 151 }
 152 #endif
 153
 154 /*** COMMONLY USED MACROS ***/
 155
 156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 157    THREE_MORE_BYTES safely get one, two, and three bytes from the
 158    source text respectively.  If there are not enough bytes in the
 159    source, they jump to `label_end_of_loop'.  The caller should set
 160    variables `src' and `src_end' to appropriate areas in advance.  */
 161
 /* Read the next source byte into C1 and advance `src' past it.  When
    `src' has already reached `src_end', nothing is read and control
    jumps to the caller's `label_end_of_loop'.  */
 #define ONE_MORE_BYTE(c1)               \
   do {                                  \
     if (src >= src_end)                 \
       goto label_end_of_loop;           \
     (c1) = *src++;                      \
   } while (0)
 169
 /* Read the next two source bytes into C1 and C2 (in that order) and
    advance `src' past them.  When fewer than two bytes remain before
    `src_end', nothing is consumed and control jumps to the caller's
    `label_end_of_loop'.

    The remaining length is compared instead of forming `src + 1',
    because computing a pointer beyond one past the end of the source
    buffer (which happens when `src' equals `src_end') is undefined
    behavior in C.  */
 #define TWO_MORE_BYTES(c1, c2)          \
   do {                                  \
     if (src_end - src < 2)              \
       goto label_end_of_loop;           \
     (c1) = *src++;                      \
     (c2) = *src++;                      \
   } while (0)
 177
 /* Read the next three source bytes into C1, C2 and C3 (in that order)
    and advance `src' past them.  When fewer than three bytes remain
    before `src_end', nothing is consumed and control jumps to the
    caller's `label_end_of_loop'.

    The remaining length is compared instead of forming `src + 2',
    because computing a pointer beyond one past the end of the source
    buffer is undefined behavior in C.  */
 #define THREE_MORE_BYTES(c1, c2, c3)            \
   do {                                          \
     if (src_end - src < 3)                      \
       goto label_end_of_loop;                   \
     (c1) = *src++;                              \
     (c2) = *src++;                              \
     (c3) = *src++;                              \
   } while (0)
 185
 186 /* The following three macros DECODE_CHARACTER_ASCII,
 187    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 188    the multi-byte form of a character of each class at the place
 189    pointed by `dst'.  The caller should set the variable `dst' to
 190    point to an appropriate area and the variable `coding' to point to
 191    the coding-system of the currently decoding text in advance.  */
 192
 /* Store the ASCII character C at `dst' and advance `dst'.  Outside a
    composition the byte is stored as is.  While composing (as reported
    by COMPOSING_P for `coding->composing') it is stored as the two
    bytes 0xA0 and C with the 0x80 bit set.  */

 #define DECODE_CHARACTER_ASCII(c)                               \
   do {                                                          \
     if (!COMPOSING_P (coding->composing))                       \
       *dst++ = (c);                                             \
     else                                                        \
       {                                                         \
         *dst++ = 0xA0;                                          \
         *dst++ = (c) | 0x80;                                    \
       }                                                         \
   } while (0)
 202
 /* Store at `dst' the multi-byte form of the DIMENSION1 character whose
    charset is CHARSET and whose position-code is C, advancing `dst'.
    The base leading-code comes first (raised by 0x20 while composing),
    then the extended leading-code if CHARSET has a nonzero one, then C
    with the 0x80 bit set.  */

 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
   do {                                                                  \
     unsigned char base_code = CHARSET_LEADING_CODE_BASE (charset);      \
     unsigned char ext_code;                                             \
     *dst++ = (COMPOSING_P (coding->composing)                           \
               ? base_code + 0x20                                        \
               : base_code);                                             \
     ext_code = CHARSET_LEADING_CODE_EXT (charset);                      \
     if (ext_code != 0)                                                  \
       *dst++ = ext_code;                                                \
     *dst++ = (c) | 0x80;                                                \
   } while (0)
 217
 /* Store at `dst' the multi-byte form of the DIMENSION2 character whose
    charset is CHARSET and whose position-codes are C1 and C2.  The
    leading code(s) and C1 are emitted exactly as for a DIMENSION1
    character; C2 with the 0x80 bit set is appended after them.  */

 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
   do {                                                  \
     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
     *dst = (c2) | 0x80;                                 \
     dst++;                                              \
   } while (0)
 226
 227 \f
 228 /*** 1. Preamble ***/
 229
 230 #include <stdio.h>
 231
 232 #ifdef emacs
 233
 234 #include <config.h>
 235 #include "lisp.h"
 236 #include "buffer.h"
 237 #include "charset.h"
 238 #include "ccl.h"
 239 #include "coding.h"
 240 #include "window.h"
 241
 242 #else  /* not emacs */
 243
 244 #include "mulelib.h"
 245
 246 #endif /* not emacs */
 247
 248 Lisp_Object Qcoding_system, Qeol_type;
 249 Lisp_Object Qbuffer_file_coding_system;
 250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 251
 252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 254 Lisp_Object Qstart_process, Qopen_network_stream;
 255 Lisp_Object Qtarget_idx;
 256
 257 /* Mnemonic character of each format of end-of-line.  */
 258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 259 /* Mnemonic character to indicate format of end-of-line is not yet
 260    decided.  */
 261 int eol_mnemonic_undecided;
 262
 #ifdef emacs

 Lisp_Object Qcoding_system_vector;
 Lisp_Object Qcoding_system_p;
 Lisp_Object Qcoding_system_error;

 /* The next three variables hand coding-systems back and forth between
    Emacs Lisp programs and the C internal routines.  */

 /* Coding-system for reading files and receiving data from process.  */
 Lisp_Object Vcoding_system_for_read;

 /* Coding-system for writing files and sending data to process.  */
 Lisp_Object Vcoding_system_for_write;

 /* Coding-system actually used in the latest I/O.  */
 Lisp_Object Vlast_coding_system_used;

 /* Coding-system that the terminal accepts for displaying.  */
 struct coding_system terminal_coding;

 /* Coding-system of what is sent from the terminal keyboard.  */
 struct coding_system keyboard_coding;

 Lisp_Object Vcoding_system_alist;

 #endif /* emacs */
 285
 /* NOTE(review): presumably a symbol related to coding categories; its
    initialization and use are not in this part of the file --
    confirm.  */
 Lisp_Object Qcoding_category_index;

 /* List of symbols `coding-category-xxx' ordered by priority.  */
 Lisp_Object Vcoding_category_list;

 /* Table of coding-systems currently assigned to each coding-category.
    It has CODING_CATEGORY_IDX_MAX entries.  */
 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];

 /* Table of the symbol names of the coding-categories.  Eight names
    are listed here.
    NOTE(review): the order presumably has to agree with the category
    index values that size this array (CODING_CATEGORY_IDX_MAX is
    defined outside this file) -- confirm when adding a category.  */
 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
   "coding-category-internal",
   "coding-category-sjis",
   "coding-category-iso-7",
   "coding-category-iso-8-1",
   "coding-category-iso-8-2",
   "coding-category-iso-else",
   "coding-category-big5",
   "coding-category-binary"
 };
 305
 /* Flag to tell if we look up unification table on character code
    conversion.  decode_coding_iso2022 () falls back to the standard
    table for reading only when this is non-nil and the coding system
    supplies no table of its own.  */
 Lisp_Object Venable_character_unification;
 /* Standard unification table to look up on reading (decoding).  */
 Lisp_Object Vstandard_character_unification_table_for_read;
 /* Standard unification table to look up on writing (encoding).  */
 Lisp_Object Vstandard_character_unification_table_for_write;

 /* NOTE(review): presumably the symbol `character-unification-table';
    its initialization is not in this part of the file -- confirm.  */
 Lisp_Object Qcharacter_unification_table;

 /* Alist of charsets vs revision number.  */
 Lisp_Object Vcharset_revision_alist;
 318
 319 \f
 320 /*** 2. Emacs internal format handlers ***/
 321
 322 /* Emacs' internal format for encoding multiple character sets is a
 323    kind of multi-byte encoding, i.e. encoding a character by a sequence
 324    of one-byte codes of variable length.  ASCII characters and control
 325    characters (e.g. `tab', `newline') are represented by one-byte as
 326    is.  It takes the range 0x00 through 0x7F.  The other characters
 327    are represented by a sequence of `base leading-code', optional
 328    `extended leading-code', and one or two `position-code's.  Length
 329    of the sequence is decided by the base leading-code.  Leading-code
 330    takes the range 0x80 through 0x9F, whereas extended leading-code
 331    and position-code take the range 0xA0 through 0xFF.  See the
 332    document of `charset.h' for more detail about leading-code and
 333    position-code.
 334
 335    There's one exception in this rule.  Special leading-code
 336    `leading-code-composition' denotes that the following several
 337    characters should be composed into one character.  Leading-codes of
 338    components (except for ASCII) are added 0x20.  An ASCII character
 339    component is represented by a 2-byte sequence of `0xA0' and
 340    `ASCII-code + 0x80'.  See also the document in `charset.h' for the
 341    detail of composite character.  Hence, we can summarize the code
 342    range as follows:
 343
 344    --- CODE RANGE of Emacs' internal format ---
 345    (character set)      (range)
 346    ASCII                0x00 .. 0x7F
 347    ELSE (1st byte)      0x80 .. 0x9F
 348         (rest bytes)    0xA0 .. 0xFF
 349    ---------------------------------------------
 350
 351   */
 352
 /* Table with one entry for each possible byte value (256 in all),
    giving the class of that byte in Emacs' internal format.
    detect_coding_internal () indexes it with the byte being examined.
    NOTE(review): the table is not filled in within this part of the
    file -- confirm where it is initialized.  */
 enum emacs_code_class_type emacs_code_class[256];
 354
 /* Consume one byte at `src' and continue with the next statement only
    if that byte is 0xA0 or greater.  If the byte is below 0xA0, make
    the enclosing function return 0.  If `src' has already reached
    `src_end', consume nothing and jump to `label_end_of_switch'.  */
 #define CHECK_CODE_RANGE_A0_FF  \
   do {                          \
     if (src < src_end)          \
       {                         \
         if (*src++ < 0xA0)      \
           return 0;             \
       }                         \
     else                        \
       goto label_end_of_switch; \
   } while (0)
 364
 365 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 366    Check if a text is encoded in Emacs' internal format.  If it is,
 367    return CODING_CATEGORY_MASK_INTERNAL, else return 0.  */
 368
 369 int
 370 detect_coding_internal (src, src_end)
 371      unsigned char *src, *src_end;
 372 {
 373   unsigned char c;
 374   int composing = 0;
 375
 376   while (src < src_end)
 377     {
 378       c = *src++;
 379
 380       if (composing)
 381         {
 382           if (c < 0xA0)
 383             composing = 0;
 384           else
 385             c -= 0x20;
 386         }
 387
 388       switch (emacs_code_class[c])
 389         {
 390         case EMACS_ascii_code:
 391         case EMACS_linefeed_code:
 392           break;
 393
 394         case EMACS_control_code:
 395           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 396             return 0;
 397           break;
 398
 399         case EMACS_invalid_code:
 400           return 0;
 401
 402         case EMACS_leading_code_composition: /* c == 0x80 */
 403           if (composing)
 404             CHECK_CODE_RANGE_A0_FF;
 405           else
 406             composing = 1;
 407           break;
 408
 409         case EMACS_leading_code_4:
 410           CHECK_CODE_RANGE_A0_FF;
 411           /* fall down to check it two more times ...  */
 412
 413         case EMACS_leading_code_3:
 414           CHECK_CODE_RANGE_A0_FF;
 415           /* fall down to check it one more time ...  */
 416
 417         case EMACS_leading_code_2:
 418           CHECK_CODE_RANGE_A0_FF;
 419           break;
 420
 421         default:
 422         label_end_of_switch:
 423           break;
 424         }
 425     }
 426   return CODING_CATEGORY_MASK_INTERNAL;
 427 }
 428
 429 \f
 430 /*** 3. ISO2022 handlers ***/
 431
 432 /* The following note describes the coding system ISO2022 briefly.
 433    Since the intention of this note is to help understanding of the
 434    programs in this file, some parts are NOT ACCURATE or OVERLY
 435    SIMPLIFIED.  For the thorough understanding, please refer to the
 436    original document of ISO2022.
 437
 438    ISO2022 provides many mechanisms to encode several character sets
 439    in 7-bit and 8-bit environments.  If one chooses a 7-bit environment,
 440    all text is encoded by codes of less than 128.  This may make the
 441    encoded text a little bit longer, but the text gets more stability
 442    to pass through several gateways (some of them split MSB off).
 443
 444    There are two kinds of character sets: control character sets and
 445    graphic character sets.  The former contain control characters such
 446    as `newline' and `escape' to provide control functions (control
 447    functions are provided also by escape sequences).  The latter
 448    contain graphic characters such as 'A' and '-'.  Emacs recognizes
 449    two control character sets and many graphic character sets.
 450
 451    Graphic character sets are classified into one of the following
 452    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 453    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 454    bytes (DIMENSION) and the number of characters in one dimension
 455    (CHARS) of the set.  In addition, each character set is assigned an
 456    identification tag (called "final character" and denoted as <F>
 457    here after) which is unique in each class.  <F> of each character
 458    set is decided by ECMA(*) when it is registered in ISO.  Code range
 459    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 460
 461    Note (*): ECMA = European Computer Manufacturers Association
 462
 463    Here are examples of graphic character set [NAME(<F>)]:
 464         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 465         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 466         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 467         o DIMENSION2_CHARS96 -- none for the moment
 468
 469    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 470         C0 [0x00..0x1F] -- control character plane 0
 471         GL [0x20..0x7F] -- graphic character plane 0
 472         C1 [0x80..0x9F] -- control character plane 1
 473         GR [0xA0..0xFF] -- graphic character plane 1
 474
 475    A control character set is directly designated and invoked to C0 or
 476    C1 by an escape sequence.  The most common case is that ISO646's
 477    control character set is designated/invoked to C0 and ISO6429's
 478    control character set is designated/invoked to C1, and usually
 479    these designations/invocations are omitted in a coded text.  With
 480    7-bit environment, only C0 can be used, and a control character for
 481    C1 is encoded by an appropriate escape sequence to fit in the
 482    environment.  All control characters for C1 have corresponding
 483    escape sequences defined.
 484
 485    A graphic character set is at first designated to one of four
 486    graphic registers (G0 through G3), then these graphic registers are
 487    invoked to GL or GR.  These designations and invocations can be
 488    done independently.  The most common case is that G0 is invoked to
 489    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 490    these invocations and designations are omitted in a coded text.
 491    With 7-bit environment, only GL can be used.
 492
 493    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 494    and 0x7F of GL area work as control characters SPACE and DEL
 495    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 496
 497    There are two ways of invocation: locking-shift and single-shift.
 498    With locking-shift, the invocation lasts until the next different
 499    invocation, whereas with single-shift, the invocation works only
 500    for the following character and doesn't affect locking-shift.
 501    Invocations are done by the following control characters or escape
 502    sequences.
 503
 504    ----------------------------------------------------------------------
 505    function             control char    escape sequence description
 506    ----------------------------------------------------------------------
 507    SI  (shift-in)               0x0F    none            invoke G0 to GL
 508    SO  (shift-out)              0x0E    none            invoke G1 to GL
 509    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 510    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 511    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 512    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 513    ----------------------------------------------------------------------
 514    The first four are for locking-shift.  Control characters for these
 515    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 516
 517    Designations are done by the following escape sequences.
 518    ----------------------------------------------------------------------
 519    escape sequence      description
 520    ----------------------------------------------------------------------
 521    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 522    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 523    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 524    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 525    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 526    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 527    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 528    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 529    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 530    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 531    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 532    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 533    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 534    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 535    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 536    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 537    ----------------------------------------------------------------------
 538
 539    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 540    of dimension 1, chars 94, and final character <F>, and etc.
 541
 542    Note (*): Although these designations are not allowed in ISO2022,
 543    Emacs accepts them on decoding, and produces them on encoding
 544    CHARS96 character set in a coding system which is characterized as
 545    7-bit environment, non-locking-shift, and non-single-shift.
 546
 547    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 548    '(' can be omitted.  We call this as "short-form" here after.
 549
 550    Now you may notice that there are a lot of ways for encoding the
 551    same multilingual text in ISO2022.  Actually, there exist many
 552    coding systems such as Compound Text (used in X's inter client
 553    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 554    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 555    localized platforms), and all of these are variants of ISO2022.
 556
 557    In addition to the above, Emacs handles two more kinds of escape
 558    sequences: ISO6429's direction specification and Emacs' private
 559    sequence for specifying character composition.
 560
 561    ISO6429's direction specification takes the following format:
 562         o CSI ']'      -- end of the current direction
 563         o CSI '0' ']'  -- end of the current direction
 564         o CSI '1' ']'  -- start of left-to-right text
 565         o CSI '2' ']'  -- start of right-to-left text
 566    The control character CSI (0x9B: control sequence introducer) is
 567    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 568
 569    Character composition specification takes the following format:
 570         o ESC '0' -- start character composition
 571         o ESC '1' -- end character composition
 572    Since these are not standard escape sequences of any ISO, the use
 573    of them for these meaning is restricted to Emacs only.  */
 574
 /* Table with one entry for each possible byte value (256 in all),
    giving the class of that byte in ISO2022 text.
    decode_coding_iso2022 () indexes it with each byte it reads.
    NOTE(review): the table is not filled in within this part of the
    file -- confirm where it is initialized.  */
 enum iso_code_class_type iso_code_class[256];
 576
 577 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 578    Check if a text is encoded in ISO2022.  If it is, returns an
 579    integer in which appropriate flag bits any of:
 580         CODING_CATEGORY_MASK_ISO_7
 581         CODING_CATEGORY_MASK_ISO_8_1
 582         CODING_CATEGORY_MASK_ISO_8_2
 583         CODING_CATEGORY_MASK_ISO_ELSE
 584    are set.  If a code which should never appear in ISO2022 is found,
 585    returns 0.  */
 586
 587 int
 588 detect_coding_iso2022 (src, src_end)
 589      unsigned char *src, *src_end;
 590 {
 591   int mask = CODING_CATEGORY_MASK_ANY;
 592   int g1 = 0;                   /* 1 iff designating to G1.  */
 593   int c, i;
 594
 595   while (src < src_end)
 596     {
 597       c = *src++;
 598       switch (c)
 599         {
 600         case ISO_CODE_ESC:
 601           if (src >= src_end)
 602             break;
 603           c = *src++;
 604           if (src < src_end
 605               && ((c >= '(' && c <= '/')
 606                   || c == '$' && ((*src >= '(' && *src <= '/')
 607                                   || (*src >= '@' && *src <= 'B'))))
 608             {
 609               /* Valid designation sequence.  */
 610               mask &= (CODING_CATEGORY_MASK_ISO_7
 611                        | CODING_CATEGORY_MASK_ISO_8_1
 612                        | CODING_CATEGORY_MASK_ISO_8_2
 613                        | CODING_CATEGORY_MASK_ISO_ELSE);
 614               if (c == ')' || (c == '$' && *src == ')'))
 615                 {
 616                   g1 = 1;
 617                   mask &= ~CODING_CATEGORY_MASK_ISO_7;
 618                 }
 619               src++;
 620               break;
 621             }
 622           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 623             return CODING_CATEGORY_MASK_ISO_ELSE;
 624           break;
 625
 626         case ISO_CODE_SO:
 627           if (g1)
 628             return CODING_CATEGORY_MASK_ISO_ELSE;
 629           break;
 630
 631         case ISO_CODE_CSI:
 632         case ISO_CODE_SS2:
 633         case ISO_CODE_SS3:
 634           mask &= ~CODING_CATEGORY_MASK_ISO_7;
 635           break;
 636
 637         default:
 638           if (c < 0x80)
 639             break;
 640           else if (c < 0xA0)
 641             return 0;
 642           else
 643             {
 644               int count = 1;
 645
 646               mask &= ~CODING_CATEGORY_MASK_ISO_7;
 647               while (src < src_end && *src >= 0xA0)
 648                 count++, src++;
 649               if (count & 1 && src < src_end)
 650                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 651             }
 652           break;
 653         }
 654     }
 655
 656   return mask;
 657 }
 658
 /* Decode a character whose charset is CHARSET and whose 1st position
    code is C1, storing its multi-byte form at `dst'.  If the dimension
    of CHARSET is 2, the 2nd position code is fetched from `src' into
    the caller's variable `c2' (jumping to `label_end_of_loop' when the
    source is exhausted).  A negative CHARSET means that we are
    decoding ill formed text, and C1 is then emitted as is.

    When the caller's `unification_table' is non-nil and unify_char ()
    returns a non-negative character for the code, that character's
    charset and position codes are used instead.  At the head of a
    composition, LEADING_CODE_COMPOSITION (followed by 0xFF when the
    composition carries rules) is emitted first and `coding->composing'
    is advanced by 2.  */

 #define DECODE_ISO_CHARACTER(charset, c1)                               \
   do {                                                                  \
     int unified_char, decoded_charset = (charset);                      \
     if (COMPOSING_HEAD_P (coding->composing))                           \
       {                                                                 \
         *dst++ = LEADING_CODE_COMPOSITION;                              \
         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
           /* To tell composition rules are embedded.  */                \
           *dst++ = 0xFF;                                                \
         coding->composing += 2;                                         \
       }                                                                 \
     if ((charset) >= 0)                                                 \
       {                                                                 \
         if (CHARSET_DIMENSION (charset) == 2)                           \
           ONE_MORE_BYTE (c2);                                           \
         if (!NILP (unification_table))                                  \
           {                                                             \
             unified_char = unify_char (unification_table,               \
                                        -1, (charset), c1, c2);          \
             if (unified_char >= 0)                                      \
               SPLIT_CHAR (unified_char, decoded_charset, c1, c2);       \
           }                                                             \
       }                                                                 \
     if (decoded_charset < 0 || decoded_charset == CHARSET_ASCII)        \
       DECODE_CHARACTER_ASCII (c1);                                      \
     else if (CHARSET_DIMENSION (decoded_charset) == 1)                  \
       DECODE_CHARACTER_DIMENSION1 (decoded_charset, c1);                \
     else                                                                \
       DECODE_CHARACTER_DIMENSION2 (decoded_charset, c1, c2);            \
     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
       /* To tell a composition rule follows.  */                        \
       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
   } while (0)
 695
 /* Set designation state into CODING: record that graphic register REG
    designates the character set that ISO_CHARSET_TABLE gives for
    DIMENSION, CHARS and FINAL_CHAR.  If that lookup yields a negative
    value, CODING is left unchanged.  When `coding->direction' is 1 and
    CHARSET_REVERSE_CHARSET gives a non-negative charset for the one
    found, that charset is recorded instead.  */
 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)             \
   do {                                                                    \
     int designated = ISO_CHARSET_TABLE (dimension, chars, final_char);    \
     if (designated >= 0)                                                  \
       {                                                                   \
         if (coding->direction == 1)                                       \
           {                                                               \
             if (CHARSET_REVERSE_CHARSET (designated) >= 0)                \
               designated = CHARSET_REVERSE_CHARSET (designated);          \
           }                                                               \
         CODING_SPEC_ISO_DESIGNATION (coding, reg) = designated;           \
       }                                                                   \
   } while (0)
 708
 709 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 710
 711 int
 712 decode_coding_iso2022 (coding, source, destination,
 713                        src_bytes, dst_bytes, consumed)
 714      struct coding_system *coding;
 715      unsigned char *source, *destination;
 716      int src_bytes, dst_bytes;
 717      int *consumed;
 718 {
 719   unsigned char *src = source;
 720   unsigned char *src_end = source + src_bytes;
 721   unsigned char *dst = destination;
 722   unsigned char *dst_end = destination + dst_bytes;
 723   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 724      from DST_END to assure that overflow checking is necessary only
 725      at the head of loop.  */
 726   unsigned char *adjusted_dst_end = dst_end - 6;
 727   int charset;
 728   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 729   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 730   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 731   Lisp_Object unification_table = coding->character_unification_table;
 732
 733   if (!NILP (Venable_character_unification) && NILP (unification_table))
 734     unification_table = Vstandard_character_unification_table_for_read;
 735
 736   while (src < src_end && dst < adjusted_dst_end)
 737     {
 738       /* SRC_BASE remembers the start position in source in each loop.
 739          The loop will be exited when there's not enough source text
 740          to analyze long escape sequence or 2-byte code (within macros
 741          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 742          to SRC_BASE before exiting.  */
 743       unsigned char *src_base = src;
 744       int c1 = *src++, c2;
 745
 746       switch (iso_code_class [c1])
 747         {
 748         case ISO_0x20_or_0x7F:
 749           if (!coding->composing
 750               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 751             {
 752               /* This is SPACE or DEL.  */
 753               *dst++ = c1;
 754               break;
 755             }
 756           /* This is a graphic character, we fall down ...  */
 757
 758         case ISO_graphic_plane_0:
 759           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 760             {
 761               /* This is a composition rule.  */
 762               *dst++ = c1 | 0x80;
 763               coding->composing = COMPOSING_WITH_RULE_TAIL;
 764             }
 765           else
 766             DECODE_ISO_CHARACTER (charset0, c1);
 767           break;
 768
 769         case ISO_0xA0_or_0xFF:
 770           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 771             {
 772               /* Invalid code.  */
 773               *dst++ = c1;
 774               break;
 775             }
 776           /* This is a graphic character, we fall down ... */
 777
 778         case ISO_graphic_plane_1:
 779           DECODE_ISO_CHARACTER (charset1, c1);
 780           break;
 781
 782         case ISO_control_code:
 783           /* All ISO2022 control characters in this class have the
 784              same representation in Emacs internal format.  */
 785           *dst++ = c1;
 786           break;
 787
 788         case ISO_carriage_return:
 789           if (coding->eol_type == CODING_EOL_CR)
 790             {
 791               *dst++ = '\n';
 792             }
 793           else if (coding->eol_type == CODING_EOL_CRLF)
 794             {
 795               ONE_MORE_BYTE (c1);
 796               if (c1 == ISO_CODE_LF)
 797                 *dst++ = '\n';
 798               else
 799                 {
 800                   src--;
 801                   *dst++ = c1;
 802                 }
 803             }
 804           else
 805             {
 806               *dst++ = c1;
 807             }
 808           break;
 809
 810         case ISO_shift_out:
 811           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 812             goto label_invalid_escape_sequence;
 813           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 814           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 815           break;
 816
 817         case ISO_shift_in:
 818           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 819           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 820           break;
 821
 822         case ISO_single_shift_2_7:
 823         case ISO_single_shift_2:
 824           /* SS2 is handled as an escape sequence of ESC 'N' */
 825           c1 = 'N';
 826           goto label_escape_sequence;
 827
 828         case ISO_single_shift_3:
 829           /* SS2 is handled as an escape sequence of ESC 'O' */
 830           c1 = 'O';
 831           goto label_escape_sequence;
 832
 833         case ISO_control_sequence_introducer:
 834           /* CSI is handled as an escape sequence of ESC '[' ...  */
 835           c1 = '[';
 836           goto label_escape_sequence;
 837
 838         case ISO_escape:
 839           ONE_MORE_BYTE (c1);
 840         label_escape_sequence:
 841           /* Escape sequences handled by Emacs are invocation,
 842              designation, direction specification, and character
 843              composition specification.  */
 844           switch (c1)
 845             {
 846             case '&':           /* revision of following character set */
 847               ONE_MORE_BYTE (c1);
 848               if (!(c1 >= '@' && c1 <= '~'))
 849                 goto label_invalid_escape_sequence;
 850               ONE_MORE_BYTE (c1);
 851               if (c1 != ISO_CODE_ESC)
 852                 goto label_invalid_escape_sequence;
 853               ONE_MORE_BYTE (c1);
 854               goto label_escape_sequence;
 855
 856             case '$':           /* designation of 2-byte character set */
 857               ONE_MORE_BYTE (c1);
 858               if (c1 >= '@' && c1 <= 'B')
 859                 {       /* designation of JISX0208.1978, GB2312.1980,
 860                                    or JISX0208.1980 */
 861                   DECODE_DESIGNATION (0, 2, 94, c1);
 862                 }
 863               else if (c1 >= 0x28 && c1 <= 0x2B)
 864                 {       /* designation of DIMENSION2_CHARS94 character set */
 865                   ONE_MORE_BYTE (c2);
 866                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 867                 }
 868               else if (c1 >= 0x2C && c1 <= 0x2F)
 869                 {       /* designation of DIMENSION2_CHARS96 character set */
 870                   ONE_MORE_BYTE (c2);
 871                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 872                 }
 873               else
 874                 goto label_invalid_escape_sequence;
 875               break;
 876
 877             case 'n':           /* invocation of locking-shift-2 */
 878               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 879                 goto label_invalid_escape_sequence;
 880               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 881               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 882               break;
 883
 884             case 'o':           /* invocation of locking-shift-3 */
 885               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 886                 goto label_invalid_escape_sequence;
 887               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 888               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 889               break;
 890
 891             case 'N':           /* invocation of single-shift-2 */
 892               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 893                 goto label_invalid_escape_sequence;
 894               ONE_MORE_BYTE (c1);
 895               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 896               DECODE_ISO_CHARACTER (charset, c1);
 897               break;
 898
 899             case 'O':           /* invocation of single-shift-3 */
 900               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 901                 goto label_invalid_escape_sequence;
 902               ONE_MORE_BYTE (c1);
 903               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 904               DECODE_ISO_CHARACTER (charset, c1);
 905               break;
 906
 907             case '0':           /* start composing without embeded rules */
 908               coding->composing = COMPOSING_NO_RULE_HEAD;
 909               break;
 910
 911             case '1':           /* end composing */
 912               coding->composing = COMPOSING_NO;
 913               break;
 914
 915             case '2':           /* start composing with embeded rules */
 916               coding->composing = COMPOSING_WITH_RULE_HEAD;
 917               break;
 918
 919             case '[':           /* specification of direction */
 920               /* For the moment, nested direction is not supported.
 921                  So, the value of `coding->direction' is 0 or 1: 0
 922                  means left-to-right, 1 means right-to-left.  */
 923               ONE_MORE_BYTE (c1);
 924               switch (c1)
 925                 {
 926                 case ']':       /* end of the current direction */
 927                   coding->direction = 0;
 928
 929                 case '0':       /* end of the current direction */
 930                 case '1':       /* start of left-to-right direction */
 931                   ONE_MORE_BYTE (c1);
 932                   if (c1 == ']')
 933                     coding->direction = 0;
 934                   else
 935                     goto label_invalid_escape_sequence;
 936                   break;
 937
 938                 case '2':       /* start of right-to-left direction */
 939                   ONE_MORE_BYTE (c1);
 940                   if (c1 == ']')
 941                     coding->direction= 1;
 942                   else
 943                     goto label_invalid_escape_sequence;
 944                   break;
 945
 946                 default:
 947                   goto label_invalid_escape_sequence;
 948                 }
 949               break;
 950
 951             default:
 952               if (c1 >= 0x28 && c1 <= 0x2B)
 953                 {       /* designation of DIMENSION1_CHARS94 character set */
 954                   ONE_MORE_BYTE (c2);
 955                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 956                 }
 957               else if (c1 >= 0x2C && c1 <= 0x2F)
 958                 {       /* designation of DIMENSION1_CHARS96 character set */
 959                   ONE_MORE_BYTE (c2);
 960                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 961                 }
 962               else
 963                 {
 964                   goto label_invalid_escape_sequence;
 965                 }
 966             }
 967           /* We must update these variables now.  */
 968           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 969           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 970           break;
 971
 972         label_invalid_escape_sequence:
 973           {
 974             int length = src - src_base;
 975
 976             bcopy (src_base, dst, length);
 977             dst += length;
 978           }
 979         }
 980       continue;
 981
 982     label_end_of_loop:
 983       coding->carryover_size = src - src_base;
 984       bcopy (src_base, coding->carryover, coding->carryover_size);
 985       src = src_base;
 986       break;
 987     }
 988
 989   /* If this is the last block of the text to be decoded, we had
 990      better just flush out all remaining codes in the text although
 991      they are not valid characters.  */
 992   if (coding->last_block)
 993     {
 994       bcopy (src, dst, src_end - src);
 995       dst += (src_end - src);
 996       src = src_end;
 997     }
 998   *consumed = src - source;
 999   return dst - destination;
1000 }
1001
 1002 /* ISO2022 encoding stuff.  */
1003
1004 /*
1005    It is not enough to say just "ISO2022" on encoding, but we have to
1006    specify more details.  In Emacs, each coding-system of ISO2022
1007    variant has the following specifications:
1008         1. Initial designation to G0 thru G3.
1009         2. Allows short-form designation?
1010         3. ASCII should be designated to G0 before control characters?
1011         4. ASCII should be designated to G0 at end of line?
1012         5. 7-bit environment or 8-bit environment?
1013         6. Use locking-shift?
1014         7. Use Single-shift?
1015    And the following two are only for Japanese:
1016         8. Use ASCII in place of JIS0201-1976-Roman?
1017         9. Use JISX0208-1983 in place of JISX0208-1978?
1018    These specifications are encoded in `coding->flags' as flag bits
1019    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1020    detail.
1021 */
1022
/* Emit at DST the escape sequence designating CHARSET to graphic
   register REG (0..3), then record that designation in CODING.

   If Vcharset_revision_alist has an entry for CHARSET, the sequence
   is preceded by ESC '&' <revision + '@'>.  The designation itself is
   ESC, then '$' for a charset whose dimension is not 1, then an
   intermediate character chosen by REG and by whether the charset has
   94 characters, then the final character.  The intermediate is left
   out (short form) only for a multi-byte 94-character set designated
   to register 0 whose final character is '@', 'A', or 'B', and only
   when CODING has CODING_FLAG_ISO_SHORT_FORM set.

   Uses the variable `dst' of the expansion site.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final_ = CHARSET_ISO_FINAL_CHAR (charset);      \
    Lisp_Object desig_revision_                                         \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int desig_multibyte_ = CHARSET_DIMENSION (charset) != 1;            \
    int desig_chars94_ = CHARSET_CHARS (charset) == 94;                 \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *desig_intermediates_                                    \
      = desig_chars94_ ? "()*+" : ",-./";                               \
    int desig_short_form_                                               \
      = (desig_multibyte_ && desig_chars94_                             \
         && (coding->flags & CODING_FLAG_ISO_SHORT_FORM)                \
         && (reg) == 0                                                  \
         && desig_final_ >= '@' && desig_final_ <= 'B');                \
    if (! NILP (desig_revision_))                                       \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (desig_revision_)->cdr) + '@';             \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (desig_multibyte_)                                               \
      *dst++ = '$';                                                     \
    if (! desig_short_form_)                                            \
      *dst++ = (unsigned char) (desig_intermediates_[reg]);             \
    *dst++ = desig_final_;                                              \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1065
/* Single-shift functions.  Each of the two macros below emits its
   single-shift code at DST -- the escape sequence ESC 'N' (SS2) or
   ESC 'O' (SS3) when CODING has CODING_FLAG_ISO_SEVEN_BITS set, the
   control character ISO_CODE_SS2 or ISO_CODE_SS3 otherwise -- and
   then marks CODING as single-shifting.  They use the variables
   `coding' and `dst' of the expansion site.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) == 0)      \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) == 0)      \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1087
/* Locking-shift functions.  Each of the four macros below emits one
   locking-shift code at DST and records in CODING which graphic
   register is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'       register 2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'       register 3

   They use the variables `coding' and `dst' of the expansion site.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'n';                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'o';                               \
  } while (0)
1115
/* Emit at DST the single byte encoding position-code C1 of the
   DIMENSION1 character set CHARSET.

   When a single-shift is pending the byte is written with the high
   bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise) and
   the pending state is cleared.  Otherwise the byte goes out with the
   high bit cleared if CHARSET is invoked to graphic plane 0, or set
   if it is invoked to graphic plane 1.  If CHARSET is invoked to
   neither plane, encode_invocation_designation is called to produce
   the necessary designation/invocation codes and the checks are
   repeated.

   Uses the variables `coding' and `dst' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is on neither graphic plane yet: designate and/or        \
       invoke it, then go round again to emit the character.  */        \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1149
/* Emit at DST the two bytes encoding position-codes C1 and C2 of the
   DIMENSION2 character set CHARSET.

   When a single-shift is pending both bytes are written with the high
   bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise) and
   the pending state is cleared.  Otherwise both bytes go out with the
   high bit cleared if CHARSET is invoked to graphic plane 0, or set
   if it is invoked to graphic plane 1.  If CHARSET is invoked to
   neither plane, encode_invocation_designation is called to produce
   the necessary designation/invocation codes and the checks are
   repeated.

   Uses the variables `coding' and `dst' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    /* CHARSET is on neither graphic plane yet: designate and/or        \
       invoke it, then go round again to emit the character.  */        \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1182
/* Produce codes for the character of CHARSET whose position-codes are
   C1 and C2 (C2 is not used when the charset finally encoded has
   dimension 1).

   If `unification_table' is non-nil and unify_char maps the character
   to another one (returns a non-negative character code), the mapped
   character is encoded instead: SPLIT_CHAR is handed C1 and C2 and is
   expected to store the mapped character's position-codes into them,
   so both must be lvalues.  A negative result from unify_char means
   no mapping, and the character is encoded as given.

   Uses the variables `unification_table', `coding', and `dst' of the
   expansion site.  */

#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (CHARSET_DIMENSION (charset_alt) == 1)                             \
      ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
  } while (0)
1197
1198 /* Produce designation and invocation codes at a place pointed by DST
1199    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1200    Return new DST.  */
1201
1202 unsigned char *
1203 encode_invocation_designation (charset, coding, dst)
1204      int charset;
1205      struct coding_system *coding;
1206      unsigned char *dst;
1207 {
1208   int reg;                      /* graphic register number */
1209
1210   /* At first, check designations.  */
1211   for (reg = 0; reg < 4; reg++)
1212     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1213       break;
1214
1215   if (reg >= 4)
1216     {
1217       /* CHARSET is not yet designated to any graphic registers.  */
1218       /* At first check the requested designation.  */
1219       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1220       if (reg < 0)
1221         /* Since CHARSET requests no special designation, designate to
1222            graphic register 0.  */
1223         reg = 0;
1224
1225       ENCODE_DESIGNATION (charset, reg, coding);
1226     }
1227
1228   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1229       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1230     {
1231       /* Since the graphic register REG is not invoked to any graphic
1232          planes, invoke it to graphic plane 0.  */
1233       switch (reg)
1234         {
1235         case 0:                 /* graphic register 0 */
1236           ENCODE_SHIFT_IN;
1237           break;
1238
1239         case 1:                 /* graphic register 1 */
1240           ENCODE_SHIFT_OUT;
1241           break;
1242
1243         case 2:                 /* graphic register 2 */
1244           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1245             ENCODE_SINGLE_SHIFT_2;
1246           else
1247             ENCODE_LOCKING_SHIFT_2;
1248           break;
1249
1250         case 3:                 /* graphic register 3 */
1251           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1252             ENCODE_SINGLE_SHIFT_3;
1253           else
1254             ENCODE_LOCKING_SHIFT_3;
1255           break;
1256         }
1257     }
1258   return dst;
1259 }
1260
/* The following three macros produce codes for indicating
   composition: ESC '0' starts composing without embedded rules,
   ESC '2' starts composing with embedded rules, and ESC '1' ends
   composing.  Each expansion is a parenthesized comma expression, so
   it stays a single operand wherever it is used.  They use the
   variable `dst' of the expansion site.  */
#define ENCODE_COMPOSITION_NO_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
#define ENCODE_COMPOSITION_WITH_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
#define ENCODE_COMPOSITION_END    (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1265
/* The following three macros produce codes for indicating direction
   of text.  They use the variables `coding' and `dst' of the
   expansion site, and each must be used as a statement.  */

/* Emit a control sequence introducer: ESC '[' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set (tested as a bit, since other flags
   may be set as well), the control character ISO_CODE_CSI
   otherwise.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

/* Emit CSI '2' ']', the start of right-to-left direction.  */
#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

/* Emit CSI '0' ']', the end of the current direction.  */
#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
1281
/* Bring the graphic plane and registers of CODING back to their
   initial state, producing the necessary codes at DST: a shift-in if
   graphic plane 0 does not currently invoke register 0, followed by a
   designation sequence for every register whose initial designation
   is non-negative and differs from its current one.  Uses the
   variables `coding' and `dst' of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg_ = 0;                                                     \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reset_reg_ < 4)                                                  \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_) >= 0   \
            && (CODING_SPEC_ISO_DESIGNATION (coding, reset_reg_)            \
                != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding,             \
                                                        reset_reg_)))       \
          ENCODE_DESIGNATION                                                \
            (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_),      \
             reset_reg_, coding);                                           \
        reset_reg_++;                                                       \
      }                                                                     \
  } while (0)
1296
1297 /* Produce designation sequences of charsets in the line started from
1298    *SRC to a place pointed by DSTP.
1299
1300    If the current block ends before any end-of-line, we may fail to
1301    find all the necessary *designations.  */
1302 encode_designation_at_bol (coding, table, src, src_end, dstp)
1303      struct coding_system *coding;
1304      Lisp_Object table;
1305      unsigned char *src, *src_end, **dstp;
1306 {
1307   int charset, c, found = 0, reg;
1308   /* Table of charsets to be designated to each graphic register.  */
1309   int r[4];
1310   unsigned char *dst = *dstp;
1311
1312   for (reg = 0; reg < 4; reg++)
1313     r[reg] = -1;
1314
1315   while (src < src_end && *src != '\n' && found < 4)
1316     {
1317       int bytes = BYTES_BY_CHAR_HEAD (*src);
1318
1319       if (NILP (table))
1320         charset = CHARSET_AT (src);
1321       else
1322         {
1323           int c_alt, c1, c2;
1324
1325           SPLIT_STRING(src, bytes, charset, c1, c2);
1326           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1327             charset = CHAR_CHARSET (c_alt);
1328         }
1329
1330       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1331       if (r[reg] < 0)
1332         {
1333           found++;
1334           r[reg] = charset;
1335         }
1336
1337       src += bytes;
1338     }
1339
1340   if (found)
1341     {
1342       for (reg = 0; reg < 4; reg++)
1343         if (r[reg] >= 0
1344             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1345           ENCODE_DESIGNATION (r[reg], reg, coding);
1346       *dstp = dst;
1347     }
1348 }
1349
1350 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1351
1352 int
1353 encode_coding_iso2022 (coding, source, destination,
1354                        src_bytes, dst_bytes, consumed)
1355      struct coding_system *coding;
1356      unsigned char *source, *destination;
1357      int src_bytes, dst_bytes;
1358      int *consumed;
1359 {
1360   unsigned char *src = source;
1361   unsigned char *src_end = source + src_bytes;
1362   unsigned char *dst = destination;
1363   unsigned char *dst_end = destination + dst_bytes;
1364   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1365      from DST_END to assure overflow checking is necessary only at the
1366      head of loop.  */
1367   unsigned char *adjusted_dst_end = dst_end - 19;
1368   Lisp_Object unification_table = coding->character_unification_table;
1369
1370   if (!NILP (Venable_character_unification) && NILP (unification_table))
1371     unification_table = Vstandard_character_unification_table_for_write;
1372
1373   while (src < src_end && dst < adjusted_dst_end)
1374     {
1375       /* SRC_BASE remembers the start position in source in each loop.
1376          The loop will be exited when there's not enough source text
1377          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1378          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1379          reset to SRC_BASE before exiting.  */
1380       unsigned char *src_base = src;
1381       int charset, c1, c2, c3, c4;
1382
1383       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1384           && CODING_SPEC_ISO_BOL (coding))
1385         {
1386           /* We have to produce designation sequences if any now.  */
1387           encode_designation_at_bol (coding, unification_table,
1388                                      src, src_end, &dst);
1389           CODING_SPEC_ISO_BOL (coding) = 0;
1390         }
1391
1392       c1 = *src++;
1393       /* If we are seeing a component of a composite character, we are
1394          seeing a leading-code specially encoded for composition, or a
1395          composition rule if composing with rule.  We must set C1
1396          to a normal leading-code or an ASCII code.  If we are not at
1397          a composed character, we must reset the composition state.  */
1398       if (COMPOSING_P (coding->composing))
1399         {
1400           if (c1 < 0xA0)
1401             {
1402               /* We are not in a composite character any longer.  */
1403               coding->composing = COMPOSING_NO;
1404               ENCODE_COMPOSITION_END;
1405             }
1406           else
1407             {
1408               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1409                 {
1410                   *dst++ = c1 & 0x7F;
1411                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1412                   continue;
1413                 }
1414               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1415                 coding->composing = COMPOSING_WITH_RULE_RULE;
1416               if (c1 == 0xA0)
1417                 {
1418                   /* This is an ASCII component.  */
1419                   ONE_MORE_BYTE (c1);
1420                   c1 &= 0x7F;
1421                 }
1422               else
1423                 /* This is a leading-code of non ASCII component.  */
1424                 c1 -= 0x20;
1425             }
1426         }
1427
1428       /* Now encode one character.  C1 is a control character, an
1429          ASCII character, or a leading-code of multi-byte character.  */
1430       switch (emacs_code_class[c1])
1431         {
1432         case EMACS_ascii_code:
1433           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1434           break;
1435
1436         case EMACS_control_code:
1437           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1438             ENCODE_RESET_PLANE_AND_REGISTER;
1439           *dst++ = c1;
1440           break;
1441
1442         case EMACS_carriage_return_code:
1443           if (!coding->selective)
1444             {
1445               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1446                 ENCODE_RESET_PLANE_AND_REGISTER;
1447               *dst++ = c1;
1448               break;
1449             }
1450           /* fall down to treat '\r' as '\n' ...  */
1451
1452         case EMACS_linefeed_code:
1453           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1454             ENCODE_RESET_PLANE_AND_REGISTER;
1455           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1456             bcopy (coding->spec.iso2022.initial_designation,
1457                    coding->spec.iso2022.current_designation,
1458                    sizeof coding->spec.iso2022.initial_designation);
1459           if (coding->eol_type == CODING_EOL_LF
1460               || coding->eol_type == CODING_EOL_AUTOMATIC)
1461             *dst++ = ISO_CODE_LF;
1462           else if (coding->eol_type == CODING_EOL_CRLF)
1463             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1464           else
1465             *dst++ = ISO_CODE_CR;
1466           CODING_SPEC_ISO_BOL (coding) = 1;
1467           break;
1468
1469         case EMACS_leading_code_2:
1470           ONE_MORE_BYTE (c2);
1471           ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1472           break;
1473
1474         case EMACS_leading_code_3:
1475           TWO_MORE_BYTES (c2, c3);
1476           if (c1 < LEADING_CODE_PRIVATE_11)
1477             ENCODE_ISO_CHARACTER (c1, c2, c3);
1478           else
1479             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1480           break;
1481
1482         case EMACS_leading_code_4:
1483           THREE_MORE_BYTES (c2, c3, c4);
1484           ENCODE_ISO_CHARACTER (c2, c3, c4);
1485           break;
1486
1487         case EMACS_leading_code_composition:
1488           ONE_MORE_BYTE (c1);
1489           if (c1 == 0xFF)
1490             {
1491               coding->composing = COMPOSING_WITH_RULE_HEAD;
1492               ENCODE_COMPOSITION_WITH_RULE_START;
1493             }
1494           else
1495             {
1496               /* Rewind one byte because it is a character code of
1497                  composition elements.  */
1498               src--;
1499               coding->composing = COMPOSING_NO_RULE_HEAD;
1500               ENCODE_COMPOSITION_NO_RULE_START;
1501             }
1502           break;
1503
1504         case EMACS_invalid_code:
1505           *dst++ = c1;
1506           break;
1507         }
1508       continue;
1509     label_end_of_loop:
1510       coding->carryover_size = src - src_base;
1511       bcopy (src_base, coding->carryover, coding->carryover_size);
1512       break;
1513     }
1514
1515   /* If this is the last block of the text to be encoded, we must
1516      reset graphic planes and registers to the initial state.  */
1517   if (src >= src_end && coding->last_block)
1518     {
1519       ENCODE_RESET_PLANE_AND_REGISTER;
1520       if (coding->carryover_size > 0
1521           && coding->carryover_size < (dst_end - dst))
1522         {
1523           bcopy (coding->carryover, dst, coding->carryover_size);
1524           dst += coding->carryover_size;
1525           coding->carryover_size = 0;
1526         }
1527     }
1528   *consumed = src - source;
1529   return dst - destination;
1530 }
1531
1532 \f
1533 /*** 4. SJIS and BIG5 handlers ***/
1534
1535 /* Although SJIS and BIG5 are not ISO coding systems, they are used
1536    quite widely.  So, for the moment, Emacs supports them in the bare
1537    C code.  But, in the future, they may be supported only by CCL.  */
1538
1539 /* SJIS is a coding system encoding three character sets: ASCII, right
1540    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1541    as is.  A character of charset katakana-jisx0201 is encoded by
1542    "position-code + 0x80".  A character of charset japanese-jisx0208
1543    is encoded in two bytes, but its two position-codes are divided and
1544    shifted so that they fit in the ranges below.
1545
1546    --- CODE RANGE of SJIS ---
1547    (character set)      (range)
1548    ASCII                0x00 .. 0x7F
1549    KATAKANA-JISX0201    0xA0 .. 0xDF
1550    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1551             (2nd byte)  0x40 .. 0xFF
1552    -------------------------------
1553
1554 */
1555
1556 /* BIG5 is a coding system encoding two character sets: ASCII and
1557    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1558    character set and is encoded in two-byte.
1559
1560    --- CODE RANGE of BIG5 ---
1561    (character set)      (range)
1562    ASCII                0x00 .. 0x7F
1563    Big5 (1st byte)      0xA1 .. 0xFE
1564         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1565    --------------------------
1566
1567    Since the number of characters in Big5 is larger than maximum
1568    characters in Emacs' charset (96x96), it can't be handled as one
1569    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1570    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1571    contains frequently used characters and the latter contains less
1572    frequently used characters.  */
1573
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansions, so callers may
   pass arbitrary expressions as inputs.  Input arguments are
   evaluated more than once and therefore must be free of side
   effects; output arguments must be lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair (B1, B2) into CHARSET and the
   position-codes C1 and C2.  The pair is first turned into a linear
   index (the 2nd byte ranges 0x40..0x7E and 0xA1..0xFE are made
   contiguous), rows from 0xC9 upward are assigned to
   `charset_big5_2', and the index is then split into rows of
   (0xFF - 0xA1) position-codes starting at 0x21.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int big5_code_                                             \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        big5_code_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
      }                                                                 \
    (c1) = big5_code_ / (0xFF - 0xA1) + 0x21;                           \
    (c2) = big5_code_ % (0xFF - 0xA1) + 0x21;                           \
  } while (0)

/* Inverse of DECODE_BIG5: convert CHARSET and the position-codes C1
   and C2 into the BIG5 byte pair (B1, B2).  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_code_                                             \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_code_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    (b1) = big5_code_ / BIG5_SAME_ROW + 0xA1;                           \
    (b2) = big5_code_ % BIG5_SAME_ROW;                                  \
    /* Offsets below 0x3F map to 0x40..0x7E, the rest to 0xA1..0xFE.  */ \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
1606
1607 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1608    Check if a text is encoded in SJIS.  If it is, return
1609    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1610
1611 int
1612 detect_coding_sjis (src, src_end)
1613      unsigned char *src, *src_end;
1614 {
1615   unsigned char c;
1616
1617   while (src < src_end)
1618     {
1619       c = *src++;
1620       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1621         return 0;
1622       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1623         {
1624           if (src < src_end && *src++ < 0x40)
1625             return 0;
1626         }
1627     }
1628   return CODING_CATEGORY_MASK_SJIS;
1629 }
1630
1631 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1632    Check if a text is encoded in BIG5.  If it is, return
1633    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1634
1635 int
1636 detect_coding_big5 (src, src_end)
1637      unsigned char *src, *src_end;
1638 {
1639   unsigned char c;
1640
1641   while (src < src_end)
1642     {
1643       c = *src++;
1644       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1645         return 0;
1646       if (c >= 0xA1)
1647         {
1648           if (src >= src_end)
1649             break;
1650           c = *src++;
1651           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1652             return 0;
1653         }
1654     }
1655   return CODING_CATEGORY_MASK_BIG5;
1656 }
1657
1658 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1659    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1660
1661 int
1662 decode_coding_sjis_big5 (coding, source, destination,
1663                          src_bytes, dst_bytes, consumed, sjis_p)
1664      struct coding_system *coding;
1665      unsigned char *source, *destination;
1666      int src_bytes, dst_bytes;
1667      int *consumed;
1668      int sjis_p;
1669 {
1670   unsigned char *src = source;
1671   unsigned char *src_end = source + src_bytes;
1672   unsigned char *dst = destination;
1673   unsigned char *dst_end = destination + dst_bytes;
1674   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1675      from DST_END to assure overflow checking is necessary only at the
1676      head of loop.  */
1677   unsigned char *adjusted_dst_end = dst_end - 3;
1678
1679   while (src < src_end && dst < adjusted_dst_end)
1680     {
1681       /* SRC_BASE remembers the start position in source in each loop.
1682          The loop will be exited when there's not enough source text
1683          to analyze two-byte character (within macro ONE_MORE_BYTE).
1684          In that case, SRC is reset to SRC_BASE before exiting.  */
1685       unsigned char *src_base = src;
1686       unsigned char c1 = *src++, c2, c3, c4;
1687
1688       if (c1 == '\r')
1689         {
1690           if (coding->eol_type == CODING_EOL_CRLF)
1691             {
1692               ONE_MORE_BYTE (c2);
1693               if (c2 == '\n')
1694                 *dst++ = c2;
1695               else
1696                 /* To process C2 again, SRC is subtracted by 1.  */
1697                 *dst++ = c1, src--;
1698             }
1699           else
1700             *dst++ = c1;
1701         }
1702       else if (c1 < 0x80)
1703         *dst++ = c1;
1704       else if (c1 < 0xA0 || c1 >= 0xE0)
1705         {
1706           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1707           if (sjis_p)
1708             {
1709               ONE_MORE_BYTE (c2);
1710               DECODE_SJIS (c1, c2, c3, c4);
1711               DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1712             }
1713           else if (c1 >= 0xE0 && c1 < 0xFF)
1714             {
1715               int charset;
1716
1717               ONE_MORE_BYTE (c2);
1718               DECODE_BIG5 (c1, c2, charset, c3, c4);
1719               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1720             }
1721           else                  /* Invalid code */
1722             *dst++ = c1;
1723         }
1724       else
1725         {
1726           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1727           if (sjis_p)
1728             DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1729           else
1730             {
1731               int charset;
1732
1733               ONE_MORE_BYTE (c2);
1734               DECODE_BIG5 (c1, c2, charset, c3, c4);
1735               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1736             }
1737         }
1738       continue;
1739
1740     label_end_of_loop:
1741       coding->carryover_size = src - src_base;
1742       bcopy (src_base, coding->carryover, coding->carryover_size);
1743       src = src_base;
1744       break;
1745     }
1746
1747   *consumed = src - source;
1748   return dst - destination;
1749 }
1750
1751 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1752    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1753    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1754    sure that all these charsets are registered as official charset
1755    (i.e. do not have extended leading-codes).  Characters of other
1756    charsets are produced without any encoding.  If SJIS_P is 1, encode
1757    SJIS text, else encode BIG5 text.  */
1758
1759 int
1760 encode_coding_sjis_big5 (coding, source, destination,
1761                          src_bytes, dst_bytes, consumed, sjis_p)
1762      struct coding_system *coding;
1763      unsigned char *source, *destination;
1764      int src_bytes, dst_bytes;
1765      int *consumed;
1766      int sjis_p;
1767 {
1768   unsigned char *src = source;
1769   unsigned char *src_end = source + src_bytes;
1770   unsigned char *dst = destination;
1771   unsigned char *dst_end = destination + dst_bytes;
1772   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1773      from DST_END to assure overflow checking is necessary only at the
1774      head of loop.  */
1775   unsigned char *adjusted_dst_end = dst_end - 1;
1776
1777   while (src < src_end && dst < adjusted_dst_end)
1778     {
1779       /* SRC_BASE remembers the start position in source in each loop.
1780          The loop will be exited when there's not enough source text
1781          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1782          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1783          before exiting.  */
1784       unsigned char *src_base = src;
1785       unsigned char c1 = *src++, c2, c3, c4;
1786
1787       if (coding->composing)
1788         {
1789           if (c1 == 0xA0)
1790             {
1791               ONE_MORE_BYTE (c1);
1792               c1 &= 0x7F;
1793             }
1794           else if (c1 >= 0xA0)
1795             c1 -= 0x20;
1796           else
1797             coding->composing = 0;
1798         }
1799
1800       switch (emacs_code_class[c1])
1801         {
1802         case EMACS_ascii_code:
1803         case EMACS_control_code:
1804           *dst++ = c1;
1805           break;
1806
1807         case EMACS_carriage_return_code:
1808           if (!coding->selective)
1809             {
1810               *dst++ = c1;
1811               break;
1812             }
1813           /* fall down to treat '\r' as '\n' ...  */
1814
1815         case EMACS_linefeed_code:
1816           if (coding->eol_type == CODING_EOL_LF
1817               || coding->eol_type == CODING_EOL_AUTOMATIC)
1818             *dst++ = '\n';
1819           else if (coding->eol_type == CODING_EOL_CRLF)
1820             *dst++ = '\r', *dst++ = '\n';
1821           else
1822             *dst++ = '\r';
1823           break;
1824
1825         case EMACS_leading_code_2:
1826           ONE_MORE_BYTE (c2);
1827           if (sjis_p && c1 == charset_katakana_jisx0201)
1828             *dst++ = c2;
1829           else
1830             *dst++ = c1, *dst++ = c2;
1831           break;
1832
1833         case EMACS_leading_code_3:
1834           TWO_MORE_BYTES (c2, c3);
1835           c2 &= 0x7F, c3 &= 0x7F;
1836           if (sjis_p && c1 == charset_jisx0208)
1837             {
1838               unsigned char s1, s2;
1839
1840               ENCODE_SJIS (c2, c3, s1, s2);
1841               *dst++ = s1, *dst++ = s2;
1842             }
1843           else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1844             {
1845               unsigned char b1, b2;
1846
1847               ENCODE_BIG5 (c1, c2, c3, b1, b2);
1848               *dst++ = b1, *dst++ = b2;
1849             }
1850           else
1851             *dst++ = c1, *dst++ = c2, *dst++ = c3;
1852           break;
1853
1854         case EMACS_leading_code_4:
1855           THREE_MORE_BYTES (c2, c3, c4);
1856           *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1857           break;
1858
1859         case EMACS_leading_code_composition:
1860           coding->composing = 1;
1861           break;
1862
1863         default:                /* i.e. case EMACS_invalid_code: */
1864           *dst++ = c1;
1865         }
1866       continue;
1867
1868     label_end_of_loop:
1869       coding->carryover_size = src - src_base;
1870       bcopy (src_base, coding->carryover, coding->carryover_size);
1871       src = src_base;
1872       break;
1873     }
1874
1875   *consumed = src - source;
1876   return dst - destination;
1877 }
1878
1879 \f
1880 /*** 5. End-of-line handlers ***/
1881
1882 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1883    This function is called only when `coding->eol_type' is
1884    CODING_EOL_CRLF or CODING_EOL_CR.  */
1885
1886 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1887      struct coding_system *coding;
1888      unsigned char *source, *destination;
1889      int src_bytes, dst_bytes;
1890      int *consumed;
1891 {
1892   unsigned char *src = source;
1893   unsigned char *src_end = source + src_bytes;
1894   unsigned char *dst = destination;
1895   unsigned char *dst_end = destination + dst_bytes;
1896   int produced;
1897
1898   switch (coding->eol_type)
1899     {
1900     case CODING_EOL_CRLF:
1901       {
1902         /* Since the maximum bytes produced by each loop is 2, we
1903            subtract 1 from DST_END to assure overflow checking is
1904            necessary only at the head of loop.  */
1905         unsigned char *adjusted_dst_end = dst_end - 1;
1906
1907         while (src < src_end && dst < adjusted_dst_end)
1908           {
1909             unsigned char *src_base = src;
1910             unsigned char c = *src++;
1911             if (c == '\r')
1912               {
1913                 ONE_MORE_BYTE (c);
1914                 if (c != '\n')
1915                   *dst++ = '\r';
1916                 *dst++ = c;
1917               }
1918             else
1919               *dst++ = c;
1920             continue;
1921
1922           label_end_of_loop:
1923             coding->carryover_size = src - src_base;
1924             bcopy (src_base, coding->carryover, coding->carryover_size);
1925             src = src_base;
1926             break;
1927           }
1928         *consumed = src - source;
1929         produced = dst - destination;
1930         break;
1931       }
1932
1933     case CODING_EOL_CR:
1934       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1935       bcopy (source, destination, produced);
1936       dst_end = destination + produced;
1937       while (dst < dst_end)
1938         if (*dst++ == '\r') dst[-1] = '\n';
1939       *consumed = produced;
1940       break;
1941
1942     default:                    /* i.e. case: CODING_EOL_LF */
1943       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1944       bcopy (source, destination, produced);
1945       *consumed = produced;
1946       break;
1947     }
1948
1949   return produced;
1950 }
1951
1952 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
1953    format of end-of-line according to `coding->eol_type'.  If
1954    `coding->selective' is 1, code '\r' in source text also means
1955    end-of-line.  */
1956
1957 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1958      struct coding_system *coding;
1959      unsigned char *source, *destination;
1960      int src_bytes, dst_bytes;
1961      int *consumed;
1962 {
1963   unsigned char *src = source;
1964   unsigned char *dst = destination;
1965   int produced;
1966
1967   if (src_bytes <= 0)
1968     return 0;
1969
1970   switch (coding->eol_type)
1971     {
1972     case CODING_EOL_LF:
1973     case CODING_EOL_AUTOMATIC:
1974       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1975       bcopy (source, destination, produced);
1976       if (coding->selective)
1977         {
1978           int i = produced;
1979           while (i--)
1980             if (*dst++ == '\r') dst[-1] = '\n';
1981         }
1982       *consumed = produced;
1983
1984     case CODING_EOL_CRLF:
1985       {
1986         unsigned char c;
1987         unsigned char *src_end = source + src_bytes;
1988         unsigned char *dst_end = destination + dst_bytes;
1989         /* Since the maximum bytes produced by each loop is 2, we
1990            subtract 1 from DST_END to assure overflow checking is
1991            necessary only at the head of loop.  */
1992         unsigned char *adjusted_dst_end = dst_end - 1;
1993
1994         while (src < src_end && dst < adjusted_dst_end)
1995           {
1996             c = *src++;
1997             if (c == '\n' || (c == '\r' && coding->selective))
1998               *dst++ = '\r', *dst++ = '\n';
1999             else
2000               *dst++ = c;
2001           }
2002         produced = dst - destination;
2003         *consumed = src - source;
2004         break;
2005       }
2006
2007     default:                    /* i.e. case CODING_EOL_CR: */
2008       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2009       bcopy (source, destination, produced);
2010       {
2011         int i = produced;
2012         while (i--)
2013           if (*dst++ == '\n') dst[-1] = '\r';
2014       }
2015       *consumed = produced;
2016     }
2017
2018   return produced;
2019 }
2020
2021 \f
2022 /*** 6. C library functions ***/
2023
2024 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2025    has a property `coding-system'.  The value of this property is a
2026    vector of length 5 (called as coding-vector).  Among elements of
2027    this vector, the first (element[0]) and the fifth (element[4])
2028    carry important information for decoding/encoding.  Before
2029    decoding/encoding, this information should be set in fields of a
2030    structure of type `coding_system'.
2031
2032    A value of property `coding-system' can be a symbol of another
2033    subsidiary coding-system.  In that case, Emacs gets coding-vector
2034    from that symbol.
2035
2036    `element[0]' contains information to be set in `coding->type'.  The
2037    value and its meaning is as follows:
2038
2039    0 -- coding_type_internal
2040    1 -- coding_type_sjis
2041    2 -- coding_type_iso2022
2042    3 -- coding_type_big5
2043    4 -- coding_type_ccl
2044    nil -- coding_type_no_conversion
2045    t -- coding_type_automatic
2046
2047    `element[4]' contains information to be set in `coding->flags' and
2048    `coding->spec'.  The meaning varies by `coding->type'.
2049
2050    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2051    of length 32 (of which the first 15 sub-elements are used now).
2052    Meanings of these sub-elements are:
2053
2054    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2055         If the value is an integer of valid charset, the charset is
2056         assumed to be designated to graphic register N initially.
2057
2058         If the value is minus, it is a minus value of charset which
2059         reserves graphic register N, which means that the charset is
2060         not designated initially but should be designated to graphic
2061         register N just before encoding a character in that charset.
2062
2063         If the value is nil, graphic register N is never used on
2064         encoding.
2065
2066    sub-element[N] where N is 4 through 14: to be set in `coding->flags'
2067         Each value takes t or nil.  See the section ISO2022 of
2068         `coding.h' for more information.
2069
2070    If `coding->type' is `coding_type_big5', element[4] is t to denote
2071    BIG5-ETen or nil to denote BIG5-HKU.
2072
2073    If `coding->type' takes the other value, element[4] is ignored.
2074
2075    Emacs Lisp's coding system also carries information about format of
2076    end-of-line in a value of property `eol-type'.  If the value is
2077    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2078    means CODING_EOL_CR.  If it is not integer, it should be a vector
2079    of subsidiary coding systems of which property `eol-type' has one
2080    of above values.
2081
2082 */
2083
2084 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2085    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2086    is setup so that no conversion is necessary and return -1, else
2087    return 0.  */
2088
2089 int
2090 setup_coding_system (coding_system, coding)
2091      Lisp_Object coding_system;
2092      struct coding_system *coding;
2093 {
2094   Lisp_Object type, eol_type;
2095
2096   /* At first, set several fields default values.  */
2097   coding->require_flushing = 0;
2098   coding->last_block = 0;
2099   coding->selective = 0;
2100   coding->composing = 0;
2101   coding->direction = 0;
2102   coding->carryover_size = 0;
2103   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2104   /* We have not yet implemented a way to specify unification table in
2105      a coding system.  */
2106   coding->character_unification_table = Qnil;
2107
2108   Vlast_coding_system_used = coding->symbol = coding_system;
2109   eol_type = Qnil;
2110   /* Get value of property `coding-system' until we get a vector.
2111      While doing that, also get values of properties
2112      `post-read-conversion', `pre-write-conversion', and `eol-type'.  */
2113   while (!NILP (coding_system) && SYMBOLP (coding_system))
2114     {
2115       if (NILP (coding->post_read_conversion))
2116         coding->post_read_conversion = Fget (coding_system,
2117                                              Qpost_read_conversion);
2118       if (NILP (coding->pre_write_conversion))
2119         coding->pre_write_conversion = Fget (coding_system,
2120                                              Qpre_write_conversion);
2121       if (NILP (eol_type))
2122         eol_type = Fget (coding_system, Qeol_type);
2123       coding_system = Fget (coding_system, Qcoding_system);
2124     }
2125   if (!VECTORP (coding_system)
2126       || XVECTOR (coding_system)->size != 5)
2127     goto label_invalid_coding_system;
2128
2129   if (VECTORP (eol_type))
2130     coding->eol_type = CODING_EOL_AUTOMATIC;
2131   else if (XFASTINT (eol_type) == 1)
2132     coding->eol_type = CODING_EOL_CRLF;
2133   else if (XFASTINT (eol_type) == 2)
2134     coding->eol_type = CODING_EOL_CR;
2135   else
2136     coding->eol_type = CODING_EOL_LF;
2137
2138   type = XVECTOR (coding_system)->contents[0];
2139   switch (XFASTINT (type))
2140     {
2141     case 0:
2142       coding->type = coding_type_internal;
2143       break;
2144
2145     case 1:
2146       coding->type = coding_type_sjis;
2147       break;
2148
2149     case 2:
2150       coding->type = coding_type_iso2022;
2151       {
2152         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2153         Lisp_Object *flags;
2154         int i, charset, default_reg_bits = 0;
2155
2156         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2157           goto label_invalid_coding_system;
2158
2159         flags = XVECTOR (val)->contents;
2160         coding->flags
2161           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2162              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2163              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2164              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2165              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2166              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2167              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2168              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2169              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2170              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2171              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2172
2173         /* Invoke graphic register 0 to plane 0.  */
2174         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2175         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2176         CODING_SPEC_ISO_INVOCATION (coding, 1)
2177           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2178         /* Not single shifting at first.  */
2179         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2180         /* Beginning of buffer should also be regarded as bol. */
2181         CODING_SPEC_ISO_BOL(coding) = 1;
2182
2183         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2184            FLAGS[REG] can be one of below:
2185                 integer CHARSET: CHARSET occupies register I,
2186                 t: designate nothing to REG initially, but can be used
2187                   by any charsets,
2188                 list of integer, nil, or t: designate the first
2189                   element (if integer) to REG initially, the remaining
2190                   elements (if integer) is designated to REG on request,
2191                   if an element is t, REG can be used by any charset,
2192                 nil: REG is never used.  */
2193         for (charset = 0; charset <= MAX_CHARSET; charset++)
2194           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2195         for (i = 0; i < 4; i++)
2196           {
2197             if (INTEGERP (flags[i])
2198                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2199                 || (charset = get_charset_id (flags[i])) >= 0)
2200               {
2201                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2202                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2203               }
2204             else if (EQ (flags[i], Qt))
2205               {
2206                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2207                 default_reg_bits |= 1 << i;
2208               }
2209             else if (CONSP (flags[i]))
2210               {
2211                 Lisp_Object tail = flags[i];
2212
2213                 if (INTEGERP (XCONS (tail)->car)
2214                     && (charset = XINT (XCONS (tail)->car),
2215                         CHARSET_VALID_P (charset))
2216                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2217                   {
2218                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2219                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2220                   }
2221                 else
2222                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2223                 tail = XCONS (tail)->cdr;
2224                 while (CONSP (tail))
2225                   {
2226                     if (INTEGERP (XCONS (tail)->car)
2227                         && (charset = XINT (XCONS (tail)->car),
2228                             CHARSET_VALID_P (charset))
2229                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2230                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2231                         = i;
2232                     else if (EQ (XCONS (tail)->car, Qt))
2233                       default_reg_bits |= 1 << i;
2234                     tail = XCONS (tail)->cdr;
2235                   }
2236               }
2237             else
2238               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2239
2240             CODING_SPEC_ISO_DESIGNATION (coding, i)
2241               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2242           }
2243
2244         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2245           {
2246             /* REG 1 can be used only by locking shift in 7-bit env.  */
2247             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2248               default_reg_bits &= ~2;
2249             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2250               /* Without any shifting, only REG 0 and 1 can be used.  */
2251               default_reg_bits &= 3;
2252           }
2253
2254         for (charset = 0; charset <= MAX_CHARSET; charset++)
2255           if (CHARSET_VALID_P (charset)
2256               && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2257             {
2258               /* We have not yet decided where to designate CHARSET.  */
2259               int reg_bits = default_reg_bits;
2260
2261               if (CHARSET_CHARS (charset) == 96)
2262                 /* A charset of CHARS96 can't be designated to REG 0.  */
2263                 reg_bits &= ~1;
2264
2265               if (reg_bits)
2266                 /* There exist some default graphic register.  */
2267                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2268                   = (reg_bits & 1
2269                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2270               else
2271                 /* We anyway have to designate CHARSET to somewhere.  */
2272                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2273                   = (CHARSET_CHARS (charset) == 94
2274                      ? 0
2275                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2276                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2277                         ? 1
2278                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2279                            ? 2 : 0)));
2280             }
2281       }
2282       coding->require_flushing = 1;
2283       break;
2284
2285     case 3:
2286       coding->type = coding_type_big5;
2287       coding->flags
2288         = (NILP (XVECTOR (coding_system)->contents[4])
2289            ? CODING_FLAG_BIG5_HKU
2290            : CODING_FLAG_BIG5_ETEN);
2291       break;
2292
2293     case 4:
2294       coding->type = coding_type_ccl;
2295       {
2296         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2297         if (CONSP  (val)
2298             && VECTORP (XCONS (val)->car)
2299             && VECTORP (XCONS (val)->cdr))
2300           {
2301             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2302             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2303           }
2304         else
2305           goto label_invalid_coding_system;
2306       }
2307       coding->require_flushing = 1;
2308       break;
2309
2310     default:
2311       if (EQ (type, Qt))
2312         coding->type = coding_type_automatic;
2313       else
2314         coding->type = coding_type_no_conversion;
2315       break;
2316     }
2317   return 0;
2318
2319  label_invalid_coding_system:
2320   coding->type = coding_type_no_conversion;
2321   coding->eol_type = CODING_EOL_LF;
2322   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2323     = Qnil;
2324   return -1;
2325 }
2326
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  However,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following 8 categories:
2332
2333    o coding-category-internal
2334
2335         The category for a coding system which has the same code range
2336         as Emacs' internal format.  Assigned the coding-system (Lisp
2337         symbol) `internal' by default.
2338
2339    o coding-category-sjis
2340
2341         The category for a coding system which has the same code range
2342         as SJIS.  Assigned the coding-system (Lisp
2343         symbol) `shift-jis' by default.
2344
2345    o coding-category-iso-7
2346
2347         The category for a coding system which has the same code range
2348         as ISO2022 of 7-bit environment.  Assigned the coding-system
2349         (Lisp symbol) `iso-2022-7' by default.
2350
2351    o coding-category-iso-8-1
2352
2353         The category for a coding system which has the same code range
2354         as ISO2022 of 8-bit environment and graphic plane 1 used only
2355         for DIMENSION1 charset.  Assigned the coding-system (Lisp
2356         symbol) `iso-8859-1' by default.
2357
2358    o coding-category-iso-8-2
2359
2360         The category for a coding system which has the same code range
2361         as ISO2022 of 8-bit environment and graphic plane 1 used only
2362         for DIMENSION2 charset.  Assigned the coding-system (Lisp
2363         symbol) `euc-japan' by default.
2364
2365    o coding-category-iso-else
2366
        The category for a coding system which has the same code range
        as ISO2022 but does not belong to any of the above three
        categories.  Assigned the coding-system (Lisp symbol)
        `iso-2022-ss2-7' by default.
2371
2372    o coding-category-big5
2373
2374         The category for a coding system which has the same code range
2375         as BIG5.  Assigned the coding-system (Lisp symbol)
2376         `cn-big5' by default.
2377
2378    o coding-category-binary
2379
2380         The category for a coding system not categorized in any of the
2381         above.  Assigned the coding-system (Lisp symbol)
2382         `no-conversion' by default.
2383
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (which is also a Lisp symbol) assigned by a user.
   What Emacs actually does is detect a category of coding system;
   then it uses the `coding-system' assigned to that category.  If
   Emacs can't narrow the result down to a single category, it selects
   the category of the highest priority.  Priorities of categories are
   also specified by a user in a Lisp variable `coding-category-list'.
2391
2392 */
2393
2394 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2395    If it detects possible coding systems, return an integer in which
2396    appropriate flag bits are set.  Flag bits are defined by macros
2397    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2398
2399 int
2400 detect_coding_mask (src, src_bytes)
2401      unsigned char *src;
2402      int src_bytes;
2403 {
2404   register unsigned char c;
2405   unsigned char *src_end = src + src_bytes;
2406   int mask;
2407
2408   /* At first, skip all ASCII characters and control characters except
2409      for three ISO2022 specific control characters.  */
2410  label_loop_detect_coding:
2411   while (src < src_end)
2412     {
2413       c = *src;
2414       if (c >= 0x80
2415           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2416         break;
2417       src++;
2418     }
2419
2420   if (src >= src_end)
2421     /* We found nothing other than ASCII.  There's nothing to do.  */
2422     return CODING_CATEGORY_MASK_ANY;
2423
2424   /* The text seems to be encoded in some multilingual coding system.
2425      Now, try to find in which coding system the text is encoded.  */
2426   if (c < 0x80)
2427     {
2428       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2429       /* C is an ISO2022 specific control code of C0.  */
2430       mask = detect_coding_iso2022 (src, src_end);
2431       src++;
2432       if (mask == CODING_CATEGORY_MASK_ANY)
2433         /* No valid ISO2022 code follows C.  Try again.  */
2434         goto label_loop_detect_coding;
2435     }
2436   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2437     /* C is an ISO2022 specific control code of C1,
2438        or the first byte of SJIS's 2-byte character code,
2439        or a leading code of Emacs.  */
2440     mask = (detect_coding_iso2022 (src, src_end)
2441             | detect_coding_sjis (src, src_end)
2442             | detect_coding_internal (src, src_end));
2443
2444   else if (c < 0xA0)
2445     /* C is the first byte of SJIS character code,
2446        or a leading-code of Emacs.  */
2447     mask = (detect_coding_sjis (src, src_end)
2448             | detect_coding_internal (src, src_end));
2449
2450   else
2451     /* C is a character of ISO2022 in graphic plane right,
2452        or a SJIS's 1-byte character code (i.e. JISX0201),
2453        or the first byte of BIG5's 2-byte code.  */
2454     mask = (detect_coding_iso2022 (src, src_end)
2455             | detect_coding_sjis (src, src_end)
2456             | detect_coding_big5 (src, src_end));
2457
2458   return mask;
2459 }
2460
2461 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2462    The information of the detected coding system is set in CODING.  */
2463
2464 void
2465 detect_coding (coding, src, src_bytes)
2466      struct coding_system *coding;
2467      unsigned char *src;
2468      int src_bytes;
2469 {
2470   int mask = detect_coding_mask (src, src_bytes);
2471   int idx;
2472
2473   if (mask == CODING_CATEGORY_MASK_ANY)
2474     /* We found nothing other than ASCII.  There's nothing to do.  */
2475     return;
2476
2477   if (!mask)
2478     /* The source text seems to be encoded in unknown coding system.
2479        Emacs regards the category of such a kind of coding system as
2480        `coding-category-binary'.  We assume that a user has assigned
2481        an appropriate coding system for a `coding-category-binary'.  */
2482     idx = CODING_CATEGORY_IDX_BINARY;
2483   else
2484     {
2485       /* We found some plausible coding systems.  Let's use a coding
2486          system of the highest priority.  */
2487       Lisp_Object val = Vcoding_category_list;
2488
2489       if (CONSP (val))
2490         while (!NILP (val))
2491           {
2492             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2493             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2494               break;
2495             val = XCONS (val)->cdr;
2496           }
2497       else
2498         val = Qnil;
2499
2500       if (NILP (val))
2501         {
2502           /* For unknown reason, `Vcoding_category_list' contains none
2503              of found categories.  Let's use any of them.  */
2504           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2505             if (mask & (1 << idx))
2506               break;
2507         }
2508     }
2509   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2510 }
2511
2512 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2513    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2514    CODING_EOL_CR, and CODING_EOL_AUTOMATIC.  */
2515
2516 int
2517 detect_eol_type (src, src_bytes)
2518      unsigned char *src;
2519      int src_bytes;
2520 {
2521   unsigned char *src_end = src + src_bytes;
2522   unsigned char c;
2523
2524   while (src < src_end)
2525     {
2526       c = *src++;
2527       if (c == '\n')
2528         return CODING_EOL_LF;
2529       else if (c == '\r')
2530         {
2531           if (src < src_end && *src == '\n')
2532             return CODING_EOL_CRLF;
2533           else
2534             return CODING_EOL_CR;
2535         }
2536     }
2537   return CODING_EOL_AUTOMATIC;
2538 }
2539
2540 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2541    is encoded.  If it detects an appropriate format of end-of-line, it
2542    sets the information in *CODING.  */
2543
2544 void
2545 detect_eol (coding, src, src_bytes)
2546      struct coding_system *coding;
2547      unsigned char *src;
2548      int src_bytes;
2549 {
2550   Lisp_Object val;
2551   int eol_type = detect_eol_type (src, src_bytes);
2552
2553   if (eol_type == CODING_EOL_AUTOMATIC)
2554     /*  We found no end-of-line in the source text.  */
2555     return;
2556
2557   val = Fget (coding->symbol, Qeol_type);
2558   if (VECTORP (val) && XVECTOR (val)->size == 3)
2559     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2560 }
2561
2562 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2563    decoding, it may detect coding system and format of end-of-line if
2564    those are not yet decided.  */
2565
2566 int
2567 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2568      struct coding_system *coding;
2569      unsigned char *source, *destination;
2570      int src_bytes, dst_bytes;
2571      int *consumed;
2572 {
2573   int produced;
2574
2575   if (src_bytes <= 0)
2576     {
2577       *consumed = 0;
2578       return 0;
2579     }
2580
2581   if (coding->type == coding_type_automatic)
2582     detect_coding (coding, source, src_bytes);
2583
2584   if (coding->eol_type == CODING_EOL_AUTOMATIC)
2585     detect_eol (coding, source, src_bytes);
2586
2587   coding->carryover_size = 0;
2588   switch (coding->type)
2589     {
2590     case coding_type_no_conversion:
2591     label_no_conversion:
2592       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2593       bcopy (source, destination, produced);
2594       *consumed = produced;
2595       break;
2596
2597     case coding_type_internal:
2598     case coding_type_automatic:
2599       if (coding->eol_type == CODING_EOL_LF
2600           ||  coding->eol_type == CODING_EOL_AUTOMATIC)
2601         goto label_no_conversion;
2602       produced = decode_eol (coding, source, destination,
2603                              src_bytes, dst_bytes, consumed);
2604       break;
2605
2606     case coding_type_sjis:
2607       produced = decode_coding_sjis_big5 (coding, source, destination,
2608                                           src_bytes, dst_bytes, consumed,
2609                                           1);
2610       break;
2611
2612     case coding_type_iso2022:
2613       produced = decode_coding_iso2022 (coding, source, destination,
2614                                         src_bytes, dst_bytes, consumed);
2615       break;
2616
2617     case coding_type_big5:
2618       produced = decode_coding_sjis_big5 (coding, source, destination,
2619                                           src_bytes, dst_bytes, consumed,
2620                                           0);
2621       break;
2622
2623     case coding_type_ccl:
2624       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2625                              src_bytes, dst_bytes, consumed);
2626       break;
2627     }
2628
2629   return produced;
2630 }
2631
2632 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2633
2634 int
2635 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2636      struct coding_system *coding;
2637      unsigned char *source, *destination;
2638      int src_bytes, dst_bytes;
2639      int *consumed;
2640 {
2641   int produced;
2642
2643   coding->carryover_size = 0;
2644   switch (coding->type)
2645     {
2646     case coding_type_no_conversion:
2647     label_no_conversion:
2648       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2649       if (produced > 0)
2650         {
2651           bcopy (source, destination, produced);
2652           if (coding->selective)
2653             {
2654               unsigned char *p = destination, *pend = destination + produced;
2655               while (p < pend)
2656                 if (*p++ == '\015') p[-1] = '\n';
2657             }
2658         }
2659       *consumed = produced;
2660       break;
2661
2662     case coding_type_internal:
2663     case coding_type_automatic:
2664       if (coding->eol_type == CODING_EOL_LF
2665           ||  coding->eol_type == CODING_EOL_AUTOMATIC)
2666         goto label_no_conversion;
2667       produced = encode_eol (coding, source, destination,
2668                              src_bytes, dst_bytes, consumed);
2669       break;
2670
2671     case coding_type_sjis:
2672       produced = encode_coding_sjis_big5 (coding, source, destination,
2673                                           src_bytes, dst_bytes, consumed,
2674                                           1);
2675       break;
2676
2677     case coding_type_iso2022:
2678       produced = encode_coding_iso2022 (coding, source, destination,
2679                                         src_bytes, dst_bytes, consumed);
2680       break;
2681
2682     case coding_type_big5:
2683       produced = encode_coding_sjis_big5 (coding, source, destination,
2684                                           src_bytes, dst_bytes, consumed,
2685                                           0);
2686       break;
2687
2688     case coding_type_ccl:
2689       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2690                              src_bytes, dst_bytes, consumed);
2691       break;
2692     }
2693
2694   return produced;
2695 }
2696
2697 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2698
2699 /* Return maximum size (bytes) of a buffer enough for decoding
2700    SRC_BYTES of text encoded in CODING.  */
2701
2702 int
2703 decoding_buffer_size (coding, src_bytes)
2704      struct coding_system *coding;
2705      int src_bytes;
2706 {
2707   int magnification;
2708
2709   if (coding->type == coding_type_iso2022)
2710     magnification = 3;
2711   else if (coding->type == coding_type_ccl)
2712     magnification = coding->spec.ccl.decoder.buf_magnification;
2713   else
2714     magnification = 2;
2715
2716   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2717 }
2718
2719 /* Return maximum size (bytes) of a buffer enough for encoding
2720    SRC_BYTES of text to CODING.  */
2721
2722 int
2723 encoding_buffer_size (coding, src_bytes)
2724      struct coding_system *coding;
2725      int src_bytes;
2726 {
2727   int magnification;
2728
2729   if (coding->type == coding_type_ccl)
2730     magnification = coding->spec.ccl.encoder.buf_magnification;
2731   else
2732     magnification = 3;
2733
2734   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2735 }
2736
2737 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2738 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2739 #endif
2740
2741 char *conversion_buffer;
2742 int conversion_buffer_size;
2743
2744 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2745    or decoding.  Sufficient memory is allocated automatically.  If we
2746    run out of memory, return NULL.  */
2747
2748 char *
2749 get_conversion_buffer (size)
2750      int size;
2751 {
2752   if (size > conversion_buffer_size)
2753     {
2754       char *buf;
2755       int real_size = conversion_buffer_size * 2;
2756
2757       while (real_size < size) real_size *= 2;
2758       buf = (char *) xmalloc (real_size);
2759       xfree (conversion_buffer);
2760       conversion_buffer = buf;
2761       conversion_buffer_size = real_size;
2762     }
2763   return conversion_buffer;
2764 }
2765
2766 \f
2767 #ifdef emacs
2768 /*** 7. Emacs Lisp library functions ***/
2769
2770 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2771        1, 1, 0,
2772   "Return coding-vector of CODING-SYSTEM.\n\
2773 If CODING-SYSTEM is not a valid coding-system, return nil.")
2774   (obj)
2775      Lisp_Object obj;
2776 {
2777   while (SYMBOLP (obj) && !NILP (obj))
2778     obj = Fget (obj, Qcoding_system);
2779   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2780           ? Qnil : obj);
2781 }
2782
2783 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2784   "Return t if OBJECT is nil or a coding-system.\n\
2785 See document of make-coding-system for coding-system object.")
2786   (obj)
2787      Lisp_Object obj;
2788 {
2789   return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2790 }
2791
2792 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2793        Sread_non_nil_coding_system, 1, 1, 0,
2794   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2795   (prompt)
2796      Lisp_Object prompt;
2797 {
2798   Lisp_Object val;
2799   do
2800     {
2801       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2802                               Qt, Qnil, Qnil, Qnil);
2803     }
2804   while (XSTRING (val)->size == 0);
2805   return (Fintern (val, Qnil));
2806 }
2807
2808 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2809   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2810   (prompt)
2811      Lisp_Object prompt;
2812 {
2813   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2814                                       Qt, Qnil, Qnil, Qnil);
2815   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2816 }
2817
2818 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2819        1, 1, 0,
2820   "Check validity of CODING-SYSTEM.\n\
2821 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2822 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2823 The value of property should be a vector of length 5.")
2824   (coding_system)
2825      Lisp_Object coding_system;
2826 {
2827   CHECK_SYMBOL (coding_system, 0);
2828   if (!NILP (Fcoding_system_p (coding_system)))
2829     return coding_system;
2830   while (1)
2831     Fsignal (Qcoding_system_error, coding_system);
2832 }
2833
2834 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2835        2, 2, 0,
2836   "Detect coding-system of the text in the region between START and END.\n\
2837 Return a list of possible coding-systems ordered by priority.\n\
2838 If only ASCII characters are found, it returns `automatic-conversion'\n\
2839  or its subsidiary coding-system according to a detected end-of-line format.")
2840   (b, e)
2841      Lisp_Object b, e;
2842 {
2843   int coding_mask, eol_type;
2844   Lisp_Object val;
2845   int beg, end;
2846
2847   validate_region (&b, &e);
2848   beg = XINT (b), end = XINT (e);
2849   if (beg < GPT && end >= GPT) move_gap (end);
2850
2851   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2852   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2853
2854   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2855     {
2856       val = intern ("automatic-conversion");
2857       if (eol_type != CODING_EOL_AUTOMATIC)
2858         {
2859           Lisp_Object val2 = Fget (val, Qeol_type);
2860           if (VECTORP (val2))
2861             val = XVECTOR (val2)->contents[eol_type];
2862         }
2863     }
2864   else
2865     {
2866       Lisp_Object val2;
2867
2868       /* At first, gather possible coding-systems in VAL in a reverse
2869          order.  */
2870       val = Qnil;
2871       for (val2 = Vcoding_category_list;
2872            !NILP (val2);
2873            val2 = XCONS (val2)->cdr)
2874         {
2875           int idx
2876             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2877           if (coding_mask & (1 << idx))
2878             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2879         }
2880
2881       /* Then, change the order of the list, while getting subsidiary
2882          coding-systems.  */
2883       val2 = val;
2884       val = Qnil;
2885       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2886         {
2887           if (eol_type == CODING_EOL_AUTOMATIC)
2888             val = Fcons (XCONS (val2)->car, val);
2889           else
2890             {
2891               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2892               if (VECTORP (val3))
2893                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2894               else
2895                 val = Fcons (XCONS (val2)->car, val);
2896             }
2897         }
2898     }
2899
2900   return val;
2901 }
2902
2903 /* Scan text in the region between *BEGP and *ENDP, skip characters
2904    which we never have to encode to (iff ENCODEP is 1) or decode from
2905    coding system CODING at the head and tail, then set BEGP and ENDP
2906    to the addresses of start and end of the text we actually convert.  */
2907
2908 void
2909 shrink_conversion_area (begp, endp, coding, encodep)
2910      unsigned char **begp, **endp;
2911      struct coding_system *coding;
2912      int encodep;
2913 {
2914   register unsigned char *beg_addr = *begp, *end_addr = *endp;
2915
2916   if (coding->eol_type != CODING_EOL_LF
2917       && coding->eol_type != CODING_EOL_AUTOMATIC)
2918     /* Since we anyway have to convert end-of-line format, it is not
2919        worth skipping at most 100 bytes or so.  */
2920     return;
2921
2922   if (encodep)                  /* for encoding */
2923     {
2924       switch (coding->type)
2925         {
2926         case coding_type_no_conversion:
2927         case coding_type_internal:
2928         case coding_type_automatic:
2929           /* We need no conversion.  */
2930           *begp = *endp;
2931           return;
2932         case coding_type_ccl:
2933           /* We can't skip any data.  */
2934           return;
2935         case coding_type_iso2022:
2936           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2937             {
2938               unsigned char *bol = beg_addr;
2939               while (beg_addr < end_addr && *beg_addr < 0x80)
2940                 {
2941                   beg_addr++;
2942                   if (*(beg_addr - 1) == '\n')
2943                     bol = beg_addr;
2944                 }
2945               beg_addr = bol;
2946               goto label_skip_tail;
2947             }
2948           /* fall down ... */
2949         default:
2950           /* We can skip all ASCII characters at the head and tail.  */
2951           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2952         label_skip_tail:
2953           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2954           break;
2955         }
2956     }
2957   else                          /* for decoding */
2958     {
2959       switch (coding->type)
2960         {
2961         case coding_type_no_conversion:
2962           /* We need no conversion.  */
2963           *begp = *endp;
2964           return;
2965         case coding_type_internal:
2966           if (coding->eol_type == CODING_EOL_LF)
2967             {
2968               /* We need no conversion.  */
2969               *begp = *endp;
2970               return;
2971             }
2972           /* We can skip all but carriage-return.  */
2973           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2974           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2975           break;
2976         case coding_type_sjis:
2977         case coding_type_big5:
2978           /* We can skip all ASCII characters at the head.  */
2979           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2980           /* We can skip all ASCII characters at the tail except for
2981              the second byte of SJIS or BIG5 code.  */
2982           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2983           if (end_addr != *endp)
2984             end_addr++;
2985           break;
2986         case coding_type_ccl:
2987           /* We can't skip any data.  */
2988           return;
2989         default:                /* i.e. case coding_type_iso2022: */
2990           {
2991             unsigned char c;
2992
2993             /* We can skip all ASCII characters except for a few
2994                control codes at the head.  */
2995             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2996                    && c != ISO_CODE_CR && c != ISO_CODE_SO
2997                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2998               beg_addr++;
2999           }
3000           break;
3001         }
3002     }
3003   *begp = beg_addr;
3004   *endp = end_addr;
3005   return;
3006 }
3007
3008 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3009    text between B and E.  B and E are buffer position.  */
3010
3011 Lisp_Object
3012 code_convert_region (b, e, coding, encodep)
3013      Lisp_Object b, e;
3014      struct coding_system *coding;
3015      int encodep;
3016 {
3017   int beg, end, len, consumed, produced;
3018   char *buf;
3019   unsigned char *begp, *endp;
3020   int pos = PT;
3021
3022   validate_region (&b, &e);
3023   beg = XINT (b), end = XINT (e);
3024   if (beg < GPT && end >= GPT)
3025     move_gap (end);
3026
3027   if (encodep && !NILP (coding->pre_write_conversion))
3028     {
3029       /* We must call a pre-conversion function which may put a new
3030          text to be converted in a new buffer.  */
3031       struct buffer *old = current_buffer, *new;
3032
3033       TEMP_SET_PT (beg);
3034       call2 (coding->pre_write_conversion, b, e);
3035       if (old != current_buffer)
3036         {
3037           /* Replace the original text by the text just generated.  */
3038           len = ZV - BEGV;
3039           new = current_buffer;
3040           set_buffer_internal (old);
3041           del_range (beg, end);
3042           insert_from_buffer (new, 1, len, 0);
3043           end = beg + len;
3044         }
3045     }
3046
3047   /* We may be able to shrink the conversion region.  */
3048   begp = POS_ADDR (beg); endp = begp + (end - beg);
3049   shrink_conversion_area (&begp, &endp, coding, encodep);
3050
3051   if (begp == endp)
3052     /* We need no conversion.  */
3053     len = end - beg;
3054   else
3055     {
3056       beg += begp - POS_ADDR (beg);
3057       end =  beg + (endp - begp);
3058
3059       if (encodep)
3060         len = encoding_buffer_size (coding, end - beg);
3061       else
3062         len = decoding_buffer_size (coding, end - beg);
3063       buf = get_conversion_buffer (len);
3064
3065       coding->last_block = 1;
3066       produced = (encodep
3067                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3068                                    &consumed)
3069                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3070                                    &consumed));
3071
3072       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3073
3074       TEMP_SET_PT (beg);
3075       insert (buf, produced);
3076       del_range (PT, PT + end - beg);
3077       if (pos >= end)
3078         pos = PT + (pos - end);
3079       else if (pos > beg)
3080         pos = beg;
3081       TEMP_SET_PT (pos);
3082   }
3083
3084   if (!encodep && !NILP (coding->post_read_conversion))
3085     {
3086       /* We must call a post-conversion function which may alter
3087          the text just converted.  */
3088       Lisp_Object insval;
3089
3090       beg = XINT (b);
3091       TEMP_SET_PT (beg);
3092       insval = call1 (coding->post_read_conversion, make_number (len));
3093       CHECK_NUMBER (insval, 0);
3094       len = XINT (insval);
3095     }
3096
3097   return make_number (len);
3098 }
3099
3100 Lisp_Object
3101 code_convert_string (str, coding, encodep, nocopy)
3102      Lisp_Object str, nocopy;
3103      struct coding_system *coding;
3104      int encodep;
3105 {
3106   int len, consumed, produced;
3107   char *buf;
3108   unsigned char *begp, *endp;
3109   int head_skip, tail_skip;
3110   struct gcpro gcpro1;
3111
3112   if (encodep && !NILP (coding->pre_write_conversion)
3113       || !encodep && !NILP (coding->post_read_conversion))
3114     {
3115       /* Since we have to call Lisp functions which assume target text
3116          is in a buffer, after setting a temporary buffer, call
3117          code_convert_region.  */
3118       int count = specpdl_ptr - specpdl;
3119       int len = XSTRING (str)->size;
3120       Lisp_Object result;
3121       struct buffer *old = current_buffer;
3122
3123       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3124       temp_output_buffer_setup (" *code-converting-work*");
3125       set_buffer_internal (XBUFFER (Vstandard_output));
3126       insert_from_string (str, 0, len, 0);
3127       code_convert_region (make_number (BEGV), make_number (ZV),
3128                            coding, encodep);
3129       result = make_buffer_string (BEGV, ZV, 0);
3130       set_buffer_internal (old);
3131       return unbind_to (count, result);
3132     }
3133
3134   /* We may be able to shrink the conversion region.  */
3135   begp = XSTRING (str)->data;
3136   endp = begp + XSTRING (str)->size;
3137   shrink_conversion_area (&begp, &endp, coding, encodep);
3138
3139   if (begp == endp)
3140     /* We need no conversion.  */
3141     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3142
3143   head_skip = begp - XSTRING (str)->data;
3144   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3145
3146   GCPRO1 (str);
3147
3148   if (encodep)
3149     len = encoding_buffer_size (coding, endp - begp);
3150   else
3151     len = decoding_buffer_size (coding, endp - begp);
3152   buf = get_conversion_buffer (len + head_skip + tail_skip);
3153
3154   bcopy (XSTRING (str)->data, buf, head_skip);
3155   coding->last_block = 1;
3156   produced = (encodep
3157               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3158                                buf + head_skip, endp - begp, len, &consumed)
3159               : decode_coding (coding, XSTRING (str)->data + head_skip,
3160                                buf + head_skip, endp - begp, len, &consumed));
3161   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3162          buf + head_skip + produced,
3163          tail_skip);
3164
3165   UNGCPRO;
3166
3167   return make_string (buf, head_skip + produced + tail_skip);
3168 }
3169
3170 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3171        3, 3, "r\nzCoding system: ",
3172   "Decode current region by specified coding system.\n\
3173 When called from a program, takes three arguments:\n\
3174 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3175 Return length of decoded text.")
3176   (b, e, coding_system)
3177      Lisp_Object b, e, coding_system;
3178 {
3179   struct coding_system coding;
3180
3181   CHECK_NUMBER_COERCE_MARKER (b, 0);
3182   CHECK_NUMBER_COERCE_MARKER (e, 1);
3183   CHECK_SYMBOL (coding_system, 2);
3184
3185   if (NILP (coding_system))
3186     return make_number (XFASTINT (e) - XFASTINT (b));
3187   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3188     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3189
3190   return code_convert_region (b, e, &coding, 0);
3191 }
3192
3193 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3194        3, 3, "r\nzCoding system: ",
3195   "Encode current region by specified coding system.\n\
3196 When called from a program, takes three arguments:\n\
3197 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3198 Return length of encoded text.")
3199   (b, e, coding_system)
3200      Lisp_Object b, e, coding_system;
3201 {
3202   struct coding_system coding;
3203
3204   CHECK_NUMBER_COERCE_MARKER (b, 0);
3205   CHECK_NUMBER_COERCE_MARKER (e, 1);
3206   CHECK_SYMBOL (coding_system, 2);
3207
3208   if (NILP (coding_system))
3209     return make_number (XFASTINT (e) - XFASTINT (b));
3210   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3211     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3212
3213   return code_convert_region (b, e, &coding, 1);
3214 }
3215
3216 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3217        2, 3, 0,
3218   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3219 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3220 of decoding.")
3221   (string, coding_system, nocopy)
3222      Lisp_Object string, coding_system, nocopy;
3223 {
3224   struct coding_system coding;
3225
3226   CHECK_STRING (string, 0);
3227   CHECK_SYMBOL (coding_system, 1);
3228
3229   if (NILP (coding_system))
3230     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3231   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3232     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3233
3234   return code_convert_string (string, &coding, 0, nocopy);
3235 }
3236
3237 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3238        2, 3, 0,
3239   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3240 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3241 of encoding.")
3242   (string, coding_system, nocopy)
3243      Lisp_Object string, coding_system, nocopy;
3244 {
3245   struct coding_system coding;
3246
3247   CHECK_STRING (string, 0);
3248   CHECK_SYMBOL (coding_system, 1);
3249
3250   if (NILP (coding_system))
3251     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3252   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3253     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3254
3255   return code_convert_string (string, &coding, 1, nocopy);
3256 }
3257
3258 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3259   "Decode a JISX0208 character of shift-jis encoding.\n\
3260 CODE is the character code in SJIS.\n\
3261 Return the corresponding character.")
3262   (code)
3263      Lisp_Object code;
3264 {
3265   unsigned char c1, c2, s1, s2;
3266   Lisp_Object val;
3267
3268   CHECK_NUMBER (code, 0);
3269   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3270   DECODE_SJIS (s1, s2, c1, c2);
3271   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3272   return val;
3273 }
3274
3275 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3276   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3277 Return the corresponding character code in SJIS.")
3278   (ch)
3279      Lisp_Object ch;
3280 {
3281   int charset, c1, c2, s1, s2;
3282   Lisp_Object val;
3283
3284   CHECK_NUMBER (ch, 0);
3285   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3286   if (charset == charset_jisx0208)
3287     {
3288       ENCODE_SJIS (c1, c2, s1, s2);
3289       XSETFASTINT (val, (s1 << 8) | s2);
3290     }
3291   else
3292     XSETFASTINT (val, 0);
3293   return val;
3294 }
3295
3296 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3297   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3298 CODE is the character code in BIG5.\n\
3299 Return the corresponding character.")
3300   (code)
3301      Lisp_Object code;
3302 {
3303   int charset;
3304   unsigned char b1, b2, c1, c2;
3305   Lisp_Object val;
3306
3307   CHECK_NUMBER (code, 0);
3308   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3309   DECODE_BIG5 (b1, b2, charset, c1, c2);
3310   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3311   return val;
3312 }
3313
3314 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3315   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3316 Return the corresponding character code in Big5.")
3317   (ch)
3318      Lisp_Object ch;
3319 {
3320   int charset, c1, c2, b1, b2;
3321   Lisp_Object val;
3322
3323   CHECK_NUMBER (ch, 0);
3324   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3325   if (charset == charset_big5_1 || charset == charset_big5_2)
3326     {
3327       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3328       XSETFASTINT (val, (b1 << 8) | b2);
3329     }
3330   else
3331     XSETFASTINT (val, 0);
3332   return val;
3333 }
3334
3335 DEFUN ("set-terminal-coding-system",
3336        Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3337        "zCoding-system for terminal display: ",
3338   "Set coding-system of your terminal to CODING-SYSTEM.\n\
3339 All outputs to terminal are encoded to this coding-system.")
3340   (coding_system)
3341      Lisp_Object coding_system;
3342 {
3343   CHECK_SYMBOL (coding_system, 0);
3344   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3345   update_mode_lines++;
3346   if (!NILP (Finteractive_p ()))
3347     Fredraw_display ();
3348   return Qnil;
3349 }
3350
3351 DEFUN ("terminal-coding-system",
3352        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3353   "Return coding-system of your terminal.")
3354   ()
3355 {
3356   return terminal_coding.symbol;
3357 }
3358
3359 DEFUN ("set-keyboard-coding-system",
3360        Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1, 0,
3361   "Set coding-system of codes sent from terminal keyboard to CODING-SYSTEM.\n\
3362 In Encoded-kbd minor mode, user inputs are decoded\n\
3363 accoding to CODING-SYSTEM.\n\
3364 Do not call this function directly, but use the command\n\
3365 encoded-kbd-set-coding-system to activate Encoded-kbd mode\n\
3366 with a specific coding system.")
3367   (coding_system)
3368      Lisp_Object coding_system;
3369 {
3370   CHECK_SYMBOL (coding_system, 0);
3371   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3372   return Qnil;
3373 }
3374
3375 DEFUN ("keyboard-coding-system",
3376        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3377   "Return coding-system of what is sent from terminal keyboard.")
3378   ()
3379 {
3380   return keyboard_coding.symbol;
3381 }
3382
3383 \f
3384 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3385        1, MANY, 0,
3386   "Choose a coding system for a file operation based on file name.\n\
3387 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3388 ENCODING-SYSTEM is the coding system to use for encoding\n\
3389 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3390 for decoding (in case OPERATION does decoding).\n\
3391 \n\
3392 The first argument OPERATION specifies an I/O primitive:\n\
3393   For file I/O, `insert-file-contents' or `write-region'.\n\
3394   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3395   For network I/O, `open-network-stream'.\n\
3396 \n\
3397 The remaining arguments should be the same arguments that were passed\n\
3398 to the primitive.  Depending on which primitive, one of those arguments\n\
3399 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3400 whichever argument specifies the file name is TARGET.\n\
3401 \n\
3402 TARGET has a meaning which depends on OPERATION:\n\
3403   For file I/O, TARGET is a file name.\n\
3404   For process I/O, TARGET is a process name.\n\
3405   For network I/O, TARGET is a service name or a port number\n\
3406 \n\
3407 This function looks up what `coding-system-alist' specifies for\n\
3408 OPERATION and TARGET.  It may specify a cons cell which represents\n\
3409 a particular coding system or it may have a function to call.\n\
3410 In the latter case, we call the function with one argument,\n\
3411 which is a list of all the arguments given to `find-coding-system'.")
3412   (nargs, args)
3413      int nargs;
3414      Lisp_Object *args;
3415 {
3416   Lisp_Object operation, target_idx, target, val;
3417   register Lisp_Object chain;
3418
3419   if (nargs < 2)
3420     error ("Too few arguments");
3421   operation = args[0];
3422   if (!SYMBOLP (operation)
3423       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3424     error ("Invalid first arguement");
3425   if (nargs < 1 + XINT (target_idx))
3426     error ("Too few arguments for operation: %s",
3427            XSYMBOL (operation)->name->data);
3428   target = args[XINT (target_idx) + 1];
3429   if (!(STRINGP (target)
3430         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3431     error ("Invalid %dth argument", XINT (target_idx) + 1);
3432
3433   chain = Fassq (operation, Vcoding_system_alist);
3434   if (NILP (chain))
3435     return Qnil;
3436
3437   for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3438     {
3439       Lisp_Object elt = XCONS (chain)->car;
3440
3441       if (CONSP (elt)
3442           && ((STRINGP (target)
3443                && STRINGP (XCONS (elt)->car)
3444                && fast_string_match (XCONS (elt)->car, target) >= 0)
3445               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3446         return (val = XCONS (elt)->cdr, CONSP (val)
3447                 ? val
3448                 : ((SYMBOLP (val) && !NILP (Fboundp (val))
3449                     ? call2 (val, Flist (nargs, args))
3450                     : Qnil)));
3451     }
3452   return Qnil;
3453 }
3454
3455 #endif /* emacs */
3456
3457 \f
3458 /*** 8. Post-amble ***/
3459
3460 init_coding_once ()
3461 {
3462   int i;
3463
3464   /* Emacs internal format specific initialize routine.  */
3465   for (i = 0; i <= 0x20; i++)
3466     emacs_code_class[i] = EMACS_control_code;
3467   emacs_code_class[0x0A] = EMACS_linefeed_code;
3468   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3469   for (i = 0x21 ; i < 0x7F; i++)
3470     emacs_code_class[i] = EMACS_ascii_code;
3471   emacs_code_class[0x7F] = EMACS_control_code;
3472   emacs_code_class[0x80] = EMACS_leading_code_composition;
3473   for (i = 0x81; i < 0xFF; i++)
3474     emacs_code_class[i] = EMACS_invalid_code;
3475   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3476   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3477   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3478   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3479
3480   /* ISO2022 specific initialize routine.  */
3481   for (i = 0; i < 0x20; i++)
3482     iso_code_class[i] = ISO_control_code;
3483   for (i = 0x21; i < 0x7F; i++)
3484     iso_code_class[i] = ISO_graphic_plane_0;
3485   for (i = 0x80; i < 0xA0; i++)
3486     iso_code_class[i] = ISO_control_code;
3487   for (i = 0xA1; i < 0xFF; i++)
3488     iso_code_class[i] = ISO_graphic_plane_1;
3489   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3490   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3491   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3492   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3493   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3494   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3495   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3496   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3497   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3498   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3499
3500   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3501   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3502
3503   setup_coding_system (Qnil, &keyboard_coding);
3504   setup_coding_system (Qnil, &terminal_coding);
3505 }
3506
3507 #ifdef emacs
3508
3509 syms_of_coding ()
3510 {
3511   Qtarget_idx = intern ("target-idx");
3512   staticpro (&Qtarget_idx);
3513
3514   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3515   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3516
3517   Qcall_process = intern ("call-process");
3518   staticpro (&Qcall_process);
3519   Fput (Qcall_process, Qtarget_idx, make_number (0));
3520
3521   Qcall_process_region = intern ("call-process-region");
3522   staticpro (&Qcall_process_region);
3523   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3524
3525   Qstart_process = intern ("start-process");
3526   staticpro (&Qstart_process);
3527   Fput (Qstart_process, Qtarget_idx, make_number (2));
3528
3529   Qopen_network_stream = intern ("open-network-stream");
3530   staticpro (&Qopen_network_stream);
3531   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3532
3533   Qcoding_system = intern ("coding-system");
3534   staticpro (&Qcoding_system);
3535
3536   Qeol_type = intern ("eol-type");
3537   staticpro (&Qeol_type);
3538
3539   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3540   staticpro (&Qbuffer_file_coding_system);
3541
3542   Qpost_read_conversion = intern ("post-read-conversion");
3543   staticpro (&Qpost_read_conversion);
3544
3545   Qpre_write_conversion = intern ("pre-write-conversion");
3546   staticpro (&Qpre_write_conversion);
3547
3548   Qcoding_system_vector = intern ("coding-system-vector");
3549   staticpro (&Qcoding_system_vector);
3550
3551   Qcoding_system_p = intern ("coding-system-p");
3552   staticpro (&Qcoding_system_p);
3553
3554   Qcoding_system_error = intern ("coding-system-error");
3555   staticpro (&Qcoding_system_error);
3556
3557   Fput (Qcoding_system_error, Qerror_conditions,
3558         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3559   Fput (Qcoding_system_error, Qerror_message,
3560         build_string ("Coding-system error"));
3561
3562   Qcoding_category_index = intern ("coding-category-index");
3563   staticpro (&Qcoding_category_index);
3564
3565   {
3566     int i;
3567     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3568       {
3569         coding_category_table[i] = intern (coding_category_name[i]);
3570         staticpro (&coding_category_table[i]);
3571         Fput (coding_category_table[i], Qcoding_category_index,
3572               make_number (i));
3573       }
3574   }
3575
3576   Qcharacter_unification_table = intern ("character-unification-table");
3577   staticpro (&Qcharacter_unification_table);
3578   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3579         make_number (0));
3580
3581   defsubr (&Scoding_system_vector);
3582   defsubr (&Scoding_system_p);
3583   defsubr (&Sread_coding_system);
3584   defsubr (&Sread_non_nil_coding_system);
3585   defsubr (&Scheck_coding_system);
3586   defsubr (&Sdetect_coding_region);
3587   defsubr (&Sdecode_coding_region);
3588   defsubr (&Sencode_coding_region);
3589   defsubr (&Sdecode_coding_string);
3590   defsubr (&Sencode_coding_string);
3591   defsubr (&Sdecode_sjis_char);
3592   defsubr (&Sencode_sjis_char);
3593   defsubr (&Sdecode_big5_char);
3594   defsubr (&Sencode_big5_char);
3595   defsubr (&Sset_terminal_coding_system);
3596   defsubr (&Sterminal_coding_system);
3597   defsubr (&Sset_keyboard_coding_system);
3598   defsubr (&Skeyboard_coding_system);
3599   defsubr (&Sfind_coding_system);
3600
3601   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3602     "List of coding-categories (symbols) ordered by priority.");
3603   {
3604     int i;
3605
3606     Vcoding_category_list = Qnil;
3607     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3608       Vcoding_category_list
3609         = Fcons (coding_category_table[i], Vcoding_category_list);
3610   }
3611
3612   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3613     "A variable of internal use only.\n\
3614 If the value is a coding system, it is used for decoding on read operation.\n\
3615 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3616   Vcoding_system_for_read = Qnil;
3617
3618   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3619     "A variable of internal use only.\n\
3620 If the value is a coding system, it is used for encoding on write operation.\n\
3621 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3622   Vcoding_system_for_write = Qnil;
3623
3624   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3625     "Coding-system used in the latest file or process I/O.");
3626   Vlast_coding_system_used = Qnil;
3627
3628   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3629     "Nested alist to decide a coding system for a specific I/O operation.\n\
3630 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3631 \n\
3632 OPERATION is one of the following Emacs I/O primitives:\n\
3633   For file I/O, insert-file-contents and write-region.\n\
3634   For process I/O, call-process, call-process-region, and start-process.\n\
3635   For network I/O, open-network-stream.\n\
3636 In addition, for process I/O, `process-argument' can be specified for\n\
3637 encoding arguments of the process.\n\
3638 \n\
3639 REGEXP is a regular expression matching a target of OPERATION, where\n\
3640 target is a file name for file I/O operations, a process name for\n\
3641 process I/O operations, or a service name for network I/O\n\
3642 operations.  REGEXP might be a port number for network I/O operation.\n\
3643 \n\
3644 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3645 character code on OPERATION, or a function symbol returning the cons.\n\
3646 See the documentation of `find-coding-system' for more detail.");
3647   Vcoding_system_alist = Qnil;
3648
3649   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3650     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3651   eol_mnemonic_unix = '.';
3652
3653   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3654     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3655   eol_mnemonic_dos = ':';
3656
3657   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3658     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3659   eol_mnemonic_mac = '\'';
3660
3661   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3662     "Mnemonic character indicating end-of-line format is not yet decided.");
3663   eol_mnemonic_undecided = '-';
3664
3665   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3666     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3667   Venable_character_unification = Qt;
3668
3669   DEFVAR_LISP ("standard-character-unification-table-for-read",
3670     &Vstandard_character_unification_table_for_read,
3671     "Table for unifying characters when reading.");
3672   Vstandard_character_unification_table_for_read = Qnil;
3673
3674   DEFVAR_LISP ("standard-character-unification-table-for-write",
3675     &Vstandard_character_unification_table_for_write,
3676     "Table for unifying characters when writing.");
3677   Vstandard_character_unification_table_for_write = Qnil;
3678
3679   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3680     "Alist of charsets vs revision numbers.\n\
3681 While encoding, if a charset (car part of an element) is found,\n\
3682 designate it with the escape sequence identifing revision (cdr part of the element).");
3683   Vcharset_revision_alist = Qnil;
3684 }
3685
3686 #endif /* emacs */