src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Ver.1.0.
   3    Copyright (C) 1995 Free Software Foundation, Inc.
   4    Copyright (C) 1995 Electrotechnical Laboratory, JAPAN.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   1. Preamble
  26   2. Emacs' internal format handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. End-of-line handlers
  30   6. C library functions
  31   7. Emacs Lisp library functions
  32   8. Post-amble
  33
  34 */
  35
  36 /*** GENERAL NOTE on CODING SYSTEM ***
  37
  38   Coding system is an encoding mechanism of one or more character
  39   sets.  Here's a list of coding systems which Emacs can handle.  When
  40   we say "decode", it means converting some other coding system to
  41   Emacs' internal format, and when we say "encode", it means
  42   converting Emacs' internal format to some other coding system.
  43
  44   0. Emacs' internal format
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in the section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and such coding
  53   systems used in Internet communication as ISO-2022-JP are all
  54   variants of ISO2022.  Details are described in the section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   the section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in the section 4.  In this file, when written as "BIG5"
  67   (all uppercase), it means the coding system, and when written as
  68   "Big5" (capitalized), it means the character set.
  69
  70   4. Else
  71
  72   If a user want to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represent a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See the section 6 for more
  81   detail.
  82
  83 */
  84
  85 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  86
  87   How end-of-line of a text is encoded depends on a system.  For
  88   instance, Unix's format is just one byte of `line-feed' code,
  89   whereas DOS's format is two bytes sequence of `carriage-return' and
  90   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  91
  92   Since how characters in a text is encoded and how end-of-line is
  93   encoded is independent, any coding system described above can take
  94   any format of end-of-line.  So, Emacs has information of format of
  95   end-of-line in each coding-system.  See the section 6 for more
  96   detail.
  97
  98 */
  99
 100 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 101
 102   These functions check if a text between SRC and SRC_END is encoded
 103   in the coding system category XXX.  Each returns an integer value in
 104   which appropriate flag bits for the category XXX is set.  The flag
 105   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 106   template of these functions.  */
 107 #if 0
 108 int
 109 detect_coding_internal (src, src_end)
 110      unsigned char *src, *src_end;
 111 {
 112   ...
 113 }
 114 #endif
 115
 116 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 117
 118   These functions decode SRC_BYTES length text at SOURCE encoded in
 119   CODING to Emacs' internal format.  The resulting text goes to a
 120   place pointed by DESTINATION, the length of which should not exceed
 121   DST_BYTES.  The bytes actually processed is returned as *CONSUMED.
 122   The return value is the length of the decoded text.  Below is a
 123   template of these functions.  */
 124 #if 0
 125 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 126      struct coding_system *coding;
 127      unsigned char *source, *destination;
 128      int src_bytes, dst_bytes;
 129      int *consumed;
 130 {
 131   ...
 132 }
 133 #endif
 134
 135 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 136
 137   These functions encode SRC_BYTES length text at SOURCE of Emacs
 138   internal format to CODING.  The resulting text goes to a place
 139   pointed by DESTINATION, the length of which should not exceed
 140   DST_BYTES.  The bytes actually processed is returned as *CONSUMED.
 141   The return value is the length of the encoded text.  Below is a
 142   template of these functions.  */
 143 #if 0
 144 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 145      struct coding_system *coding;
 146      unsigned char *source, *destination;
 147      int src_bytes, dst_bytes;
 148      int *consumed;
 149 {
 150   ...
 151 }
 152 #endif
 153
 154 /*** COMMONLY USED MACROS ***/
 155
 156 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 157    THREE_MORE_BYTES safely get one, two, and three bytes from the
 158    source text respectively.  If there are not enough bytes in the
 159    source, they jump to `label_end_of_loop'.  The caller should set
 160    variables `src' and `src_end' to appropriate areas in advance.  */
 161
 162 #define ONE_MORE_BYTE(c1)       \
 163   do {                          \
 164     if (src < src_end)          \
 165       c1 = *src++;              \
 166     else                        \
 167       goto label_end_of_loop;   \
 168   } while (0)
 169
 170 #define TWO_MORE_BYTES(c1, c2)  \
 171   do {                          \
 172     if (src + 1 < src_end)      \
 173       c1 = *src++, c2 = *src++; \
 174     else                        \
 175       goto label_end_of_loop;   \
 176   } while (0)
 177
 178 #define THREE_MORE_BYTES(c1, c2, c3)            \
 179   do {                                          \
 180     if (src + 2 < src_end)                      \
 181       c1 = *src++, c2 = *src++, c3 = *src++;    \
 182     else                                        \
 183       goto label_end_of_loop;                   \
 184   } while (0)
 185
 186 /* The following three macros DECODE_CHARACTER_ASCII,
 187    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 188    the multi-byte form of a character of each class at the place
 189    pointed by `dst'.  The caller should set the variable `dst' to
 190    point to an appropriate area and the variable `coding' to point to
 191    the coding-system of the currently decoding text in advance.  */
 192
 193 /* Decode one ASCII character C.  */
 194
 195 #define DECODE_CHARACTER_ASCII(c)                               \
 196   do {                                                          \
 197     if (COMPOSING_P (coding->composing))                        \
 198       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 199     else                                                        \
 200       *dst++ = (c);                                             \
 201   } while (0)
 202
 203 /* Decode one DIMENSION1 character of which charset is CHARSET and
 204    position-code is C.  */
 205
 206 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 207   do {                                                                  \
 208     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 209     if (COMPOSING_P (coding->composing))                                \
 210       *dst++ = leading_code + 0x20;                                     \
 211     else                                                                \
 212       *dst++ = leading_code;                                            \
 213     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 214       *dst++ = leading_code;                                            \
 215     *dst++ = (c) | 0x80;                                                \
 216   } while (0)
 217
 218 /* Decode one DIMENSION2 character of which charset is CHARSET and
 219    position-codes are C1 and C2.  */
 220
 221 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 222   do {                                                  \
 223     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 224     *dst++ = (c2) | 0x80;                               \
 225   } while (0)
 226
 227 \f
 228 /*** 1. Preamble ***/
 229
 230 #include <stdio.h>
 231
 232 #ifdef emacs
 233
 234 #include <config.h>
 235 #include "lisp.h"
 236 #include "buffer.h"
 237 #include "charset.h"
 238 #include "ccl.h"
 239 #include "coding.h"
 240 #include "window.h"
 241
 242 #else  /* not emacs */
 243
 244 #include "mulelib.h"
 245
 246 #endif /* not emacs */
 247
 248 Lisp_Object Qcoding_system, Qeol_type;
 249 Lisp_Object Qbuffer_file_coding_system;
 250 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 251
 252 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 253 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 254 Lisp_Object Qstart_process, Qopen_network_stream;
 255 Lisp_Object Qtarget_idx;
 256
 257 /* Mnemonic character of each format of end-of-line.  */
 258 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 259 /* Mnemonic character to indicate format of end-of-line is not yet
 260    decided.  */
 261 int eol_mnemonic_undecided;
 262
 263 #ifdef emacs
 264
 265 Lisp_Object Qcoding_system_vector, Qcoding_system_p, Qcoding_system_error;
 266
 267 /* Coding-systems are handed between Emacs Lisp programs and C internal
 268    routines by the following three variables.  */
 269 /* Coding-system for reading files and receiving data from process.  */
 270 Lisp_Object Vcoding_system_for_read;
 271 /* Coding-system for writing files and sending data to process.  */
 272 Lisp_Object Vcoding_system_for_write;
 273 /* Coding-system actually used in the latest I/O.  */
 274 Lisp_Object Vlast_coding_system_used;
 275
 276 /* Coding-system of what terminal accept for displaying.  */
 277 struct coding_system terminal_coding;
 278
 279 /* Coding-system of what is sent from terminal keyboard.  */
 280 struct coding_system keyboard_coding;
 281
 282 Lisp_Object Vcoding_system_alist;
 283
 284 #endif /* emacs */
 285
 286 Lisp_Object Qcoding_category_index;
 287
 288 /* List of symbols `coding-category-xxx' ordered by priority.  */
 289 Lisp_Object Vcoding_category_list;
 290
 291 /* Table of coding-systems currently assigned to each coding-category.  */
 292 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 293
 294 /* Table of names of symbol for each coding-category.  */
 295 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 296   "coding-category-internal",
 297   "coding-category-sjis",
 298   "coding-category-iso-7",
 299   "coding-category-iso-8-1",
 300   "coding-category-iso-8-2",
 301   "coding-category-iso-else",
 302   "coding-category-big5",
 303   "coding-category-binary"
 304 };
 305
 306 /* Alist of charsets vs the alternate charsets.  */
 307 Lisp_Object Valternate_charset_table;
 308
 309 /* Alist of charsets vs revision number.  */
 310 Lisp_Object Vcharset_revision_alist;
 311
 312 \f
 313 /*** 2. Emacs internal format handlers ***/
 314
 315 /* Emacs' internal format for encoding multiple character sets is a
 316    kind of multi-byte encoding, i.e. encoding a character by a sequence
 317    of one-byte codes of variable length.  ASCII characters and control
 318    characters (e.g. `tab', `newline') are represented by one-byte as
 319    is.  It takes the range 0x00 through 0x7F.  The other characters
 320    are represented by a sequence of `base leading-code', optional
 321    `extended leading-code', and one or two `position-code's.  Length
 322    of the sequence is decided by the base leading-code.  Leading-code
 323    takes the range 0x80 through 0x9F, whereas extended leading-code
 324    and position-code take the range 0xA0 through 0xFF.  See the
 325    document of `charset.h' for more detail about leading-code and
 326    position-code.
 327
 328    There's one exception in this rule.  Special leading-code
 329    `leading-code-composition' denotes that the following several
 330    characters should be composed into one character.  Leading-codes of
 331    components (except for ASCII) are added 0x20.  An ASCII character
 332    component is represented by a 2-byte sequence of `0xA0' and
 333    `ASCII-code + 0x80'.  See also the document in `charset.h' for the
 334    detail of composite character.  Hence, we can summarize the code
 335    range as follows:
 336
 337    --- CODE RANGE of Emacs' internal format ---
 338    (character set)      (range)
 339    ASCII                0x00 .. 0x7F
 340    ELSE (1st byte)      0x80 .. 0x9F
 341         (rest bytes)    0xA0 .. 0xFF
 342    ---------------------------------------------
 343
 344   */
 345
 346 enum emacs_code_class_type emacs_code_class[256];
 347
 348 /* Go to the next statement only if *SRC is accessible and the code is
 349    greater than 0xA0.  */
 350 #define CHECK_CODE_RANGE_A0_FF  \
 351   do {                          \
 352     if (src >= src_end)         \
 353       goto label_end_of_switch; \
 354     else if (*src++ < 0xA0)     \
 355       return 0;                 \
 356   } while (0)
 357
 358 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 359    Check if a text is encoded in Emacs' internal format.  If it is,
 360    return CODING_CATEGORY_MASK_INTERNAL, else return 0.  */
 361
 362 int
 363 detect_coding_internal (src, src_end)
 364      unsigned char *src, *src_end;
 365 {
 366   unsigned char c;
 367   int composing = 0;
 368
 369   while (src < src_end)
 370     {
 371       c = *src++;
 372
 373       if (composing)
 374         {
 375           if (c < 0xA0)
 376             composing = 0;
 377           else
 378             c -= 0x20;
 379         }
 380
 381       switch (emacs_code_class[c])
 382         {
 383         case EMACS_ascii_code:
 384         case EMACS_linefeed_code:
 385           break;
 386
 387         case EMACS_control_code:
 388           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 389             return 0;
 390           break;
 391
 392         case EMACS_invalid_code:
 393           return 0;
 394
 395         case EMACS_leading_code_composition: /* c == 0x80 */
 396           if (composing)
 397             CHECK_CODE_RANGE_A0_FF;
 398           else
 399             composing = 1;
 400           break;
 401
 402         case EMACS_leading_code_4:
 403           CHECK_CODE_RANGE_A0_FF;
 404           /* fall down to check it two more times ...  */
 405
 406         case EMACS_leading_code_3:
 407           CHECK_CODE_RANGE_A0_FF;
 408           /* fall down to check it one more time ...  */
 409
 410         case EMACS_leading_code_2:
 411           CHECK_CODE_RANGE_A0_FF;
 412           break;
 413
 414         default:
 415         label_end_of_switch:
 416           break;
 417         }
 418     }
 419   return CODING_CATEGORY_MASK_INTERNAL;
 420 }
 421
 422 \f
 423 /*** 3. ISO2022 handlers ***/
 424
 425 /* The following note describes the coding system ISO2022 briefly.
 426    Since the intension of this note is to help understanding of the
 427    programs in this file, some parts are NOT ACCURATE or OVERLY
 428    SIMPLIFIED.  For the thorough understanding, please refer to the
 429    original document of ISO2022.
 430
 431    ISO2022 provides many mechanisms to encode several character sets
 432    in 7-bit and 8-bit environment.  If one choose 7-bite environment,
 433    all text is encoded by codes of less than 128.  This may make the
 434    encoded text a little bit longer, but the text get more stability
 435    to pass through several gateways (some of them split MSB off).
 436
 437    There are two kind of character set: control character set and
 438    graphic character set.  The former contains control characters such
 439    as `newline' and `escape' to provide control functions (control
 440    functions are provided also by escape sequence).  The latter
 441    contains graphic characters such as ' A' and '-'.  Emacs recognizes
 442    two control character sets and many graphic character sets.
 443
 444    Graphic character sets are classified into one of the following
 445    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 446    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 447    bytes (DIMENSION) and the number of characters in one dimension
 448    (CHARS) of the set.  In addition, each character set is assigned an
 449    identification tag (called "final character" and denoted as <F>
 450    here after) which is unique in each class.  <F> of each character
 451    set is decided by ECMA(*) when it is registered in ISO.  Code range
 452    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 453
 454    Note (*): ECMA = European Computer Manufacturers Association
 455
 456    Here are examples of graphic character set [NAME(<F>)]:
 457         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 458         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 459         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 460         o DIMENSION2_CHARS96 -- none for the moment
 461
 462    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 463         C0 [0x00..0x1F] -- control character plane 0
 464         GL [0x20..0x7F] -- graphic character plane 0
 465         C1 [0x80..0x9F] -- control character plane 1
 466         GR [0xA0..0xFF] -- graphic character plane 1
 467
 468    A control character set is directly designated and invoked to C0 or
 469    C1 by an escape sequence.  The most common case is that ISO646's
 470    control character set is designated/invoked to C0 and ISO6429's
 471    control character set is designated/invoked to C1, and usually
 472    these designations/invocations are omitted in a coded text.  With
 473    7-bit environment, only C0 can be used, and a control character for
 474    C1 is encoded by an appropriate escape sequence to fit in the
 475    environment.  All control characters for C1 are defined the
 476    corresponding escape sequences.
 477
 478    A graphic character set is at first designated to one of four
 479    graphic registers (G0 through G3), then these graphic registers are
 480    invoked to GL or GR.  These designations and invocations can be
 481    done independently.  The most common case is that G0 is invoked to
 482    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 483    these invocations and designations are omitted in a coded text.
 484    With 7-bit environment, only GL can be used.
 485
 486    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 487    and 0x7F of GL area work as control characters SPACE and DEL
 488    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 489
 490    There are two ways of invocation: locking-shift and single-shift.
 491    With locking-shift, the invocation lasts until the next different
 492    invocation, whereas with single-shift, the invocation works only
 493    for the following character and doesn't affect locking-shift.
 494    Invocations are done by the following control characters or escape
 495    sequences.
 496
 497    ----------------------------------------------------------------------
 498    function             control char    escape sequence description
 499    ----------------------------------------------------------------------
 500    SI  (shift-in)               0x0F    none            invoke G0 to GL
 501    SI  (shift-out)              0x0E    none            invoke G1 to GL
 502    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 503    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 504    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 505    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 506    ----------------------------------------------------------------------
 507    The first four are for locking-shift.  Control characters for these
 508    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 509
 510    Designations are done by the following escape sequences.
 511    ----------------------------------------------------------------------
 512    escape sequence      description
 513    ----------------------------------------------------------------------
 514    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 515    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 516    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 517    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 518    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 519    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 520    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 521    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 522    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 523    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 524    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 525    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 526    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 527    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 528    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 529    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 530    ----------------------------------------------------------------------
 531
 532    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 533    of dimension 1, chars 94, and final character <F>, and etc.
 534
 535    Note (*): Although these designations are not allowed in ISO2022,
 536    Emacs accepts them on decoding, and produces them on encoding
 537    CHARS96 character set in a coding system which is characterized as
 538    7-bit environment, non-locking-shift, and non-single-shift.
 539
 540    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 541    '(' can be omitted.  We call this as "short-form" here after.
 542
 543    Now you may notice that there are a lot of ways for encoding the
 544    same multilingual text in ISO2022.  Actually, there exist many
 545    coding systems such as Compound Text (used in X's inter client
 546    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 547    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 548    localized platforms), and all of these are variants of ISO2022.
 549
 550    In addition to the above, Emacs handles two more kinds of escape
 551    sequences: ISO6429's direction specification and Emacs' private
 552    sequence for specifying character composition.
 553
 554    ISO6429's direction specification takes the following format:
 555         o CSI ']'      -- end of the current direction
 556         o CSI '0' ']'  -- end of the current direction
 557         o CSI '1' ']'  -- start of left-to-right text
 558         o CSI '2' ']'  -- start of right-to-left text
 559    The control character CSI (0x9B: control sequence introducer) is
 560    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 561
 562    Character composition specification takes the following format:
 563         o ESC '0' -- start character composition
 564         o ESC '1' -- end character composition
 565    Since these are not standard escape sequences of any ISO, the use
 566    of them for these meaning is restricted to Emacs only.  */
 567
 568 enum iso_code_class_type iso_code_class[256];
 569
 570 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 571    Check if a text is encoded in ISO2022.  If it is, returns an
 572    integer in which appropriate flag bits any of:
 573         CODING_CATEGORY_MASK_ISO_7
 574         CODING_CATEGORY_MASK_ISO_8_1
 575         CODING_CATEGORY_MASK_ISO_8_2
 576         CODING_CATEGORY_MASK_ISO_ELSE
 577    are set.  If a code which should never appear in ISO2022 is found,
 578    returns 0.  */
 579
 580 int
 581 detect_coding_iso2022 (src, src_end)
 582      unsigned char *src, *src_end;
 583 {
 584   unsigned char c, g1 = 0;
 585   int mask = (CODING_CATEGORY_MASK_ISO_7
 586               | CODING_CATEGORY_MASK_ISO_8_1
 587               | CODING_CATEGORY_MASK_ISO_8_2);
 588   /* We may look ahead at most 4 bytes.  */
 589   unsigned char *adjusted_src_end = src_end - 4;
 590   int i;
 591
 592   while (src < src_end)
 593     {
 594       c = *src++;
 595       switch (c)
 596         {
 597         case ISO_CODE_ESC:
 598           if (src >= src_end)
 599             break;
 600           c = *src++;
 601           if (src + 2 >= src_end
 602               && ((c >= '(' && c <= '/')
 603                   || c == '$' && ((*src >= '(' && *src <= '/')
 604                                   || (*src >= '@' && *src <= 'B'))))
 605             {
 606               /* Valid designation sequence.  */
 607               if (c == ')' || (c == '$' && *src == ')'))
 608                 g1 = 1;
 609               src++;
 610               break;
 611             }
 612           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 613             return CODING_CATEGORY_MASK_ISO_ELSE;
 614           break;
 615
 616         case ISO_CODE_SO:
 617           if (g1)
 618             return CODING_CATEGORY_MASK_ISO_ELSE;
 619           break;
 620
 621         case ISO_CODE_CSI:
 622         case ISO_CODE_SS2:
 623         case ISO_CODE_SS3:
 624           mask &= ~CODING_CATEGORY_MASK_ISO_7;
 625           break;
 626
 627         default:
 628           if (c < 0x80)
 629             break;
 630           else if (c < 0xA0)
 631             return 0;
 632           else
 633             {
 634               int count = 1;
 635
 636               mask &= ~CODING_CATEGORY_MASK_ISO_7;
 637               while (src < src_end && *src >= 0xA0)
 638                 count++, src++;
 639               if (count & 1 && src < src_end)
 640                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 641             }
 642           break;
 643         }
 644     }
 645
 646   return mask;
 647 }
 648
 649 /* Decode a character of which charset is CHARSET and the 1st position
 650    code is C1.  If dimension of CHARSET 2, the 2nd position code is
 651    fetched from SRC and set to C2.  If CHARSET is negative, it means
 652    that we are decoding ill formed text, and what we can do is just to
 653    read C1 as is.  */
 654
 655 #define DECODE_ISO_CHARACTER(charset, c1)                       \
 656   do {                                                          \
 657     if ((charset) >= 0 && CHARSET_DIMENSION (charset) == 2)     \
 658       ONE_MORE_BYTE (c2);                                       \
 659     if (COMPOSING_HEAD_P (coding->composing))                   \
 660       {                                                         \
 661         *dst++ = LEADING_CODE_COMPOSITION;                      \
 662         if (COMPOSING_WITH_RULE_P (coding->composing))          \
 663           /* To tell composition rules are embeded.  */         \
 664           *dst++ = 0xFF;                                        \
 665         coding->composing += 2;                                 \
 666       }                                                         \
 667     if ((charset) < 0)                                          \
 668       *dst++ = c1;                                              \
 669     else if ((charset) == CHARSET_ASCII)                        \
 670       DECODE_CHARACTER_ASCII (c1);                              \
 671     else if (CHARSET_DIMENSION (charset) == 1)                  \
 672       DECODE_CHARACTER_DIMENSION1 (charset, c1);                \
 673     else                                                        \
 674       DECODE_CHARACTER_DIMENSION2 (charset, c1, c2);            \
 675     if (COMPOSING_WITH_RULE_P (coding->composing))              \
 676       /* To tell a composition rule follows.  */                \
 677       coding->composing = COMPOSING_WITH_RULE_RULE;             \
 678   } while (0)
 679
 680 /* Set designation state into CODING.  */
 681 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 682   do {                                                                  \
 683     int charset = ISO_CHARSET_TABLE (dimension, chars, final_char);     \
 684     Lisp_Object temp                                                    \
 685       = Fassq (CHARSET_SYMBOL (charset), Valternate_charset_table);     \
 686     if (! NILP (temp))                                                  \
 687       charset = get_charset_id (XCONS (temp)->cdr);                     \
 688     if (charset >= 0)                                                   \
 689       {                                                                 \
 690         if (coding->direction == 1                                      \
 691             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 692           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 693         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 694       }                                                                 \
 695   } while (0)
 696
 697 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 698
 699 int
 700 decode_coding_iso2022 (coding, source, destination,
 701                        src_bytes, dst_bytes, consumed)
 702      struct coding_system *coding;
 703      unsigned char *source, *destination;
 704      int src_bytes, dst_bytes;
 705      int *consumed;
 706 {
 707   unsigned char *src = source;
 708   unsigned char *src_end = source + src_bytes;
 709   unsigned char *dst = destination;
 710   unsigned char *dst_end = destination + dst_bytes;
 711   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 712      from DST_END to assure that overflow checking is necessary only
 713      at the head of loop.  */
 714   unsigned char *adjusted_dst_end = dst_end - 6;
 715   int charset;
 716   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 717   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 718   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 719
 720   while (src < src_end && dst < adjusted_dst_end)
 721     {
 722       /* SRC_BASE remembers the start position in source in each loop.
 723          The loop will be exited when there's not enough source text
 724          to analyze long escape sequence or 2-byte code (within macros
 725          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 726          to SRC_BASE before exiting.  */
 727       unsigned char *src_base = src;
 728       unsigned char c1 = *src++, c2, cmprule;
 729
 730       switch (iso_code_class [c1])
 731         {
 732         case ISO_0x20_or_0x7F:
 733           if (!coding->composing
 734               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 735             {
 736               /* This is SPACE or DEL.  */
 737               *dst++ = c1;
 738               break;
 739             }
 740           /* This is a graphic character, we fall down ...  */
 741
 742         case ISO_graphic_plane_0:
 743           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 744             {
 745               /* This is a composition rule.  */
 746               *dst++ = c1 | 0x80;
 747               coding->composing = COMPOSING_WITH_RULE_TAIL;
 748             }
 749           else
 750             DECODE_ISO_CHARACTER (charset0, c1);
 751           break;
 752
 753         case ISO_0xA0_or_0xFF:
 754           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 755             {
 756               /* Invalid code.  */
 757               *dst++ = c1;
 758               break;
 759             }
 760           /* This is a graphic character, we fall down ... */
 761
 762         case ISO_graphic_plane_1:
 763           DECODE_ISO_CHARACTER (charset1, c1);
 764           break;
 765
 766         case ISO_control_code:
 767           /* All ISO2022 control characters in this class have the
 768              same representation in Emacs internal format.  */
 769           *dst++ = c1;
 770           break;
 771
 772         case ISO_carriage_return:
 773           if (coding->eol_type == CODING_EOL_CR)
 774             {
 775               *dst++ = '\n';
 776             }
 777           else if (coding->eol_type == CODING_EOL_CRLF)
 778             {
 779               ONE_MORE_BYTE (c1);
 780               if (c1 == ISO_CODE_LF)
 781                 *dst++ = '\n';
 782               else
 783                 {
 784                   src--;
 785                   *dst++ = c1;
 786                 }
 787             }
 788           else
 789             {
 790               *dst++ = c1;
 791             }
 792           break;
 793
 794         case ISO_shift_out:
 795           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 796             goto label_invalid_escape_sequence;
 797           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 798           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 799           break;
 800
 801         case ISO_shift_in:
 802           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 803           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 804           break;
 805
 806         case ISO_single_shift_2_7:
 807         case ISO_single_shift_2:
 808           /* SS2 is handled as an escape sequence of ESC 'N' */
 809           c1 = 'N';
 810           goto label_escape_sequence;
 811
 812         case ISO_single_shift_3:
 813           /* SS2 is handled as an escape sequence of ESC 'O' */
 814           c1 = 'O';
 815           goto label_escape_sequence;
 816
 817         case ISO_control_sequence_introducer:
 818           /* CSI is handled as an escape sequence of ESC '[' ...  */
 819           c1 = '[';
 820           goto label_escape_sequence;
 821
 822         case ISO_escape:
 823           ONE_MORE_BYTE (c1);
 824         label_escape_sequence:
 825           /* Escape sequences handled by Emacs are invocation,
 826              designation, direction specification, and character
 827              composition specification.  */
 828           switch (c1)
 829             {
 830             case '&':           /* revision of following character set */
 831               ONE_MORE_BYTE (c1);
 832               if (!(c1 >= '@' && c1 <= '~'))
 833                 goto label_invalid_escape_sequence;
 834               ONE_MORE_BYTE (c1);
 835               if (c1 != ISO_CODE_ESC)
 836                 goto label_invalid_escape_sequence;
 837               ONE_MORE_BYTE (c1);
 838               goto label_escape_sequence;
 839
 840             case '$':           /* designation of 2-byte character set */
 841               ONE_MORE_BYTE (c1);
 842               if (c1 >= '@' && c1 <= 'B')
 843                 {       /* designation of JISX0208.1978, GB2312.1980,
 844                                    or JISX0208.1980 */
 845                   DECODE_DESIGNATION (0, 2, 94, c1);
 846                 }
 847               else if (c1 >= 0x28 && c1 <= 0x2B)
 848                 {       /* designation of DIMENSION2_CHARS94 character set */
 849                   ONE_MORE_BYTE (c2);
 850                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 851                 }
 852               else if (c1 >= 0x2C && c1 <= 0x2F)
 853                 {       /* designation of DIMENSION2_CHARS96 character set */
 854                   ONE_MORE_BYTE (c2);
 855                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 856                 }
 857               else
 858                 goto label_invalid_escape_sequence;
 859               break;
 860
 861             case 'n':           /* invocation of locking-shift-2 */
 862               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 863                 goto label_invalid_escape_sequence;
 864               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 865               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 866               break;
 867
 868             case 'o':           /* invocation of locking-shift-3 */
 869               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 870                 goto label_invalid_escape_sequence;
 871               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 872               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 873               break;
 874
 875             case 'N':           /* invocation of single-shift-2 */
 876               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 877                 goto label_invalid_escape_sequence;
 878               ONE_MORE_BYTE (c1);
 879               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 880               DECODE_ISO_CHARACTER (charset, c1);
 881               break;
 882
 883             case 'O':           /* invocation of single-shift-3 */
 884               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 885                 goto label_invalid_escape_sequence;
 886               ONE_MORE_BYTE (c1);
 887               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 888               DECODE_ISO_CHARACTER (charset, c1);
 889               break;
 890
 891             case '0':           /* start composing without embeded rules */
 892               coding->composing = COMPOSING_NO_RULE_HEAD;
 893               break;
 894
 895             case '1':           /* end composing */
 896               coding->composing = COMPOSING_NO;
 897               break;
 898
 899             case '2':           /* start composing with embeded rules */
 900               coding->composing = COMPOSING_WITH_RULE_HEAD;
 901               break;
 902
 903             case '[':           /* specification of direction */
 904               /* For the moment, nested direction is not supported.
 905                  So, the value of `coding->direction' is 0 or 1: 0
 906                  means left-to-right, 1 means right-to-left.  */
 907               ONE_MORE_BYTE (c1);
 908               switch (c1)
 909                 {
 910                 case ']':       /* end of the current direction */
 911                   coding->direction = 0;
 912
 913                 case '0':       /* end of the current direction */
 914                 case '1':       /* start of left-to-right direction */
 915                   ONE_MORE_BYTE (c1);
 916                   if (c1 == ']')
 917                     coding->direction = 0;
 918                   else
 919                     goto label_invalid_escape_sequence;
 920                   break;
 921
 922                 case '2':       /* start of right-to-left direction */
 923                   ONE_MORE_BYTE (c1);
 924                   if (c1 == ']')
 925                     coding->direction= 1;
 926                   else
 927                     goto label_invalid_escape_sequence;
 928                   break;
 929
 930                 default:
 931                   goto label_invalid_escape_sequence;
 932                 }
 933               break;
 934
 935             default:
 936               if (c1 >= 0x28 && c1 <= 0x2B)
 937                 {       /* designation of DIMENSION1_CHARS94 character set */
 938                   ONE_MORE_BYTE (c2);
 939                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 940                 }
 941               else if (c1 >= 0x2C && c1 <= 0x2F)
 942                 {       /* designation of DIMENSION1_CHARS96 character set */
 943                   ONE_MORE_BYTE (c2);
 944                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 945                 }
 946               else
 947                 {
 948                   goto label_invalid_escape_sequence;
 949                 }
 950             }
 951           /* We must update these variables now.  */
 952           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 953           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 954           break;
 955
 956         label_invalid_escape_sequence:
 957           {
 958             int length = src - src_base;
 959
 960             bcopy (src_base, dst, length);
 961             dst += length;
 962           }
 963         }
 964       continue;
 965
 966     label_end_of_loop:
 967       coding->carryover_size = src - src_base;
 968       bcopy (src_base, coding->carryover, coding->carryover_size);
 969       src = src_base;
 970       break;
 971     }
 972
 973   /* If this is the last block of the text to be decoded, we had
 974      better just flush out all remaining codes in the text although
 975      they are not valid characters.  */
 976   if (coding->last_block)
 977     {
 978       bcopy (src, dst, src_end - src);
 979       dst += (src_end - src);
 980       src = src_end;
 981     }
 982   *consumed = src - source;
 983   return dst - destination;
 984 }
 985
 986 /* ISO2022 encoding staffs.  */
 987
 988 /*
 989    It is not enough to say just "ISO2022" on encoding, but we have to
 990    specify more details.  In Emacs, each coding-system of ISO2022
 991    variant has the following specifications:
 992         1. Initial designation to G0 thru G3.
 993         2. Allows short-form designation?
 994         3. ASCII should be designated to G0 before control characters?
 995         4. ASCII should be designated to G0 at end of line?
 996         5. 7-bit environment or 8-bit environment?
 997         6. Use locking-shift?
 998         7. Use Single-shift?
 999    And the following two are only for Japanese:
1000         8. Use ASCII in place of JIS0201-1976-Roman?
1001         9. Use JISX0208-1983 in place of JISX0208-1978?
1002    These specifications are encoded in `coding->flags' as flag bits
1003    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1004    detail.
1005 */
1006
1007 /* Produce codes (escape sequence) for designating CHARSET to graphic
1008    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1009    the coding system CODING allows, produce designation sequence of
1010    short-form.  */
1011
1012 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1013   do {                                                                  \
1014     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1015     char *intermediate_char_94 = "()*+";                                \
1016     char *intermediate_char_96 = ",-./";                                \
1017     Lisp_Object temp                                                    \
1018       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1019     if (! NILP (temp))                                                  \
1020         {                                                               \
1021         *dst++ = ISO_CODE_ESC;                                          \
1022         *dst++ = '&';                                                   \
1023         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1024       }                                                                 \
1025     *dst++ = ISO_CODE_ESC;                                              \
1026     if (CHARSET_DIMENSION (charset) == 1)                               \
1027       {                                                                 \
1028         if (CHARSET_CHARS (charset) == 94)                              \
1029           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1030         else                                                            \
1031           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1032       }                                                                 \
1033     else                                                                \
1034       {                                                                 \
1035         *dst++ = '$';                                                   \
1036         if (CHARSET_CHARS (charset) == 94)                              \
1037           {                                                             \
1038             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1039                 || reg != 0                                             \
1040                 || final_char < '@' || final_char > 'B')                \
1041               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1042           }                                                             \
1043         else                                                            \
1044           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1045       }                                                                 \
1046     *dst++ = final_char;                                                \
1047     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1048   } while (0)
1049
1050 /* The following two macros produce codes (control character or escape
1051    sequence) for ISO2022 single-shift functions (single-shift-2 and
1052    single-shift-3).  */
1053
1054 #define ENCODE_SINGLE_SHIFT_2                           \
1055   do {                                                  \
1056     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1057       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1058     else                                                \
1059       *dst++ = ISO_CODE_SS2;                            \
1060     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1061   } while (0)
1062
1063 #define ENCODE_SINGLE_SHIFT_3                           \
1064   do {                                                  \
1065     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1066       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1067     else                                                \
1068       *dst++ = ISO_CODE_SS3;                            \
1069     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1070   } while (0)
1071
1072 /* The following four macros produce codes (control character or
1073    escape sequence) for ISO2022 locking-shift functions (shift-in,
1074    shift-out, locking-shift-2, and locking-shift-3).  */
1075
1076 #define ENCODE_SHIFT_IN                         \
1077   do {                                          \
1078     *dst++ = ISO_CODE_SI;                       \
1079     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1080   } while (0)
1081
1082 #define ENCODE_SHIFT_OUT                        \
1083   do {                                          \
1084     *dst++ = ISO_CODE_SO;                       \
1085     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1086   } while (0)
1087
1088 #define ENCODE_LOCKING_SHIFT_2                  \
1089   do {                                          \
1090     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1091     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1092   } while (0)
1093
1094 #define ENCODE_LOCKING_SHIFT_3                  \
1095   do {                                          \
1096     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1097     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1098   } while (0)
1099
1100 /* Produce codes for a DIMENSION1 character of which character set is
1101    CHARSET and position-code is C1.  Designation and invocation
1102    sequences are also produced in advance if necessary.  */
1103
1104
1105 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1106   do {                                                                  \
1107     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1108       {                                                                 \
1109         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1110           *dst++ = c1 & 0x7F;                                           \
1111         else                                                            \
1112           *dst++ = c1 | 0x80;                                           \
1113         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1114         break;                                                          \
1115       }                                                                 \
1116     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1117       {                                                                 \
1118         *dst++ = c1 & 0x7F;                                             \
1119         break;                                                          \
1120       }                                                                 \
1121     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1122       {                                                                 \
1123         *dst++ = c1 | 0x80;                                             \
1124         break;                                                          \
1125       }                                                                 \
1126     else                                                                \
1127       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1128          must invoke it, or, at first, designate it to some graphic     \
1129          register.  Then repeat the loop to actually produce the        \
1130          character.  */                                                 \
1131       dst = encode_invocation_designation (charset, coding, dst);       \
1132   } while (1)
1133
1134 /* Produce codes for a DIMENSION2 character of which character set is
1135    CHARSET and position-codes are C1 and C2.  Designation and
1136    invocation codes are also produced in advance if necessary.  */
1137
1138 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1139   do {                                                                  \
1140     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1141       {                                                                 \
1142         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1143           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1144         else                                                            \
1145           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1146         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1147         break;                                                          \
1148       }                                                                 \
1149     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1150       {                                                                 \
1151         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1152         break;                                                          \
1153       }                                                                 \
1154     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1155       {                                                                 \
1156         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1157         break;                                                          \
1158       }                                                                 \
1159     else                                                                \
1160       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1161          must invoke it, or, at first, designate it to some graphic     \
1162          register.  Then repeat the loop to actually produce the        \
1163          character.  */                                                 \
1164       dst = encode_invocation_designation (charset, coding, dst);       \
1165   } while (1)
1166
1167 /* Produce designation and invocation codes at a place pointed by DST
1168    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1169    Return new DST.  */
1170
1171 unsigned char *
1172 encode_invocation_designation (charset, coding, dst)
1173      int charset;
1174      struct coding_system *coding;
1175      unsigned char *dst;
1176 {
1177   int reg;                      /* graphic register number */
1178
1179   /* At first, check designations.  */
1180   for (reg = 0; reg < 4; reg++)
1181     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1182       break;
1183
1184   if (reg >= 4)
1185     {
1186       /* CHARSET is not yet designated to any graphic registers.  */
1187       /* At first check the requested designation.  */
1188       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1189       if (reg < 0)
1190         /* Since CHARSET requests no special designation, designate to
1191            graphic register 0.  */
1192         reg = 0;
1193
1194       ENCODE_DESIGNATION (charset, reg, coding);
1195     }
1196
1197   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1198       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1199     {
1200       /* Since the graphic register REG is not invoked to any graphic
1201          planes, invoke it to graphic plane 0.  */
1202       switch (reg)
1203         {
1204         case 0:                 /* graphic register 0 */
1205           ENCODE_SHIFT_IN;
1206           break;
1207
1208         case 1:                 /* graphic register 1 */
1209           ENCODE_SHIFT_OUT;
1210           break;
1211
1212         case 2:                 /* graphic register 2 */
1213           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1214             ENCODE_SINGLE_SHIFT_2;
1215           else
1216             ENCODE_LOCKING_SHIFT_2;
1217           break;
1218
1219         case 3:                 /* graphic register 3 */
1220           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1221             ENCODE_SINGLE_SHIFT_3;
1222           else
1223             ENCODE_LOCKING_SHIFT_3;
1224           break;
1225         }
1226     }
1227   return dst;
1228 }
1229
1230 /* The following two macros produce codes for indicating composition.  */
1231 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1232 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1233 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1234
1235 /* The following three macros produce codes for indicating direction
1236    of text.  */
1237 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1238   do {                                                  \
1239     if (coding->flags == CODING_FLAG_ISO_SEVEN_BITS)    \
1240       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1241     else                                                \
1242       *dst++ = ISO_CODE_CSI;                            \
1243   } while (0)
1244
1245 #define ENCODE_DIRECTION_R2L    \
1246   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '2', *dst++ = ']'
1247
1248 #define ENCODE_DIRECTION_L2R    \
1249   ENCODE_CONTROL_SEQUENCE_INTRODUCER, *dst++ = '0', *dst++ = ']'
1250
1251 /* Produce codes for designation and invocation to reset the graphic
1252    planes and registers to initial state.  */
1253 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1254   do {                                                                      \
1255     int reg;                                                                \
1256     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1257       ENCODE_SHIFT_IN;                                                      \
1258     for (reg = 0; reg < 4; reg++)                                           \
1259       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1260           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1261               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1262         ENCODE_DESIGNATION                                                  \
1263           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1264   } while (0)
1265
1266 int
1267 encode_designation_at_bol (coding, src, src_end, dstp)
1268      struct coding_system *coding;
1269      unsigned char *src, *src_end, **dstp;
1270 {
1271   int charset, reg, r[4];
1272   unsigned char *dst = *dstp, c;
1273   for (reg = 0; reg < 4; reg++) r[reg] = -1;
1274   while (src < src_end && (c = *src++) != '\n')
1275     {
1276       switch (emacs_code_class[c])
1277         {
1278         case EMACS_ascii_code:
1279           charset = CHARSET_ASCII;
1280           break;
1281         case EMACS_leading_code_2:
1282           if (++src >= src_end) continue;
1283           charset = c;
1284           break;
1285         case EMACS_leading_code_3:
1286           if ((src += 2) >= src_end) continue;
1287           charset =  (c < LEADING_CODE_PRIVATE_11 ? c : *(src - 2));
1288           break;
1289         case EMACS_leading_code_4:
1290           if ((src += 3) >= src_end) continue;
1291           charset = *(src - 3);
1292           break;
1293         default:
1294           continue;
1295         }
1296       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1297       if (r[reg] < 0
1298           && CODING_SPEC_ISO_DESIGNATION (coding, reg) != charset)
1299         r[reg] = charset;
1300     }
1301   if (c != '\n' && !coding->last_block)
1302     return -1;
1303   for (reg = 0; reg < 4; reg++)
1304     if (r[reg] >= 0)
1305       ENCODE_DESIGNATION (r[reg], reg, coding);
1306   *dstp = dst;
1307   return 0;
1308 }
1309
1310 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
1311
1312 int
1313 encode_coding_iso2022 (coding, source, destination,
1314                        src_bytes, dst_bytes, consumed)
1315      struct coding_system *coding;
1316      unsigned char *source, *destination;
1317      int src_bytes, dst_bytes;
1318      int *consumed;
1319 {
1320   unsigned char *src = source;
1321   unsigned char *src_end = source + src_bytes;
1322   unsigned char *dst = destination;
1323   unsigned char *dst_end = destination + dst_bytes;
1324   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1325      from DST_END to assure overflow checking is necessary only at the
1326      head of loop.  */
1327   unsigned char *adjusted_dst_end = dst_end - 19;
1328
1329   while (src < src_end && dst < adjusted_dst_end)
1330     {
1331       /* SRC_BASE remembers the start position in source in each loop.
1332          The loop will be exited when there's not enough source text
1333          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1334          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, SRC is
1335          reset to SRC_BASE before exiting.  */
1336       unsigned char *src_base = src;
1337       unsigned char c1, c2, c3, c4;
1338       int charset;
1339
1340       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1341           && CODING_SPEC_ISO_BOL (coding))
1342         {
1343           /* We have to produce destination sequences now.  */
1344           if (encode_designation_at_bol (coding, src, src_end, &dst) < 0)
1345             /* We can't find end of line in the current block.  Let's
1346              repeat encoding starting from the current position
1347              pointed by SRC.  */
1348             break;
1349           CODING_SPEC_ISO_BOL (coding) = 0;
1350         }
1351
1352       c1 = *src++;
1353       /* If we are seeing a component of a composite character, we are
1354          seeing a leading-code specially encoded for composition, or a
1355          composition rule if composing with rule.  We must set C1
1356          to a normal leading-code or an ASCII code.  If we are not at
1357          a composed character, we must reset the composition state.  */
1358       if (COMPOSING_P (coding->composing))
1359         {
1360           if (c1 < 0xA0)
1361             {
1362               /* We are not in a composite character any longer.  */
1363               coding->composing = COMPOSING_NO;
1364               ENCODE_COMPOSITION_END;
1365             }
1366           else
1367             {
1368               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1369                 {
1370                   *dst++ = c1 & 0x7F;
1371                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1372                   continue;
1373                 }
1374               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1375                 coding->composing = COMPOSING_WITH_RULE_RULE;
1376               if (c1 == 0xA0)
1377                 {
1378                   /* This is an ASCII component.  */
1379                   ONE_MORE_BYTE (c1);
1380                   c1 &= 0x7F;
1381                 }
1382               else
1383                 /* This is a leading-code of non ASCII component.  */
1384                 c1 -= 0x20;
1385             }
1386         }
1387
1388       /* Now encode one character.  C1 is a control character, an
1389          ASCII character, or a leading-code of multi-byte character.  */
1390       switch (emacs_code_class[c1])
1391         {
1392         case EMACS_ascii_code:
1393           ENCODE_ISO_CHARACTER_DIMENSION1 (CHARSET_ASCII, c1);
1394           break;
1395
1396         case EMACS_control_code:
1397           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1398             ENCODE_RESET_PLANE_AND_REGISTER;
1399           *dst++ = c1;
1400           break;
1401
1402         case EMACS_carriage_return_code:
1403           if (!coding->selective)
1404             {
1405               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1406                 ENCODE_RESET_PLANE_AND_REGISTER;
1407               *dst++ = c1;
1408               break;
1409             }
1410           /* fall down to treat '\r' as '\n' ...  */
1411
1412         case EMACS_linefeed_code:
1413           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1414             ENCODE_RESET_PLANE_AND_REGISTER;
1415           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1416             bcopy (coding->spec.iso2022.initial_designation,
1417                    coding->spec.iso2022.current_designation,
1418                    sizeof coding->spec.iso2022.initial_designation);
1419           if (coding->eol_type == CODING_EOL_LF
1420               || coding->eol_type == CODING_EOL_AUTOMATIC)
1421             *dst++ = ISO_CODE_LF;
1422           else if (coding->eol_type == CODING_EOL_CRLF)
1423             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1424           else
1425             *dst++ = ISO_CODE_CR;
1426           CODING_SPEC_ISO_BOL (coding) = 1;
1427           break;
1428
1429         case EMACS_leading_code_2:
1430           ONE_MORE_BYTE (c2);
1431           ENCODE_ISO_CHARACTER_DIMENSION1 (c1, c2);
1432           break;
1433
1434         case EMACS_leading_code_3:
1435           TWO_MORE_BYTES (c2, c3);
1436           if (c1 < LEADING_CODE_PRIVATE_11)
1437             ENCODE_ISO_CHARACTER_DIMENSION2 (c1, c2, c3);
1438           else
1439             ENCODE_ISO_CHARACTER_DIMENSION1 (c2, c3);
1440           break;
1441
1442         case EMACS_leading_code_4:
1443           THREE_MORE_BYTES (c2, c3, c4);
1444           ENCODE_ISO_CHARACTER_DIMENSION2 (c2, c3, c4);
1445           break;
1446
1447         case EMACS_leading_code_composition:
1448           ONE_MORE_BYTE (c1);
1449           if (c1 == 0xFF)
1450             {
1451               coding->composing = COMPOSING_WITH_RULE_HEAD;
1452               ENCODE_COMPOSITION_WITH_RULE_START;
1453             }
1454           else
1455             {
1456               /* Rewind one byte because it is a character code of
1457                  composition elements.  */
1458               src--;
1459               coding->composing = COMPOSING_NO_RULE_HEAD;
1460               ENCODE_COMPOSITION_NO_RULE_START;
1461             }
1462           break;
1463
1464         case EMACS_invalid_code:
1465           *dst++ = c1;
1466           break;
1467         }
1468       continue;
1469     label_end_of_loop:
1470       coding->carryover_size = src - src_base;
1471       bcopy (src_base, coding->carryover, coding->carryover_size);
1472       src = src_base;
1473       break;
1474     }
1475
1476   /* If this is the last block of the text to be encoded, we must
1477      reset the state of graphic planes and registers to initial one.
1478      In addition, we had better just flush out all remaining codes in
1479      the text although they are not valid characters.  */
1480   if (coding->last_block)
1481     {
1482       ENCODE_RESET_PLANE_AND_REGISTER;
1483       bcopy(src, dst, src_end - src);
1484       dst += (src_end - src);
1485       src = src_end;
1486     }
1487   *consumed = src - source;
1488   return dst - destination;
1489 }
1490
1491 \f
1492 /*** 4. SJIS and BIG5 handlers ***/
1493
1494 /* Although SJIS and BIG5 are not ISO's coding system, They are used
1495    quite widely.  So, for the moment, Emacs supports them in the bare
1496    C code.  But, in the future, they may be supported only by CCL.  */
1497
1498 /* SJIS is a coding system encoding three character sets: ASCII, right
1499    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1500    as is.  A character of charset katakana-jisx0201 is encoded by
1501    "position-code + 0x80".  A character of charset japanese-jisx0208
1502    is encoded in 2-byte but two position-codes are divided and shifted
1503    so that it fit in the range below.
1504
1505    --- CODE RANGE of SJIS ---
1506    (character set)      (range)
1507    ASCII                0x00 .. 0x7F
1508    KATAKANA-JISX0201    0xA0 .. 0xDF
1509    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1510             (2nd byte)  0x40 .. 0xFF
1511    -------------------------------
1512
1513 */
1514
1515 /* BIG5 is a coding system encoding two character sets: ASCII and
1516    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1517    character set and is encoded in two-byte.
1518
1519    --- CODE RANGE of BIG5 ---
1520    (character set)      (range)
1521    ASCII                0x00 .. 0x7F
1522    Big5 (1st byte)      0xA1 .. 0xFE
1523         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1524    --------------------------
1525
1526    Since the number of characters in Big5 is larger than maximum
1527    characters in Emacs' charset (96x96), it can't be handled as one
1528    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1529    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1530    contains frequently used characters and the latter contains less
1531    frequently used characters.  */
1532
1533 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1534    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1535    C1 and C2 are the 1st and 2nd position-codes of of Emacs' internal
1536    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1537
1538 /* Number of Big5 characters which have the same code in 1st byte.  */
1539 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1540
1541 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1542   do {                                                                  \
1543     unsigned int temp                                                   \
1544       = (b1 - 0xA1) * BIG5_SAME_ROW + b2 - (b2 < 0x7F ? 0x40 : 0x62);   \
1545     if (b1 < 0xC9)                                                      \
1546       charset = charset_big5_1;                                         \
1547     else                                                                \
1548       {                                                                 \
1549         charset = charset_big5_2;                                       \
1550         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1551       }                                                                 \
1552     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
1553     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
1554   } while (0)
1555
1556 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1557   do {                                                                  \
1558     unsigned int temp = (c1 - 0x21) * (0xFF - 0xA1) + (c2 - 0x21);      \
1559     if (charset == charset_big5_2)                                      \
1560       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1561     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
1562     b2 = temp % BIG5_SAME_ROW;                                          \
1563     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
1564   } while (0)
1565
1566 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1567    Check if a text is encoded in SJIS.  If it is, return
1568    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1569
1570 int
1571 detect_coding_sjis (src, src_end)
1572      unsigned char *src, *src_end;
1573 {
1574   unsigned char c;
1575
1576   while (src < src_end)
1577     {
1578       c = *src++;
1579       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1580         return 0;
1581       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1582         {
1583           if (src < src_end && *src++ < 0x40)
1584             return 0;
1585         }
1586     }
1587   return CODING_CATEGORY_MASK_SJIS;
1588 }
1589
1590 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1591    Check if a text is encoded in BIG5.  If it is, return
1592    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1593
1594 int
1595 detect_coding_big5 (src, src_end)
1596      unsigned char *src, *src_end;
1597 {
1598   unsigned char c;
1599
1600   while (src < src_end)
1601     {
1602       c = *src++;
1603       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1604         return 0;
1605       if (c >= 0xA1)
1606         {
1607           if (src >= src_end)
1608             break;
1609           c = *src++;
1610           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1611             return 0;
1612         }
1613     }
1614   return CODING_CATEGORY_MASK_BIG5;
1615 }
1616
1617 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1618    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1619
1620 int
1621 decode_coding_sjis_big5 (coding, source, destination,
1622                          src_bytes, dst_bytes, consumed, sjis_p)
1623      struct coding_system *coding;
1624      unsigned char *source, *destination;
1625      int src_bytes, dst_bytes;
1626      int *consumed;
1627      int sjis_p;
1628 {
1629   unsigned char *src = source;
1630   unsigned char *src_end = source + src_bytes;
1631   unsigned char *dst = destination;
1632   unsigned char *dst_end = destination + dst_bytes;
1633   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1634      from DST_END to assure overflow checking is necessary only at the
1635      head of loop.  */
1636   unsigned char *adjusted_dst_end = dst_end - 3;
1637
1638   while (src < src_end && dst < adjusted_dst_end)
1639     {
1640       /* SRC_BASE remembers the start position in source in each loop.
1641          The loop will be exited when there's not enough source text
1642          to analyze two-byte character (within macro ONE_MORE_BYTE).
1643          In that case, SRC is reset to SRC_BASE before exiting.  */
1644       unsigned char *src_base = src;
1645       unsigned char c1 = *src++, c2, c3, c4;
1646
1647       if (c1 == '\r')
1648         {
1649           if (coding->eol_type == CODING_EOL_CRLF)
1650             {
1651               ONE_MORE_BYTE (c2);
1652               if (c2 == '\n')
1653                 *dst++ = c2;
1654               else
1655                 /* To process C2 again, SRC is subtracted by 1.  */
1656                 *dst++ = c1, src--;
1657             }
1658           else
1659             *dst++ = c1;
1660         }
1661       else if (c1 < 0x80)
1662         *dst++ = c1;
1663       else if (c1 < 0xA0 || c1 >= 0xE0)
1664         {
1665           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1666           if (sjis_p)
1667             {
1668               ONE_MORE_BYTE (c2);
1669               DECODE_SJIS (c1, c2, c3, c4);
1670               DECODE_CHARACTER_DIMENSION2 (charset_jisx0208, c3, c4);
1671             }
1672           else if (c1 >= 0xE0 && c1 < 0xFF)
1673             {
1674               int charset;
1675
1676               ONE_MORE_BYTE (c2);
1677               DECODE_BIG5 (c1, c2, charset, c3, c4);
1678               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1679             }
1680           else                  /* Invalid code */
1681             *dst++ = c1;
1682         }
1683       else
1684         {
1685           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1686           if (sjis_p)
1687             DECODE_CHARACTER_DIMENSION1 (charset_katakana_jisx0201, c1);
1688           else
1689             {
1690               int charset;
1691
1692               ONE_MORE_BYTE (c2);
1693               DECODE_BIG5 (c1, c2, charset, c3, c4);
1694               DECODE_CHARACTER_DIMENSION2 (charset, c3, c4);
1695             }
1696         }
1697       continue;
1698
1699     label_end_of_loop:
1700       coding->carryover_size = src - src_base;
1701       bcopy (src_base, coding->carryover, coding->carryover_size);
1702       src = src_base;
1703       break;
1704     }
1705
1706   *consumed = src - source;
1707   return dst - destination;
1708 }
1709
1710 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1711    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1712    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1713    sure that all these charsets are registered as official charset
1714    (i.e. do not have extended leading-codes).  Characters of other
1715    charsets are produced without any encoding.  If SJIS_P is 1, encode
1716    SJIS text, else encode BIG5 text.  */
1717
1718 int
1719 encode_coding_sjis_big5 (coding, source, destination,
1720                          src_bytes, dst_bytes, consumed, sjis_p)
1721      struct coding_system *coding;
1722      unsigned char *source, *destination;
1723      int src_bytes, dst_bytes;
1724      int *consumed;
1725      int sjis_p;
1726 {
1727   unsigned char *src = source;
1728   unsigned char *src_end = source + src_bytes;
1729   unsigned char *dst = destination;
1730   unsigned char *dst_end = destination + dst_bytes;
1731   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1732      from DST_END to assure overflow checking is necessary only at the
1733      head of loop.  */
1734   unsigned char *adjusted_dst_end = dst_end - 1;
1735
1736   while (src < src_end && dst < adjusted_dst_end)
1737     {
1738       /* SRC_BASE remembers the start position in source in each loop.
1739          The loop will be exited when there's not enough source text
1740          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1741          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1742          before exiting.  */
1743       unsigned char *src_base = src;
1744       unsigned char c1 = *src++, c2, c3, c4;
1745
1746       if (coding->composing)
1747         {
1748           if (c1 == 0xA0)
1749             {
1750               ONE_MORE_BYTE (c1);
1751               c1 &= 0x7F;
1752             }
1753           else if (c1 >= 0xA0)
1754             c1 -= 0x20;
1755           else
1756             coding->composing = 0;
1757         }
1758
1759       switch (emacs_code_class[c1])
1760         {
1761         case EMACS_ascii_code:
1762         case EMACS_control_code:
1763           *dst++ = c1;
1764           break;
1765
1766         case EMACS_carriage_return_code:
1767           if (!coding->selective)
1768             {
1769               *dst++ = c1;
1770               break;
1771             }
1772           /* fall down to treat '\r' as '\n' ...  */
1773
1774         case EMACS_linefeed_code:
1775           if (coding->eol_type == CODING_EOL_LF
1776               || coding->eol_type == CODING_EOL_AUTOMATIC)
1777             *dst++ = '\n';
1778           else if (coding->eol_type == CODING_EOL_CRLF)
1779             *dst++ = '\r', *dst++ = '\n';
1780           else
1781             *dst++ = '\r';
1782           break;
1783
1784         case EMACS_leading_code_2:
1785           ONE_MORE_BYTE (c2);
1786           if (sjis_p && c1 == charset_katakana_jisx0201)
1787             *dst++ = c2;
1788           else
1789             *dst++ = c1, *dst++ = c2;
1790           break;
1791
1792         case EMACS_leading_code_3:
1793           TWO_MORE_BYTES (c2, c3);
1794           c2 &= 0x7F, c3 &= 0x7F;
1795           if (sjis_p && c1 == charset_jisx0208)
1796             {
1797               unsigned char s1, s2;
1798
1799               ENCODE_SJIS (c2, c3, s1, s2);
1800               *dst++ = s1, *dst++ = s2;
1801             }
1802           else if (!sjis_p && (c1 == charset_big5_1 || c1 == charset_big5_2))
1803             {
1804               unsigned char b1, b2;
1805
1806               ENCODE_BIG5 (c1, c2, c3, b1, b2);
1807               *dst++ = b1, *dst++ = b2;
1808             }
1809           else
1810             *dst++ = c1, *dst++ = c2, *dst++ = c3;
1811           break;
1812
1813         case EMACS_leading_code_4:
1814           THREE_MORE_BYTES (c2, c3, c4);
1815           *dst++ = c1, *dst++ = c2, *dst++ = c3, *dst++ = c4;
1816           break;
1817
1818         case EMACS_leading_code_composition:
1819           coding->composing = 1;
1820           break;
1821
1822         default:                /* i.e. case EMACS_invalid_code: */
1823           *dst++ = c1;
1824         }
1825       continue;
1826
1827     label_end_of_loop:
1828       coding->carryover_size = src - src_base;
1829       bcopy (src_base, coding->carryover, coding->carryover_size);
1830       src = src_base;
1831       break;
1832     }
1833
1834   *consumed = src - source;
1835   return dst - destination;
1836 }
1837
1838 \f
1839 /*** 5. End-of-line handlers ***/
1840
1841 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1842    This function is called only when `coding->eol_type' is
1843    CODING_EOL_CRLF or CODING_EOL_CR.  */
1844
1845 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1846      struct coding_system *coding;
1847      unsigned char *source, *destination;
1848      int src_bytes, dst_bytes;
1849      int *consumed;
1850 {
1851   unsigned char *src = source;
1852   unsigned char *src_end = source + src_bytes;
1853   unsigned char *dst = destination;
1854   unsigned char *dst_end = destination + dst_bytes;
1855   int produced;
1856
1857   switch (coding->eol_type)
1858     {
1859     case CODING_EOL_CRLF:
1860       {
1861         /* Since the maximum bytes produced by each loop is 2, we
1862            subtract 1 from DST_END to assure overflow checking is
1863            necessary only at the head of loop.  */
1864         unsigned char *adjusted_dst_end = dst_end - 1;
1865
1866         while (src < src_end && dst < adjusted_dst_end)
1867           {
1868             unsigned char *src_base = src;
1869             unsigned char c = *src++;
1870             if (c == '\r')
1871               {
1872                 ONE_MORE_BYTE (c);
1873                 if (c != '\n')
1874                   *dst++ = '\r';
1875                 *dst++ = c;
1876               }
1877             else
1878               *dst++ = c;
1879             continue;
1880
1881           label_end_of_loop:
1882             coding->carryover_size = src - src_base;
1883             bcopy (src_base, coding->carryover, coding->carryover_size);
1884             src = src_base;
1885             break;
1886           }
1887         *consumed = src - source;
1888         produced = dst - destination;
1889         break;
1890       }
1891
1892     case CODING_EOL_CR:
1893       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1894       bcopy (source, destination, produced);
1895       dst_end = destination + produced;
1896       while (dst < dst_end)
1897         if (*dst++ == '\r') dst[-1] = '\n';
1898       *consumed = produced;
1899       break;
1900
1901     default:                    /* i.e. case: CODING_EOL_LF */
1902       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1903       bcopy (source, destination, produced);
1904       *consumed = produced;
1905       break;
1906     }
1907
1908   return produced;
1909 }
1910
1911 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
1912    format of end-of-line according to `coding->eol_type'.  If
1913    `coding->selective' is 1, code '\r' in source text also means
1914    end-of-line.  */
1915
1916 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1917      struct coding_system *coding;
1918      unsigned char *source, *destination;
1919      int src_bytes, dst_bytes;
1920      int *consumed;
1921 {
1922   unsigned char *src = source;
1923   unsigned char *dst = destination;
1924   int produced;
1925
1926   if (src_bytes <= 0)
1927     return 0;
1928
1929   switch (coding->eol_type)
1930     {
1931     case CODING_EOL_LF:
1932     case CODING_EOL_AUTOMATIC:
1933       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1934       bcopy (source, destination, produced);
1935       if (coding->selective)
1936         {
1937           int i = produced;
1938           while (i--)
1939             if (*dst++ == '\r') dst[-1] = '\n';
1940         }
1941       *consumed = produced;
1942
1943     case CODING_EOL_CRLF:
1944       {
1945         unsigned char c;
1946         unsigned char *src_end = source + src_bytes;
1947         unsigned char *dst_end = destination + dst_bytes;
1948         /* Since the maximum bytes produced by each loop is 2, we
1949            subtract 1 from DST_END to assure overflow checking is
1950            necessary only at the head of loop.  */
1951         unsigned char *adjusted_dst_end = dst_end - 1;
1952
1953         while (src < src_end && dst < adjusted_dst_end)
1954           {
1955             c = *src++;
1956             if (c == '\n' || (c == '\r' && coding->selective))
1957               *dst++ = '\r', *dst++ = '\n';
1958             else
1959               *dst++ = c;
1960           }
1961         produced = dst - destination;
1962         *consumed = src - source;
1963         break;
1964       }
1965
1966     default:                    /* i.e. case CODING_EOL_CR: */
1967       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
1968       bcopy (source, destination, produced);
1969       {
1970         int i = produced;
1971         while (i--)
1972           if (*dst++ == '\n') dst[-1] = '\r';
1973       }
1974       *consumed = produced;
1975     }
1976
1977   return produced;
1978 }
1979
1980 \f
1981 /*** 6. C library functions ***/
1982
1983 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
1984    has a property `coding-system'.  The value of this property is a
1985    vector of length 5 (called as coding-vector).  Among elements of
1986    this vector, the first (element[0]) and the fifth (element[4])
1987    carry important information for decoding/encoding.  Before
1988    decoding/encoding, this information should be set in fields of a
1989    structure of type `coding_system'.
1990
1991    A value of property `coding-system' can be a symbol of another
1992    subsidiary coding-system.  In that case, Emacs gets coding-vector
1993    from that symbol.
1994
1995    `element[0]' contains information to be set in `coding->type'.  The
1996    value and its meaning is as follows:
1997
1998    0 -- coding_system_internal
1999    1 -- coding_system_sjis
2000    2 -- coding_system_iso2022
2001    3 -- coding_system_big5
2002    4 -- coding_system_ccl
2003    nil -- coding_system_no_conversion
2004    t -- coding_system_automatic
2005
2006    `element[4]' contains information to be set in `coding->flags' and
2007    `coding->spec'.  The meaning varies by `coding->type'.
2008
2009    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2010    of length 32 (of which the first 13 sub-elements are used now).
2011    Meanings of these sub-elements are:
2012
2013    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2014         If the value is an integer of valid charset, the charset is
2015         assumed to be designated to graphic register N initially.
2016
2017         If the value is minus, it is a minus value of charset which
2018         reserves graphic register N, which means that the charset is
2019         not designated initially but should be designated to graphic
2020         register N just before encoding a character in that charset.
2021
2022         If the value is nil, graphic register N is never used on
2023         encoding.
2024
2025    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2026         Each value takes t or nil.  See the section ISO2022 of
2027         `coding.h' for more information.
2028
2029    If `coding->type' is `coding_type_big5', element[4] is t to denote
2030    BIG5-ETen or nil to denote BIG5-HKU.
2031
2032    If `coding->type' takes the other value, element[4] is ignored.
2033
2034    Emacs Lisp's coding system also carries information about format of
2035    end-of-line in a value of property `eol-type'.  If the value is
2036    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2037    means CODING_EOL_CR.  If it is not integer, it should be a vector
2038    of subsidiary coding systems of which property `eol-type' has one
2039    of above values.
2040
2041 */
2042
2043 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2044    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2045    is setup so that no conversion is necessary and return -1, else
2046    return 0.  */
2047
2048 int
2049 setup_coding_system (coding_system, coding)
2050      Lisp_Object coding_system;
2051      struct coding_system *coding;
2052 {
2053   Lisp_Object type, eol_type;
2054
2055   /* At first, set several fields default values.  */
2056   coding->require_flushing = 0;
2057   coding->last_block = 0;
2058   coding->selective = 0;
2059   coding->composing = 0;
2060   coding->direction = 0;
2061   coding->carryover_size = 0;
2062   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2063
2064   Vlast_coding_system_used = coding->symbol = coding_system;
2065   eol_type = Qnil;
2066   /* Get value of property `coding-system' until we get a vector.
2067      While doing that, also get values of properties
2068      `post-read-conversion', `pre-write-conversion', and `eol-type'.  */
2069   while (!NILP (coding_system) && SYMBOLP (coding_system))
2070     {
2071       if (NILP (coding->post_read_conversion))
2072         coding->post_read_conversion = Fget (coding_system,
2073                                              Qpost_read_conversion);
2074       if (NILP (coding->pre_write_conversion))
2075         coding->pre_write_conversion = Fget (coding_system,
2076                                              Qpre_write_conversion);
2077       if (NILP (eol_type))
2078         eol_type = Fget (coding_system, Qeol_type);
2079       coding_system = Fget (coding_system, Qcoding_system);
2080     }
2081   if (!VECTORP (coding_system)
2082       || XVECTOR (coding_system)->size != 5)
2083     goto label_invalid_coding_system;
2084
2085   if (VECTORP (eol_type))
2086     coding->eol_type = CODING_EOL_AUTOMATIC;
2087   else if (XFASTINT (eol_type) == 1)
2088     coding->eol_type = CODING_EOL_CRLF;
2089   else if (XFASTINT (eol_type) == 2)
2090     coding->eol_type = CODING_EOL_CR;
2091   else
2092     coding->eol_type = CODING_EOL_LF;
2093
2094   type = XVECTOR (coding_system)->contents[0];
2095   switch (XFASTINT (type))
2096     {
2097     case 0:
2098       coding->type = coding_type_internal;
2099       break;
2100
2101     case 1:
2102       coding->type = coding_type_sjis;
2103       break;
2104
2105     case 2:
2106       coding->type = coding_type_iso2022;
2107       {
2108         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2109         Lisp_Object *flags;
2110         int i, charset, default_reg_bits = 0;
2111
2112         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2113           goto label_invalid_coding_system;
2114
2115         flags = XVECTOR (val)->contents;
2116         coding->flags
2117           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2118              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2119              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2120              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2121              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2122              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2123              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2124              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2125              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2126              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2127              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2128
2129         /* Invoke graphic register 0 to plane 0.  */
2130         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2131         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2132         CODING_SPEC_ISO_INVOCATION (coding, 1)
2133           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2134         /* Not single shifting at first.  */
2135         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2136         /* Beginning of buffer should also be regarded as bol. */
2137         CODING_SPEC_ISO_BOL(coding) = 1;
2138
2139         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2140            FLAGS[REG] can be one of below:
2141                 integer CHARSET: CHARSET occupies register I,
2142                 t: designate nothing to REG initially, but can be used
2143                   by any charsets,
2144                 list of integer, nil, or t: designate the first
2145                   element (if integer) to REG initially, the remaining
2146                   elements (if integer) is designated to REG on request,
2147                   if an element is t, REG can be used by any charset,
2148                 nil: REG is never used.  */
2149         for (charset = 0; charset <= MAX_CHARSET; charset++)
2150           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = -1;
2151         for (i = 0; i < 4; i++)
2152           {
2153             if (INTEGERP (flags[i])
2154                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2155                 || (charset = get_charset_id (flags[i])) >= 0)
2156               {
2157                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2158                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2159               }
2160             else if (EQ (flags[i], Qt))
2161               {
2162                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2163                 default_reg_bits |= 1 << i;
2164               }
2165             else if (CONSP (flags[i]))
2166               {
2167                 Lisp_Object tail = flags[i];
2168
2169                 if (INTEGERP (XCONS (tail)->car)
2170                     && (charset = XINT (XCONS (tail)->car),
2171                         CHARSET_VALID_P (charset))
2172                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2173                   {
2174                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2175                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2176                   }
2177                 else
2178                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2179                 tail = XCONS (tail)->cdr;
2180                 while (CONSP (tail))
2181                   {
2182                     if (INTEGERP (XCONS (tail)->car)
2183                         && (charset = XINT (XCONS (tail)->car),
2184                             CHARSET_VALID_P (charset))
2185                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2186                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2187                         = i;
2188                     else if (EQ (XCONS (tail)->car, Qt))
2189                       default_reg_bits |= 1 << i;
2190                     tail = XCONS (tail)->cdr;
2191                   }
2192               }
2193             else
2194               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2195
2196             CODING_SPEC_ISO_DESIGNATION (coding, i)
2197               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2198           }
2199
2200         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2201           {
2202             /* REG 1 can be used only by locking shift in 7-bit env.  */
2203             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2204               default_reg_bits &= ~2;
2205             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2206               /* Without any shifting, only REG 0 and 1 can be used.  */
2207               default_reg_bits &= 3;
2208           }
2209
2210         for (charset = 0; charset <= MAX_CHARSET; charset++)
2211           if (CHARSET_VALID_P (charset)
2212               && CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) < 0)
2213             {
2214               /* We have not yet decided where to designate CHARSET.  */
2215               int reg_bits = default_reg_bits;
2216
2217               if (CHARSET_CHARS (charset) == 96)
2218                 /* A charset of CHARS96 can't be designated to REG 0.  */
2219                 reg_bits &= ~1;
2220
2221               if (reg_bits)
2222                 /* There exist some default graphic register.  */
2223                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2224                   = (reg_bits & 1
2225                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2226               else
2227                 /* We anyway have to designate CHARSET to somewhere.  */
2228                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2229                   = (CHARSET_CHARS (charset) == 94
2230                      ? 0
2231                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2232                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2233                         ? 1
2234                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2235                            ? 2 : 0)));
2236             }
2237       }
2238       coding->require_flushing = 1;
2239       break;
2240
2241     case 3:
2242       coding->type = coding_type_big5;
2243       coding->flags
2244         = (NILP (XVECTOR (coding_system)->contents[4])
2245            ? CODING_FLAG_BIG5_HKU
2246            : CODING_FLAG_BIG5_ETEN);
2247       break;
2248
2249     case 4:
2250       coding->type = coding_type_ccl;
2251       {
2252         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2253         if (CONSP  (val)
2254             && VECTORP (XCONS (val)->car)
2255             && VECTORP (XCONS (val)->cdr))
2256           {
2257             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2258             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2259           }
2260         else
2261           goto label_invalid_coding_system;
2262       }
2263       coding->require_flushing = 1;
2264       break;
2265
2266     default:
2267       if (EQ (type, Qt))
2268         coding->type = coding_type_automatic;
2269       else
2270         coding->type = coding_type_no_conversion;
2271       break;
2272     }
2273   return 0;
2274
2275  label_invalid_coding_system:
2276   coding->type = coding_type_no_conversion;
2277   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2278     = Qnil;
2279   return -1;
2280 }
2281
2282 /* Emacs has a mechanism to automatically detect a coding system if it
2283    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2284    it's impossible to distinguish some coding systems accurately
2285    because they use the same range of codes.  So, at first, coding
2286    systems are categorized into 7, those are:
2287
2288    o coding-category-internal
2289
2290         The category for a coding system which has the same code range
2291         as Emacs' internal format.  Assigned the coding-system (Lisp
2292         symbol) `internal' by default.
2293
2294    o coding-category-sjis
2295
2296         The category for a coding system which has the same code range
2297         as SJIS.  Assigned the coding-system (Lisp
2298         symbol) `shift-jis' by default.
2299
2300    o coding-category-iso-7
2301
2302         The category for a coding system which has the same code range
2303         as ISO2022 of 7-bit environment.  Assigned the coding-system
2304         (Lisp symbol) `iso-2022-7' by default.
2305
2306    o coding-category-iso-8-1
2307
2308         The category for a coding system which has the same code range
2309         as ISO2022 of 8-bit environment and graphic plane 1 used only
2310         for DIMENSION1 charset.  Assigned the coding-system (Lisp
2311         symbol) `iso-8859-1' by default.
2312
2313    o coding-category-iso-8-2
2314
2315         The category for a coding system which has the same code range
2316         as ISO2022 of 8-bit environment and graphic plane 1 used only
2317         for DIMENSION2 charset.  Assigned the coding-system (Lisp
2318         symbol) `euc-japan' by default.
2319
2320    o coding-category-iso-else
2321
2322         The category for a coding system which has the same code range
2323         as ISO2022 but not belongs to any of the above three
2324         categories.  Assigned the coding-system (Lisp symbol)
2325         `iso-2022-ss2-7' by default.
2326
2327    o coding-category-big5
2328
2329         The category for a coding system which has the same code range
2330         as BIG5.  Assigned the coding-system (Lisp symbol)
2331         `cn-big5' by default.
2332
2333    o coding-category-binary
2334
2335         The category for a coding system not categorized in any of the
2336         above.  Assigned the coding-system (Lisp symbol)
2337         `no-conversion' by default.
2338
2339    Each of them is a Lisp symbol and the value is an actual
2340    `coding-system's (this is also a Lisp symbol) assigned by a user.
2341    What Emacs does actually is to detect a category of coding system.
2342    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2343    decide only one possible category, it selects a category of the
2344    highest priority.  Priorities of categories are also specified by a
2345    user in a Lisp variable `coding-category-list'.
2346
2347 */
2348
2349 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2350    If it detects possible coding systems, return an integer in which
2351    appropriate flag bits are set.  Flag bits are defined by macros
2352    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2353
2354 int
2355 detect_coding_mask (src, src_bytes)
2356      unsigned char *src;
2357      int src_bytes;
2358 {
2359   register unsigned char c;
2360   unsigned char *src_end = src + src_bytes;
2361   int mask;
2362
2363   /* At first, skip all ASCII characters and control characters except
2364      for three ISO2022 specific control characters.  */
2365   while (src < src_end)
2366     {
2367       c = *src;
2368       if (c >= 0x80
2369           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2370         break;
2371       src++;
2372     }
2373
2374   if (src >= src_end)
2375     /* We found nothing other than ASCII.  There's nothing to do.  */
2376     return CODING_CATEGORY_MASK_ANY;
2377
2378   /* The text seems to be encoded in some multilingual coding system.
2379      Now, try to find in which coding system the text is encoded.  */
2380   if (c < 0x80)
2381     /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2382     /* C is an ISO2022 specific control code of C0.  */
2383     mask = detect_coding_iso2022 (src, src_end);
2384
2385   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2386     /* C is an ISO2022 specific control code of C1,
2387        or the first byte of SJIS's 2-byte character code,
2388        or a leading code of Emacs.  */
2389     mask = (detect_coding_iso2022 (src, src_end)
2390             | detect_coding_sjis (src, src_end)
2391             | detect_coding_internal (src, src_end));
2392
2393   else if (c < 0xA0)
2394     /* C is the first byte of SJIS character code,
2395        or a leading-code of Emacs.  */
2396     mask = (detect_coding_sjis (src, src_end)
2397             | detect_coding_internal (src, src_end));
2398
2399   else
2400     /* C is a character of ISO2022 in graphic plane right,
2401        or a SJIS's 1-byte character code (i.e. JISX0201),
2402        or the first byte of BIG5's 2-byte code.  */
2403     mask = (detect_coding_iso2022 (src, src_end)
2404             | detect_coding_sjis (src, src_end)
2405             | detect_coding_big5 (src, src_end));
2406
2407   return mask;
2408 }
2409
2410 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2411    The information of the detected coding system is set in CODING.  */
2412
2413 void
2414 detect_coding (coding, src, src_bytes)
2415      struct coding_system *coding;
2416      unsigned char *src;
2417      int src_bytes;
2418 {
2419   int mask = detect_coding_mask (src, src_bytes);
2420   int idx;
2421
2422   if (mask == CODING_CATEGORY_MASK_ANY)
2423     /* We found nothing other than ASCII.  There's nothing to do.  */
2424     return;
2425
2426   if (!mask)
2427     /* The source text seems to be encoded in unknown coding system.
2428        Emacs regards the category of such a kind of coding system as
2429        `coding-category-binary'.  We assume that a user has assigned
2430        an appropriate coding system for a `coding-category-binary'.  */
2431     idx = CODING_CATEGORY_IDX_BINARY;
2432   else
2433     {
2434       /* We found some plausible coding systems.  Let's use a coding
2435          system of the highest priority.  */
2436       Lisp_Object val = Vcoding_category_list;
2437
2438       if (CONSP (val))
2439         while (!NILP (val))
2440           {
2441             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2442             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2443               break;
2444             val = XCONS (val)->cdr;
2445           }
2446       else
2447         val = Qnil;
2448
2449       if (NILP (val))
2450         {
2451           /* For unknown reason, `Vcoding_category_list' contains none
2452              of found categories.  Let's use any of them.  */
2453           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2454             if (mask & (1 << idx))
2455               break;
2456         }
2457     }
2458   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2459 }
2460
2461 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2462    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2463    CODING_EOL_CR, and CODING_EOL_AUTOMATIC.  */
2464
2465 int
2466 detect_eol_type (src, src_bytes)
2467      unsigned char *src;
2468      int src_bytes;
2469 {
2470   unsigned char *src_end = src + src_bytes;
2471   unsigned char c;
2472
2473   while (src < src_end)
2474     {
2475       c = *src++;
2476       if (c == '\n')
2477         return CODING_EOL_LF;
2478       else if (c == '\r')
2479         {
2480           if (src < src_end && *src == '\n')
2481             return CODING_EOL_CRLF;
2482           else
2483             return CODING_EOL_CR;
2484         }
2485     }
2486   return CODING_EOL_AUTOMATIC;
2487 }
2488
2489 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2490    is encoded.  If it detects an appropriate format of end-of-line, it
2491    sets the information in *CODING.  */
2492
2493 void
2494 detect_eol (coding, src, src_bytes)
2495      struct coding_system *coding;
2496      unsigned char *src;
2497      int src_bytes;
2498 {
2499   Lisp_Object val;
2500   int eol_type = detect_eol_type (src, src_bytes);
2501
2502   if (eol_type == CODING_EOL_AUTOMATIC)
2503     /*  We found no end-of-line in the source text.  */
2504     return;
2505
2506   val = Fget (coding->symbol, Qeol_type);
2507   if (VECTORP (val) && XVECTOR (val)->size == 3)
2508     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2509 }
2510
2511 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2512    decoding, it may detect coding system and format of end-of-line if
2513    those are not yet decided.  */
2514
2515 int
2516 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2517      struct coding_system *coding;
2518      unsigned char *source, *destination;
2519      int src_bytes, dst_bytes;
2520      int *consumed;
2521 {
2522   int produced;
2523
2524   if (src_bytes <= 0)
2525     {
2526       *consumed = 0;
2527       return 0;
2528     }
2529
2530   if (coding->type == coding_type_automatic)
2531     detect_coding (coding, source, src_bytes);
2532
2533   if (coding->eol_type == CODING_EOL_AUTOMATIC)
2534     detect_eol (coding, source, src_bytes);
2535
2536   coding->carryover_size = 0;
2537   switch (coding->type)
2538     {
2539     case coding_type_no_conversion:
2540     label_no_conversion:
2541       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2542       bcopy (source, destination, produced);
2543       *consumed = produced;
2544       break;
2545
2546     case coding_type_internal:
2547     case coding_type_automatic:
2548       if (coding->eol_type == CODING_EOL_LF
2549           ||  coding->eol_type == CODING_EOL_AUTOMATIC)
2550         goto label_no_conversion;
2551       produced = decode_eol (coding, source, destination,
2552                              src_bytes, dst_bytes, consumed);
2553       break;
2554
2555     case coding_type_sjis:
2556       produced = decode_coding_sjis_big5 (coding, source, destination,
2557                                           src_bytes, dst_bytes, consumed,
2558                                           1);
2559       break;
2560
2561     case coding_type_iso2022:
2562       produced = decode_coding_iso2022 (coding, source, destination,
2563                                         src_bytes, dst_bytes, consumed);
2564       break;
2565
2566     case coding_type_big5:
2567       produced = decode_coding_sjis_big5 (coding, source, destination,
2568                                           src_bytes, dst_bytes, consumed,
2569                                           0);
2570       break;
2571
2572     case coding_type_ccl:
2573       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2574                              src_bytes, dst_bytes, consumed);
2575       break;
2576     }
2577
2578   return produced;
2579 }
2580
2581 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2582
2583 int
2584 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2585      struct coding_system *coding;
2586      unsigned char *source, *destination;
2587      int src_bytes, dst_bytes;
2588      int *consumed;
2589 {
2590   int produced;
2591
2592   coding->carryover_size = 0;
2593   switch (coding->type)
2594     {
2595     case coding_type_no_conversion:
2596     label_no_conversion:
2597       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2598       if (produced > 0)
2599         {
2600           bcopy (source, destination, produced);
2601           if (coding->selective)
2602             {
2603               unsigned char *p = destination, *pend = destination + produced;
2604               while (p < pend)
2605                 if (*p++ == '\015') p[-1] = '\n';
2606             }
2607         }
2608       *consumed = produced;
2609       break;
2610
2611     case coding_type_internal:
2612     case coding_type_automatic:
2613       if (coding->eol_type == CODING_EOL_LF
2614           ||  coding->eol_type == CODING_EOL_AUTOMATIC)
2615         goto label_no_conversion;
2616       produced = encode_eol (coding, source, destination,
2617                              src_bytes, dst_bytes, consumed);
2618       break;
2619
2620     case coding_type_sjis:
2621       produced = encode_coding_sjis_big5 (coding, source, destination,
2622                                           src_bytes, dst_bytes, consumed,
2623                                           1);
2624       break;
2625
2626     case coding_type_iso2022:
2627       produced = encode_coding_iso2022 (coding, source, destination,
2628                                         src_bytes, dst_bytes, consumed);
2629       break;
2630
2631     case coding_type_big5:
2632       produced = encode_coding_sjis_big5 (coding, source, destination,
2633                                           src_bytes, dst_bytes, consumed,
2634                                           0);
2635       break;
2636
2637     case coding_type_ccl:
2638       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2639                              src_bytes, dst_bytes, consumed);
2640       break;
2641     }
2642
2643   return produced;
2644 }
2645
2646 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2647
2648 /* Return maximum size (bytes) of a buffer enough for decoding
2649    SRC_BYTES of text encoded in CODING.  */
2650
2651 int
2652 decoding_buffer_size (coding, src_bytes)
2653      struct coding_system *coding;
2654      int src_bytes;
2655 {
2656   int magnification;
2657
2658   if (coding->type == coding_type_iso2022)
2659     magnification = 3;
2660   else if (coding->type == coding_type_ccl)
2661     magnification = coding->spec.ccl.decoder.buf_magnification;
2662   else
2663     magnification = 2;
2664
2665   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2666 }
2667
2668 /* Return maximum size (bytes) of a buffer enough for encoding
2669    SRC_BYTES of text to CODING.  */
2670
2671 int
2672 encoding_buffer_size (coding, src_bytes)
2673      struct coding_system *coding;
2674      int src_bytes;
2675 {
2676   int magnification;
2677
2678   if (coding->type == coding_type_ccl)
2679     magnification = coding->spec.ccl.encoder.buf_magnification;
2680   else
2681     magnification = 3;
2682
2683   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2684 }
2685
2686 #ifndef MINIMUM_CONVERSION_BUFFER_SIZE
2687 #define MINIMUM_CONVERSION_BUFFER_SIZE 1024
2688 #endif
2689
2690 char *conversion_buffer;
2691 int conversion_buffer_size;
2692
2693 /* Return a pointer to a SIZE bytes of buffer to be used for encoding
2694    or decoding.  Sufficient memory is allocated automatically.  If we
2695    run out of memory, return NULL.  */
2696
2697 char *
2698 get_conversion_buffer (size)
2699      int size;
2700 {
2701   if (size > conversion_buffer_size)
2702     {
2703       char *buf;
2704       int real_size = conversion_buffer_size * 2;
2705
2706       while (real_size < size) real_size *= 2;
2707       buf = (char *) xmalloc (real_size);
2708       xfree (conversion_buffer);
2709       conversion_buffer = buf;
2710       conversion_buffer_size = real_size;
2711     }
2712   return conversion_buffer;
2713 }
2714
2715 \f
2716 #ifdef emacs
2717 /*** 7. Emacs Lisp library functions ***/
2718
2719 DEFUN ("coding-system-vector", Fcoding_system_vector, Scoding_system_vector,
2720        1, 1, 0,
2721   "Return coding-vector of CODING-SYSTEM.\n\
2722 If CODING-SYSTEM is not a valid coding-system, return nil.")
2723   (obj)
2724      Lisp_Object obj;
2725 {
2726   while (SYMBOLP (obj) && !NILP (obj))
2727     obj = Fget (obj, Qcoding_system);
2728   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2729           ? Qnil : obj);
2730 }
2731
2732 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2733   "Return t if OBJECT is nil or a coding-system.\n\
2734 See document of make-coding-system for coding-system object.")
2735   (obj)
2736      Lisp_Object obj;
2737 {
2738   return ((NILP (obj) || !NILP (Fcoding_system_vector (obj))) ? Qt : Qnil);
2739 }
2740
2741 DEFUN ("read-non-nil-coding-system",
2742        Fread_non_nil_coding_system, Sread_non_nil_coding_system, 1, 1, 0,
2743   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2744   (prompt)
2745      Lisp_Object prompt;
2746 {
2747   Lisp_Object val;
2748   do {
2749     val = Fcompleting_read (prompt, Vobarray, Qcoding_system_vector,
2750                             Qt, Qnil, Qnil);
2751   } while (XSTRING (val)->size == 0);
2752   return (Fintern (val, Qnil));
2753 }
2754
2755 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2756   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2757   (prompt)
2758      Lisp_Object prompt;
2759 {
2760   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2761                                       Qt, Qnil, Qnil);
2762   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2763 }
2764
2765 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2766        1, 1, 0,
2767   "Check validity of CODING-SYSTEM.\n\
2768 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2769 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2770 The value of property should be a vector of length 5.")
2771   (coding_system)
2772      Lisp_Object coding_system;
2773 {
2774   CHECK_SYMBOL (coding_system, 0);
2775   if (!NILP (Fcoding_system_p (coding_system)))
2776     return coding_system;
2777   while (1)
2778     Fsignal (Qcoding_system_error, coding_system);
2779 }
2780
2781 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2782        2, 2, 0,
2783   "Detect coding-system of the text in the region between START and END.\n\
2784 Return a list of possible coding-systems ordered by priority.\n\
2785 If only ASCII characters are found, it returns `automatic-conversion'\n\
2786  or its subsidiary coding-system according to a detected end-of-line format.")
2787   (b, e)
2788      Lisp_Object b, e;
2789 {
2790   int coding_mask, eol_type;
2791   Lisp_Object val;
2792   int beg, end;
2793
2794   validate_region (&b, &e);
2795   beg = XINT (b), end = XINT (e);
2796   if (beg < GPT && end >= GPT) move_gap (end);
2797
2798   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2799   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2800
2801   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2802     {
2803       val = intern ("automatic-conversion");
2804       if (eol_type != CODING_EOL_AUTOMATIC)
2805         {
2806           Lisp_Object val2 = Fget (val, Qeol_type);
2807           if (VECTORP (val2))
2808             val = XVECTOR (val2)->contents[eol_type];
2809         }
2810     }
2811   else
2812     {
2813       Lisp_Object val2;
2814
2815       /* At first, gather possible coding-systems in VAL in a reverse
2816          order.  */
2817       val = Qnil;
2818       for (val2 = Vcoding_category_list;
2819            !NILP (val2);
2820            val2 = XCONS (val2)->cdr)
2821         {
2822           int idx
2823             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2824           if (coding_mask & (1 << idx))
2825             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
2826         }
2827
2828       /* Then, change the order of the list, while getting subsidiary
2829          coding-systems.  */
2830       val2 = val;
2831       val = Qnil;
2832       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
2833         {
2834           if (eol_type == CODING_EOL_AUTOMATIC)
2835             val = Fcons (XCONS (val2)->car, val);
2836           else
2837             {
2838               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
2839               if (VECTORP (val3))
2840                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
2841               else
2842                 val = Fcons (XCONS (val2)->car, val);
2843             }
2844         }
2845     }
2846
2847   return val;
2848 }
2849
2850 /* Scan text in the region between *BEGP and *ENDP, skip characters
2851    which we never have to encode to (iff ENCODEP is 1) or decode from
2852    coding system CODING at the head and tail, then set BEGP and ENDP
2853    to the addresses of start and end of the text we actually convert.  */
2854
2855 void
2856 shrink_conversion_area (begp, endp, coding, encodep)
2857      unsigned char **begp, **endp;
2858      struct coding_system *coding;
2859      int encodep;
2860 {
2861   register unsigned char *beg_addr = *begp, *end_addr = *endp;
2862
2863   if (coding->eol_type != CODING_EOL_LF
2864       && coding->eol_type != CODING_EOL_AUTOMATIC)
2865     /* Since we anyway have to convert end-of-line format, it is not
2866        worth skipping at most 100 bytes or so.  */
2867     return;
2868
2869   if (encodep)                  /* for encoding */
2870     {
2871       switch (coding->type)
2872         {
2873         case coding_type_no_conversion:
2874         case coding_type_internal:
2875         case coding_type_automatic:
2876           /* We need no conversion.  */
2877           *begp = *endp;
2878           return;
2879         case coding_type_ccl:
2880           /* We can't skip any data.  */
2881           return;
2882         case coding_type_iso2022:
2883           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2884             {
2885               unsigned char *bol = beg_addr;
2886               while (beg_addr < end_addr && *beg_addr < 0x80)
2887                 {
2888                   beg_addr++;
2889                   if (*(beg_addr - 1) == '\n')
2890                     bol = beg_addr;
2891                 }
2892               beg_addr = bol;
2893               goto label_skip_tail;
2894             }
2895           /* fall down ... */
2896         default:
2897           /* We can skip all ASCII characters at the head and tail.  */
2898           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2899         label_skip_tail:
2900           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2901           break;
2902         }
2903     }
2904   else                          /* for decoding */
2905     {
2906       switch (coding->type)
2907         {
2908         case coding_type_no_conversion:
2909           /* We need no conversion.  */
2910           *begp = *endp;
2911           return;
2912         case coding_type_internal:
2913           if (coding->eol_type == CODING_EOL_LF)
2914             {
2915               /* We need no conversion.  */
2916               *begp = *endp;
2917               return;
2918             }
2919           /* We can skip all but carriage-return.  */
2920           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
2921           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
2922           break;
2923         case coding_type_sjis:
2924         case coding_type_big5:
2925           /* We can skip all ASCII characters at the head.  */
2926           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
2927           /* We can skip all ASCII characters at the tail except for
2928              the second byte of SJIS or BIG5 code.  */
2929           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
2930           if (end_addr != *endp)
2931             end_addr++;
2932           break;
2933         case coding_type_ccl:
2934           /* We can't skip any data.  */
2935           return;
2936         default:                /* i.e. case coding_type_iso2022: */
2937           {
2938             unsigned char c;
2939
2940             /* We can skip all ASCII characters except for a few
2941                control codes at the head.  */
2942             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
2943                    && c != ISO_CODE_CR && c != ISO_CODE_SO
2944                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
2945               beg_addr++;
2946           }
2947           break;
2948         }
2949     }
2950   *begp = beg_addr;
2951   *endp = end_addr;
2952   return;
2953 }
2954
2955 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
2956    text between B and E.  B and E are buffer position.  */
2957
2958 Lisp_Object
2959 code_convert_region (b, e, coding, encodep)
2960      Lisp_Object b, e;
2961      struct coding_system *coding;
2962      int encodep;
2963 {
2964   int beg, end, len, consumed, produced;
2965   char *buf;
2966   unsigned char *begp, *endp;
2967   int pos = PT;
2968
2969   validate_region (&b, &e);
2970   beg = XINT (b), end = XINT (e);
2971   if (beg < GPT && end >= GPT)
2972     move_gap (end);
2973
2974   if (encodep && !NILP (coding->pre_write_conversion))
2975     {
2976       /* We must call a pre-conversion function which may put a new
2977          text to be converted in a new buffer.  */
2978       struct buffer *old = current_buffer, *new;
2979
2980       TEMP_SET_PT (beg);
2981       call2 (coding->pre_write_conversion, b, e);
2982       if (old != current_buffer)
2983         {
2984           /* Replace the original text by the text just generated.  */
2985           len = ZV - BEGV;
2986           new = current_buffer;
2987           set_buffer_internal (old);
2988           del_range (beg, end);
2989           insert_from_buffer (new, 1, len, 0);
2990           end = beg + len;
2991         }
2992     }
2993
2994   /* We may be able to shrink the conversion region.  */
2995   begp = POS_ADDR (beg); endp = begp + (end - beg);
2996   shrink_conversion_area (&begp, &endp, coding, encodep);
2997
2998   if (begp == endp)
2999     /* We need no conversion.  */
3000     len = end - beg;
3001   else
3002     {
3003       beg += begp - POS_ADDR (beg);
3004       end =  beg + (endp - begp);
3005
3006       if (encodep)
3007         len = encoding_buffer_size (coding, end - beg);
3008       else
3009         len = decoding_buffer_size (coding, end - beg);
3010       buf = get_conversion_buffer (len);
3011
3012       coding->last_block = 1;
3013       produced = (encodep
3014                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3015                                    &consumed)
3016                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3017                                    &consumed));
3018
3019       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3020
3021       TEMP_SET_PT (beg);
3022       insert (buf, produced);
3023       del_range (PT, PT + end - beg);
3024       if (pos >= end)
3025         pos = PT + (pos - end);
3026       else if (pos > beg)
3027         pos = beg;
3028       TEMP_SET_PT (pos);
3029   }
3030
3031   if (!encodep && !NILP (coding->post_read_conversion))
3032     {
3033       /* We must call a post-conversion function which may alter
3034          the text just converted.  */
3035       Lisp_Object insval;
3036
3037       beg = XINT (b);
3038       TEMP_SET_PT (beg);
3039       insval = call1 (coding->post_read_conversion, make_number (len));
3040       CHECK_NUMBER (insval, 0);
3041       len = XINT (insval);
3042     }
3043
3044   return make_number (len);
3045 }
3046
3047 Lisp_Object
3048 code_convert_string (str, coding, encodep, nocopy)
3049      Lisp_Object str, nocopy;
3050      struct coding_system *coding;
3051      int encodep;
3052 {
3053   int len, consumed, produced;
3054   char *buf;
3055   unsigned char *begp, *endp;
3056   int head_skip, tail_skip;
3057   struct gcpro gcpro1;
3058
3059   if (encodep && !NILP (coding->pre_write_conversion)
3060       || !encodep && !NILP (coding->post_read_conversion))
3061     {
3062       /* Since we have to call Lisp functions which assume target text
3063          is in a buffer, after setting a temporary buffer, call
3064          code_convert_region.  */
3065       int count = specpdl_ptr - specpdl;
3066       int len = XSTRING (str)->size;
3067       Lisp_Object result;
3068       struct buffer *old = current_buffer;
3069
3070       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3071       temp_output_buffer_setup (" *code-converting-work*");
3072       set_buffer_internal (XBUFFER (Vstandard_output));
3073       insert_from_string (str, 0, len, 0);
3074       code_convert_region (make_number (BEGV), make_number (ZV),
3075                            coding, encodep);
3076       result = make_buffer_string (BEGV, ZV, 0);
3077       set_buffer_internal (old);
3078       return unbind_to (count, result);
3079     }
3080
3081   /* We may be able to shrink the conversion region.  */
3082   begp = XSTRING (str)->data;
3083   endp = begp + XSTRING (str)->size;
3084   shrink_conversion_area (&begp, &endp, coding, encodep);
3085
3086   if (begp == endp)
3087     /* We need no conversion.  */
3088     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3089
3090   head_skip = begp - XSTRING (str)->data;
3091   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3092
3093   GCPRO1 (str);
3094
3095   if (encodep)
3096     len = encoding_buffer_size (coding, endp - begp);
3097   else
3098     len = decoding_buffer_size (coding, endp - begp);
3099   buf = get_conversion_buffer (len + head_skip + tail_skip);
3100
3101   bcopy (XSTRING (str)->data, buf, head_skip);
3102   coding->last_block = 1;
3103   produced = (encodep
3104               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3105                                buf + head_skip, endp - begp, len, &consumed)
3106               : decode_coding (coding, XSTRING (str)->data + head_skip,
3107                                buf + head_skip, endp - begp, len, &consumed));
3108   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3109          buf + head_skip + produced,
3110          tail_skip);
3111
3112   UNGCPRO;
3113
3114   return make_string (buf, head_skip + produced + tail_skip);
3115 }
3116
3117 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3118        3, 3, "r\nzCoding system: ",
3119   "Decode current region by specified coding system.\n\
3120 When called from a program, takes three arguments:\n\
3121 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3122 Return length of decoded text.")
3123   (b, e, coding_system)
3124      Lisp_Object b, e, coding_system;
3125 {
3126   struct coding_system coding;
3127
3128   CHECK_NUMBER_COERCE_MARKER (b, 0);
3129   CHECK_NUMBER_COERCE_MARKER (e, 1);
3130   CHECK_SYMBOL (coding_system, 2);
3131
3132   if (NILP (coding_system))
3133     return make_number (XFASTINT (e) - XFASTINT (b));
3134   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3135     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3136
3137   return code_convert_region (b, e, &coding, 0);
3138 }
3139
3140 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3141        3, 3, "r\nzCoding system: ",
3142   "Encode current region by specified coding system.\n\
3143 When called from a program, takes three arguments:\n\
3144 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3145 Return length of encoded text.")
3146   (b, e, coding_system)
3147      Lisp_Object b, e, coding_system;
3148 {
3149   struct coding_system coding;
3150
3151   CHECK_NUMBER_COERCE_MARKER (b, 0);
3152   CHECK_NUMBER_COERCE_MARKER (e, 1);
3153   CHECK_SYMBOL (coding_system, 2);
3154
3155   if (NILP (coding_system))
3156     return make_number (XFASTINT (e) - XFASTINT (b));
3157   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3158     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3159
3160   return code_convert_region (b, e, &coding, 1);
3161 }
3162
3163 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3164        2, 3, 0,
3165   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3166 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3167 of decoding.")
3168   (string, coding_system, nocopy)
3169      Lisp_Object string, coding_system, nocopy;
3170 {
3171   struct coding_system coding;
3172
3173   CHECK_STRING (string, 0);
3174   CHECK_SYMBOL (coding_system, 1);
3175
3176   if (NILP (coding_system))
3177     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3178   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3179     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3180
3181   return code_convert_string (string, &coding, 0, nocopy);
3182 }
3183
3184 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3185        2, 3, 0,
3186   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3187 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3188 of encoding.")
3189   (string, coding_system, nocopy)
3190      Lisp_Object string, coding_system, nocopy;
3191 {
3192   struct coding_system coding;
3193
3194   CHECK_STRING (string, 0);
3195   CHECK_SYMBOL (coding_system, 1);
3196
3197   if (NILP (coding_system))
3198     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3199   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3200     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3201
3202   return code_convert_string (string, &coding, 1, nocopy);
3203 }
3204
3205 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3206   "Decode a JISX0208 character of shift-jis encoding.\n\
3207 CODE is the character code in SJIS.\n\
3208 Return the corresponding character.")
3209   (code)
3210      Lisp_Object code;
3211 {
3212   unsigned char c1, c2, s1, s2;
3213   Lisp_Object val;
3214
3215   CHECK_NUMBER (code, 0);
3216   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3217   DECODE_SJIS (s1, s2, c1, c2);
3218   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3219   return val;
3220 }
3221
3222 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3223   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3224 Return the corresponding character code in SJIS.")
3225   (ch)
3226      Lisp_Object ch;
3227 {
3228   int charset;
3229   unsigned char c1, c2, s1, s2;
3230   Lisp_Object val;
3231
3232   CHECK_NUMBER (ch, 0);
3233   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3234   if (charset == charset_jisx0208)
3235     {
3236       ENCODE_SJIS (c1, c2, s1, s2);
3237       XSETFASTINT (val, ((int)s1 << 8) | s2);
3238     }
3239   else
3240     XSETFASTINT (val, 0);
3241   return val;
3242 }
3243
3244 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3245   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3246 CODE is the character code in BIG5.\n\
3247 Return the corresponding character.")
3248   (code)
3249      Lisp_Object code;
3250 {
3251   int charset;
3252   unsigned char b1, b2, c1, c2;
3253   Lisp_Object val;
3254
3255   CHECK_NUMBER (code, 0);
3256   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3257   DECODE_BIG5 (b1, b2, charset, c1, c2);
3258   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3259   return val;
3260 }
3261
3262 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3263   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3264 Return the corresponding character code in Big5.")
3265   (ch)
3266      Lisp_Object ch;
3267 {
3268   int charset;
3269   unsigned char c1, c2, b1, b2;
3270   Lisp_Object val;
3271
3272   CHECK_NUMBER (ch, 0);
3273   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3274   if (charset == charset_big5_1 || charset == charset_big5_2)
3275     {
3276       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3277       XSETFASTINT (val, ((int)b1 << 8) | b2);
3278     }
3279   else
3280     XSETFASTINT (val, 0);
3281   return val;
3282 }
3283
3284 DEFUN ("set-terminal-coding-system",
3285        Fset_terminal_coding_system, Sset_terminal_coding_system, 1, 1,
3286        "zCoding-system for terminal display: ",
3287   "Set coding-system of your terminal to CODING-SYSTEM.\n\
3288 All outputs to terminal are encoded to this coding-system.")
3289   (coding_system)
3290      Lisp_Object coding_system;
3291 {
3292   CHECK_SYMBOL (coding_system, 0);
3293   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3294   update_mode_lines++;
3295   if (!NILP (Finteractive_p ()))
3296     Fredraw_display ();
3297   return Qnil;
3298 }
3299
3300 DEFUN ("terminal-coding-system",
3301        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3302   "Return coding-system of your terminal.")
3303   ()
3304 {
3305   return terminal_coding.symbol;
3306 }
3307
3308 DEFUN ("set-keyboard-coding-system",
3309        Fset_keyboard_coding_system, Sset_keyboard_coding_system, 1, 1,
3310        "zCoding-system for keyboard input: ",
3311   "Set coding-system of what is sent from terminal keyboard to CODING-SYSTEM.\n\
3312 All inputs from terminal are decoded from this coding-system.")
3313   (coding_system)
3314      Lisp_Object coding_system;
3315 {
3316   CHECK_SYMBOL (coding_system, 0);
3317   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3318   return Qnil;
3319 }
3320
3321 DEFUN ("keyboard-coding-system",
3322        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3323   "Return coding-system of what is sent from terminal keyboard.")
3324   ()
3325 {
3326   return keyboard_coding.symbol;
3327 }
3328
3329 \f
3330 DEFUN ("find-coding-system", Ffind_coding_system, Sfind_coding_system,
3331        1, MANY, 0,
3332   "Choose a coding system for a file operation based on file name.\n\
3333 The value names a pair of coding systems: (ENCODING-SYSTEM DECODING-SYSTEM).\n\
3334 ENCODING-SYSTEM is the coding system to use for encoding\n\
3335 \(in case OPERATION does encoding), and DECODING-SYSTEM is the coding system\n\
3336 for decoding (in case OPERATION does decoding).\n\
3337 \n\
3338 The first argument OPERATION specifies an I/O primitive:\n\
3339   For file I/O, `insert-file-contents' or `write-region'.\n\
3340   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3341   For network I/O, `open-network-stream'.\n\
3342 \n\
3343 The remaining arguments should be the same arguments that were passed\n\
3344 to the primitive.  Depending on which primitive, one of those arguments\n\
3345 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3346 whichever argument specifies the file name is TARGET.\n\
3347 \n\
3348 TARGET has a meaning which depends on OPERATION:\n\
3349   For file I/O, TARGET is a file name.\n\
3350   For process I/O, TARGET is a process name.\n\
3351   For network I/O, TARGET is a service name or a port number\n\
3352 \n\
3353 This function looks up what `coding-system-alist' specifies for\n\
3354 OPERATION and TARGET.  It may specify a cons cell which represents\n\
3355 a particular coding system or it may have a function to call.\n\
3356 In the latter case, we call the function with one argument,\n\
3357 which is a list of all the arguments given to `find-coding-system'.")
3358   (nargs, args)
3359      int nargs;
3360      Lisp_Object *args;
3361 {
3362   Lisp_Object operation, target_idx, target, val;
3363   register Lisp_Object chain;
3364
3365   if (nargs < 2)
3366     error ("Too few arguments");
3367   operation = args[0];
3368   if (!SYMBOLP (operation)
3369       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3370     error ("Invalid first arguement");
3371   if (nargs < 1 + XINT (target_idx))
3372     error ("Too few arguments for operation: %s",
3373            XSYMBOL (operation)->name->data);
3374   target = args[XINT (target_idx) + 1];
3375   if (!(STRINGP (target)
3376         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3377     error ("Invalid %dth argument", XINT (target_idx) + 1);
3378
3379   chain = Fassq (operation, Vcoding_system_alist);
3380   if (NILP (chain))
3381     return Qnil;
3382
3383   for (chain = XCONS (chain)->cdr; CONSP (chain); chain = XCONS (chain)->cdr)
3384     {
3385       Lisp_Object elt = XCONS (chain)->car;
3386
3387       if (CONSP (elt)
3388           && ((STRINGP (target)
3389                && STRINGP (XCONS (elt)->car)
3390                && fast_string_match (XCONS (elt)->car, target) >= 0)
3391               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3392         return (CONSP (val = XCONS (elt)->cdr)
3393                 ? val
3394                 : ((SYMBOLP (val) && Fboundp (val)
3395                     ? call2 (val, Flist (nargs, args))
3396                     : Qnil)));
3397     }
3398   return Qnil;
3399 }
3400
3401 #endif /* emacs */
3402
3403 \f
3404 /*** 8. Post-amble ***/
3405
3406 init_coding_once ()
3407 {
3408   int i;
3409
3410   /* Emacs internal format specific initialize routine.  */
3411   for (i = 0; i <= 0x20; i++)
3412     emacs_code_class[i] = EMACS_control_code;
3413   emacs_code_class[0x0A] = EMACS_linefeed_code;
3414   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3415   for (i = 0x21 ; i < 0x7F; i++)
3416     emacs_code_class[i] = EMACS_ascii_code;
3417   emacs_code_class[0x7F] = EMACS_control_code;
3418   emacs_code_class[0x80] = EMACS_leading_code_composition;
3419   for (i = 0x81; i < 0xFF; i++)
3420     emacs_code_class[i] = EMACS_invalid_code;
3421   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3422   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3423   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3424   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3425
3426   /* ISO2022 specific initialize routine.  */
3427   for (i = 0; i < 0x20; i++)
3428     iso_code_class[i] = ISO_control_code;
3429   for (i = 0x21; i < 0x7F; i++)
3430     iso_code_class[i] = ISO_graphic_plane_0;
3431   for (i = 0x80; i < 0xA0; i++)
3432     iso_code_class[i] = ISO_control_code;
3433   for (i = 0xA1; i < 0xFF; i++)
3434     iso_code_class[i] = ISO_graphic_plane_1;
3435   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3436   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3437   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3438   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3439   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3440   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3441   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3442   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3443   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3444   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3445
3446   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3447   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3448
3449   setup_coding_system (Qnil, &keyboard_coding);
3450   setup_coding_system (Qnil, &terminal_coding);
3451 }
3452
3453 #ifdef emacs
3454
3455 syms_of_coding ()
3456 {
3457   Qtarget_idx = intern ("target-idx");
3458   staticpro (&Qtarget_idx);
3459
3460   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3461   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3462
3463   Qcall_process = intern ("call-process");
3464   staticpro (&Qcall_process);
3465   Fput (Qcall_process, Qtarget_idx, make_number (0));
3466
3467   Qcall_process_region = intern ("call-process-region");
3468   staticpro (&Qcall_process_region);
3469   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3470
3471   Qstart_process = intern ("start-process");
3472   staticpro (&Qstart_process);
3473   Fput (Qstart_process, Qtarget_idx, make_number (2));
3474
3475   Qopen_network_stream = intern ("open-network-stream");
3476   staticpro (&Qopen_network_stream);
3477   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3478
3479   Qcoding_system = intern ("coding-system");
3480   staticpro (&Qcoding_system);
3481
3482   Qeol_type = intern ("eol-type");
3483   staticpro (&Qeol_type);
3484
3485   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3486   staticpro (&Qbuffer_file_coding_system);
3487
3488   Qpost_read_conversion = intern ("post-read-conversion");
3489   staticpro (&Qpost_read_conversion);
3490
3491   Qpre_write_conversion = intern ("pre-write-conversion");
3492   staticpro (&Qpre_write_conversion);
3493
3494   Qcoding_system_vector = intern ("coding-system-vector");
3495   staticpro (&Qcoding_system_vector);
3496
3497   Qcoding_system_p = intern ("coding-system-p");
3498   staticpro (&Qcoding_system_p);
3499
3500   Qcoding_system_error = intern ("coding-system-error");
3501   staticpro (&Qcoding_system_error);
3502
3503   Fput (Qcoding_system_error, Qerror_conditions,
3504         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3505   Fput (Qcoding_system_error, Qerror_message,
3506         build_string ("Coding-system error"));
3507
3508   Qcoding_category_index = intern ("coding-category-index");
3509   staticpro (&Qcoding_category_index);
3510
3511   {
3512     int i;
3513     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3514       {
3515         coding_category_table[i] = intern (coding_category_name[i]);
3516         staticpro (&coding_category_table[i]);
3517         Fput (coding_category_table[i], Qcoding_category_index,
3518               make_number (i));
3519       }
3520   }
3521
3522   defsubr (&Scoding_system_vector);
3523   defsubr (&Scoding_system_p);
3524   defsubr (&Sread_coding_system);
3525   defsubr (&Sread_non_nil_coding_system);
3526   defsubr (&Scheck_coding_system);
3527   defsubr (&Sdetect_coding_region);
3528   defsubr (&Sdecode_coding_region);
3529   defsubr (&Sencode_coding_region);
3530   defsubr (&Sdecode_coding_string);
3531   defsubr (&Sencode_coding_string);
3532   defsubr (&Sdecode_sjis_char);
3533   defsubr (&Sencode_sjis_char);
3534   defsubr (&Sdecode_big5_char);
3535   defsubr (&Sencode_big5_char);
3536   defsubr (&Sset_terminal_coding_system);
3537   defsubr (&Sterminal_coding_system);
3538   defsubr (&Sset_keyboard_coding_system);
3539   defsubr (&Skeyboard_coding_system);
3540   defsubr (&Sfind_coding_system);
3541
3542   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3543     "List of coding-categories (symbols) ordered by priority.");
3544   {
3545     int i;
3546
3547     Vcoding_category_list = Qnil;
3548     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3549       Vcoding_category_list
3550         = Fcons (coding_category_table[i], Vcoding_category_list);
3551   }
3552
3553   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3554     "A variable of internal use only.\n\
3555 If the value is a coding system, it is used for decoding on read operation.\n\
3556 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3557   Vcoding_system_for_read = Qnil;
3558
3559   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3560     "A variable of internal use only.\n\
3561 If the value is a coding system, it is used for encoding on write operation.\n\
3562 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3563   Vcoding_system_for_write = Qnil;
3564
3565   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3566     "Coding-system used in the latest file or process I/O.");
3567   Vlast_coding_system_used = Qnil;
3568
3569   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
3570     "Nested alist to decide a coding system for a specific I/O operation.\n\
3571 The format is ((OPERATION . ((REGEXP . CODING-SYSTEMS) ...)) ...).\n\
3572 \n\
3573 OPERATION is one of the following Emacs I/O primitives:\n\
3574   For file I/O, insert-file-contents and write-region.\n\
3575   For process I/O, call-process, call-process-region, and start-process.\n\
3576   For network I/O, open-network-stream.\n\
3577 In addition, for process I/O, `process-argument' can be specified for\n\
3578 encoding arguments of the process.\n\
3579 \n\
3580 REGEXP is a regular expression matching a target of OPERATION, where\n\
3581 target is a file name for file I/O operations, a process name for\n\
3582 process I/O operations, or a service name for network I/O\n\
3583 operations.  REGEXP might be a port number for network I/O operation.\n\
3584 \n\
3585 CODING-SYSTEMS is a cons of coding systems to encode and decode\n\
3586 character code on OPERATION, or a function symbol returning the cons.\n\
3587 See the documentation of `find-coding-system' for more detail.");
3588   Vcoding_system_alist = Qnil;
3589
3590   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3591     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3592   eol_mnemonic_unix = '.';
3593
3594   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3595     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3596   eol_mnemonic_dos = ':';
3597
3598   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3599     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3600   eol_mnemonic_mac = '\'';
3601
3602   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3603     "Mnemonic character indicating end-of-line format is not yet decided.");
3604   eol_mnemonic_undecided = '-';
3605
3606   DEFVAR_LISP ("alternate-charset-table", &Valternate_charset_table,
3607     "Alist of charsets vs the alternate charsets.\n\
3608 While decoding, if a charset (car part of an element) is found,\n\
3609 decode it as the alternate charset (cdr part of the element).");
3610   Valternate_charset_table = Qnil;
3611
3612   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3613     "Alist of charsets vs revision numbers.\n\
3614 While encoding, if a charset (car part of an element) is found,\n\
3615 designate it with the escape sequence identifing revision (cdr part of the element).");
3616   Vcharset_revision_alist = Qnil;
3617 }
3618
3619 #endif /* emacs */