src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Other
  71
  72   If a user wants to read/write a text encoded in a coding system not
  73   listed above, he can supply a decoder and an encoder for it in CCL
  74   (Code Conversion Language) programs.  Emacs executes the CCL program
  75   while reading/writing.
  76
  77   Emacs represents a coding-system by a Lisp symbol that has a property
  78   `coding-system'.  But, before actually using the coding-system, the
  79   information about it is set in a structure of type `struct
  80   coding_system' for rapid processing.  See section 6 for more details.
  81
  82 */
  83
  84 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  85
  86   How the end-of-line of a text is encoded depends on the system.  For
  87   instance, Unix's format is just one byte of `line-feed' code,
  88   whereas DOS's format is two-byte sequence of `carriage-return' and
  89   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  90
  91   Since text characters encoding and end-of-line encoding are
  92   independent, any coding system described above can take
  93   any format of end-of-line.  So, Emacs has information of format of
  94   end-of-line in each coding-system.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
  99
 100   These functions check if a text between SRC and SRC_END is encoded
 101   in the coding system category XXX.  Each returns an integer value in
 102   which appropriate flag bits for the category XXX are set.  The flag
 103   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 104   template of these functions.  */
 105 #if 0
 106 int
 107 detect_coding_emacs_mule (src, src_end)
 108      unsigned char *src, *src_end;
 109 {
 110   ...
 111 }
 112 #endif
 113
 114 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 115
 116   These functions decode SRC_BYTES length text at SOURCE encoded in
 117   CODING to Emacs' internal format (emacs-mule).  The resulting text
 118   goes to a place pointed to by DESTINATION, the length of which should
 119   not exceed DST_BYTES.  The number of bytes actually processed is
 120   returned as *CONSUMED.  The return value is the length of the decoded
 121   text.  Below is a template of these functions.  */
 122 #if 0
 123 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 124      struct coding_system *coding;
 125      unsigned char *source, *destination;
 126      int src_bytes, dst_bytes;
 127      int *consumed;
 128 {
 129   ...
 130 }
 131 #endif
 132
 133 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 134
 135   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 136   internal format (emacs-mule) to CODING.  The resulting text goes to
 137   a place pointed to by DESTINATION, the length of which should not
 138   exceed DST_BYTES.  The number of bytes actually processed is
 139   returned as *CONSUMED.  The return value is the length of the
 140   encoded text.  Below is a template of these functions.  */
 141 #if 0
 142 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 143      struct coding_system *coding;
 144      unsigned char *source, *destination;
 145      int src_bytes, dst_bytes;
 146      int *consumed;
 147 {
 148   ...
 149 }
 150 #endif
 151
 152 /*** COMMONLY USED MACROS ***/
 153
 154 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 155    THREE_MORE_BYTES fetch one, two, and three bytes respectively from
 156    the source text into their arguments, advancing `src' past them.  If
 157    fewer bytes remain before `src_end', nothing is consumed and control
 158    jumps to `label_end_of_loop', which the caller must provide.  */
 159
 160 #define ONE_MORE_BYTE(c1)       \
 161   do {                          \
 162     if (src < src_end)          \
 163       c1 = *src++;              \
 164     else                        \
 165       goto label_end_of_loop;   \
 166   } while (0)
 167
 168 #define TWO_MORE_BYTES(c1, c2)  \
 169   do {                          \
 170     if (src + 1 < src_end)      \
 171       c1 = *src++, c2 = *src++; \
 172     else                        \
 173       goto label_end_of_loop;   \
 174   } while (0)
 175
 176 #define THREE_MORE_BYTES(c1, c2, c3)            \
 177   do {                                          \
 178     if (src + 2 < src_end)                      \
 179       c1 = *src++, c2 = *src++, c3 = *src++;    \
 180     else                                        \
 181       goto label_end_of_loop;                   \
 182   } while (0)
 183
 184 /* The following three macros DECODE_CHARACTER_ASCII,
 185    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 store
 186    the multi-byte form of a character of each class at the place
 187    pointed to by `dst' and advance `dst' past it.  The caller should set
 188    `dst' to point to an appropriate area and `coding' to point to the
 189    coding-system of the text being decoded in advance.  */
 190
 191 /* Decode one ASCII character C (as 0xA0, C | 0x80 while composing).  */
 192
 193 #define DECODE_CHARACTER_ASCII(c)                               \
 194   do {                                                          \
 195     if (COMPOSING_P (coding->composing))                        \
 196       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 197     else                                                        \
 198       *dst++ = (c);                                             \
 199   } while (0)
 200
 201 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 202    position-code is C (extended leading-code is emitted only if nonzero).  */
 203
 204 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 205   do {                                                                  \
 206     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 207     if (COMPOSING_P (coding->composing))                                \
 208       *dst++ = leading_code + 0x20;                                     \
 209     else                                                                \
 210       *dst++ = leading_code;                                            \
 211     if ((leading_code = CHARSET_LEADING_CODE_EXT (charset)) != 0)       \
 212       *dst++ = leading_code;                                            \
 213     *dst++ = (c) | 0x80;                                                \
 214   } while (0)
 215
 216 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 217    position-codes are C1 and C2 (DIMENSION1 form followed by C2 | 0x80).  */
 218
 219 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 220   do {                                                  \
 221     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 222     *dst++ = (c2) | 0x80;                               \
 223   } while (0)
 224
 225 \f
 226 /*** 1. Preamble ***/
 227
 228 #include <stdio.h>
 229
 230 #ifdef emacs
 231
 232 #include <config.h>
 233 #include "lisp.h"
 234 #include "buffer.h"
 235 #include "charset.h"
 236 #include "ccl.h"
 237 #include "coding.h"
 238 #include "window.h"
 239
 240 #else  /* not emacs */
 241
 242 #include "mulelib.h"
 243
 244 #endif /* not emacs */
 245
 246 Lisp_Object Qcoding_system, Qeol_type;
 247 Lisp_Object Qbuffer_file_coding_system;
 248 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 249
 250 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 251 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 252 Lisp_Object Qstart_process, Qopen_network_stream;
 253 Lisp_Object Qtarget_idx;
 254
 255 /* Mnemonic characters for the Unix, DOS, and Mac end-of-line formats.  */
 256 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 257 /* Mnemonic character indicating that the end-of-line format is not
 258    yet decided.  */
 259 int eol_mnemonic_undecided;
 260
 261 /* End-of-line format decided by the system.  This is CODING_EOL_LF on
 262    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 263 int system_eol_type;
 264
 265 #ifdef emacs
 266
 267 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 268
 269 /* The coding system emacs-mule converts only the end-of-line format.  */
 270 Lisp_Object Qemacs_mule;
 271
 272 /* Coding-systems are passed between Emacs Lisp programs and C internal
 273    routines through the following three variables.  */
 274 /* Coding-system for reading files and receiving data from a process.  */
 275 Lisp_Object Vcoding_system_for_read;
 276 /* Coding-system for writing files and sending data to a process.  */
 277 Lisp_Object Vcoding_system_for_write;
 278 /* Coding-system actually used in the latest I/O.  */
 279 Lisp_Object Vlast_coding_system_used;
 280
 281 /* Flag to inhibit code conversion of the end-of-line format.  */
 282 int inhibit_eol_conversion;
 283
 284 /* Coding-system that the terminal accepts for display.  */
 285 struct coding_system terminal_coding;
 286
 287 /* Coding-system of what is sent from the terminal keyboard.  */
 288 struct coding_system keyboard_coding;
 289
 290 Lisp_Object Vfile_coding_system_alist;
 291 Lisp_Object Vprocess_coding_system_alist;
 292 Lisp_Object Vnetwork_coding_system_alist;
 293
 294 #endif /* emacs */
 295
 296 Lisp_Object Qcoding_category_index;
 297
 298 /* List of symbols `coding-category-xxx' ordered by priority.  */
 299 Lisp_Object Vcoding_category_list;
 300
 301 /* Table of coding-systems currently assigned to each coding-category.  */
 302 Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 303
 304 /* Table of the symbol name of each coding-category.  */
 305 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 306   "coding-category-emacs-mule",
 307   "coding-category-sjis",
 308   "coding-category-iso-7",
 309   "coding-category-iso-8-1",
 310   "coding-category-iso-8-2",
 311   "coding-category-iso-7-else",
 312   "coding-category-iso-8-else",
 313   "coding-category-big5",
 314   "coding-category-binary"
 315 };
 316
 317 /* Flag telling whether to look up the unification table on character
 318    code conversion.  */
 319 Lisp_Object Venable_character_unification;
 320 /* Standard unification table to look up on decoding (reading).  */
 321 Lisp_Object Vstandard_character_unification_table_for_decode;
 322 /* Standard unification table to look up on encoding (writing).  */
 323 Lisp_Object Vstandard_character_unification_table_for_encode;
 324
 325 Lisp_Object Qcharacter_unification_table;
 326 Lisp_Object Qcharacter_unification_table_for_decode;
 327 Lisp_Object Qcharacter_unification_table_for_encode;
 328
 329 /* Alist of charsets vs. revision numbers.  */
 330 Lisp_Object Vcharset_revision_alist;
 331
 332 /* Default coding systems used for process I/O.  */
 333 Lisp_Object Vdefault_process_coding_system;
 334
 335 \f
 336 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 337
 338 /* Emacs' internal format for encoding multiple character sets is a
 339    kind of multi-byte encoding, i.e. characters are encoded by
 340    variable-length sequences of one-byte codes.  ASCII characters
 341    and control characters (e.g. `tab', `newline') are represented by
 342    one-byte sequences which are their ASCII codes, in the range 0x00
 343    through 0x7F.  The other characters are represented by a sequence
 344    of `base leading-code', optional `extended leading-code', and one
 345    or two `position-code's.  The length of the sequence is determined
 346    by the base leading-code.  Leading-code takes the range 0x80
 347    through 0x9F, whereas extended leading-code and position-code take
 348    the range 0xA0 through 0xFF.  See `charset.h' for more details
 349    about leading-code and position-code.
 350
 351    There's one exception to this rule.  Special leading-code
 352    `leading-code-composition' denotes that the following several
 353    characters should be composed into one character.  Leading-codes of
 354    components (except for ASCII) are added 0x20.  An ASCII character
 355    component is represented by a 2-byte sequence of `0xA0' and
 356    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 357    details of composite character.  Hence, we can summarize the code
 358    range as follows:
 359
 360    --- CODE RANGE of Emacs' internal format ---
 361    (character set)      (range)
 362    ASCII                0x00 .. 0x7F
 363    ELSE (1st byte)      0x80 .. 0x9F
 364         (rest bytes)    0xA0 .. 0xFF
 365    ---------------------------------------------
 366
 367   */
 368
 369 enum emacs_code_class_type emacs_code_class[256];
 370
 371 /* Consume *SRC and go on if it is 0xA0 or greater; return 0 if it is
 372    smaller, or jump to `label_end_of_switch' if SRC is at SRC_END.  */
 373 #define CHECK_CODE_RANGE_A0_FF  \
 374   do {                          \
 375     if (src >= src_end)         \
 376       goto label_end_of_switch; \
 377     else if (*src++ < 0xA0)     \
 378       return 0;                 \
 379   } while (0)
 380
 381 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 382    Check if a text is encoded in Emacs' internal format.  If it is,
 383    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 384
 385 int
 386 detect_coding_emacs_mule (src, src_end)
 387      unsigned char *src, *src_end;
 388 {
 389   unsigned char c;
 390   int composing = 0;            /* Nonzero while inside a composition.  */
 391
 392   while (src < src_end)
 393     {
 394       c = *src++;
 395
 396       if (composing)
 397         {
 398           if (c < 0xA0)         /* Such a byte ends the composition.  */
 399             composing = 0;
 400           else                  /* Undo the 0x20 added to components.  */
 401             c -= 0x20;
 402         }
 403
 404       switch (emacs_code_class[c])
 405         {
 406         case EMACS_ascii_code:
 407         case EMACS_linefeed_code:
 408           break;
 409
 410         case EMACS_control_code:
 411           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 412             return 0;
 413           break;
 414
 415         case EMACS_invalid_code:
 416           return 0;
 417
 418         case EMACS_leading_code_composition: /* c == 0x80 */
 419           if (composing)
 420             CHECK_CODE_RANGE_A0_FF;
 421           else
 422             composing = 1;
 423           break;
 424
 425         case EMACS_leading_code_4:
 426           CHECK_CODE_RANGE_A0_FF;
 427           /* Fall through to check two more bytes ...  */
 428
 429         case EMACS_leading_code_3:
 430           CHECK_CODE_RANGE_A0_FF;
 431           /* Fall through to check one more byte ...  */
 432
 433         case EMACS_leading_code_2:
 434           CHECK_CODE_RANGE_A0_FF;
 435           break;
 436
 437         default:
 438         label_end_of_switch:    /* CHECK_CODE_RANGE_A0_FF ran out of source.  */
 439           break;
 440         }
 441     }
 442   return CODING_CATEGORY_MASK_EMACS_MULE;
 443 }
 444
 445 \f
 446 /*** 3. ISO2022 handlers ***/
 447
 448 /* The following note describes the coding system ISO2022 briefly.
 449    Since the intention of this note is to help in understanding of
 450    the programs in this file, some parts are NOT ACCURATE or OVERLY
 451    SIMPLIFIED.  For the thorough understanding, please refer to the
 452    original document of ISO2022.
 453
 454    ISO2022 provides many mechanisms to encode several character sets
 455    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 456    all text is encoded by codes of less than 128.  This may make the
 457    encoded text a little bit longer, but the text gets more stability
 458    to pass through several gateways (some of them strip off the MSB).
 459
 460    There are two kinds of character set: control character set and
 461    graphic character set.  The former contains control characters such
 462    as `newline' and `escape' to provide control functions (control
 463    functions are provided also by escape sequences).  The latter
 464    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 465    two control character sets and many graphic character sets.
 466
 467    Graphic character sets are classified into one of the following
 468    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 469    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 470    bytes (DIMENSION) and the number of characters in one dimension
 471    (CHARS) of the set.  In addition, each character set is assigned an
 472    identification tag (called "final character" and denoted as <F>
 473    hereafter) which is unique in each class.  <F> of each character
 474    set is decided by ECMA(*) when it is registered in ISO.  Code range
 475    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 476
 477    Note (*): ECMA = European Computer Manufacturers Association
 478
 479    Here are examples of graphic character set [NAME(<F>)]:
 480         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 481         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 482         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 483         o DIMENSION2_CHARS96 -- none for the moment
 484
 485    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 486         C0 [0x00..0x1F] -- control character plane 0
 487         GL [0x20..0x7F] -- graphic character plane 0
 488         C1 [0x80..0x9F] -- control character plane 1
 489         GR [0xA0..0xFF] -- graphic character plane 1
 490
 491    A control character set is directly designated and invoked to C0 or
 492    C1 by an escape sequence.  The most common case is that ISO646's
 493    control character set is designated/invoked to C0 and ISO6429's
 494    control character set is designated/invoked to C1, and usually
 495    these designations/invocations are omitted in a coded text.  With
 496    7-bit environment, only C0 can be used, and a control character for
 497    C1 is encoded by an appropriate escape sequence to fit in the
 498    environment.  All control characters for C1 have
 499    corresponding escape sequences defined.
 500
 501    A graphic character set is at first designated to one of four
 502    graphic registers (G0 through G3), then these graphic registers are
 503    invoked to GL or GR.  These designations and invocations can be
 504    done independently.  The most common case is that G0 is invoked to
 505    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 506    these invocations and designations are omitted in a coded text.
 507    With 7-bit environment, only GL can be used.
 508
 509    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 510    and 0x7F of GL area work as control characters SPACE and DEL
 511    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 512
 513    There are two ways of invocation: locking-shift and single-shift.
 514    With locking-shift, the invocation lasts until the next different
 515    invocation, whereas with single-shift, the invocation works only
 516    for the following character and doesn't affect locking-shift.
 517    Invocations are done by the following control characters or escape
 518    sequences.
 519
 520    ----------------------------------------------------------------------
 521    function             control char    escape sequence description
 522    ----------------------------------------------------------------------
 523    SI  (shift-in)               0x0F    none            invoke G0 to GL
 524    SO  (shift-out)              0x0E    none            invoke G1 to GL
 525    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 526    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 527    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 528    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 529    ----------------------------------------------------------------------
 530    The first four are for locking-shift.  Control characters for these
 531    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 532
 533    Designations are done by the following escape sequences.
 534    ----------------------------------------------------------------------
 535    escape sequence      description
 536    ----------------------------------------------------------------------
 537    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 538    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 539    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 540    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 541    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 542    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 543    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 544    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 545    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 546    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 547    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 548    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 549    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 550    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 551    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 552    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 553    ----------------------------------------------------------------------
 554
 555    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 556    of dimension 1, chars 94, and final character <F>, and etc.
 557
 558    Note (*): Although these designations are not allowed in ISO2022,
 559    Emacs accepts them on decoding, and produces them on encoding
 560    CHARS96 character set in a coding system which is characterized as
 561    7-bit environment, non-locking-shift, and non-single-shift.
 562
 563    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 564    '(' can be omitted.  We call this the "short-form" hereafter.
 565
 566    Now you may notice that there are a lot of ways for encoding the
 567    same multilingual text in ISO2022.  Actually, there exist many
 568    coding systems such as Compound Text (used in X's inter-client
 569    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 570    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 571    localized platforms), and all of these are variants of ISO2022.
 572
 573    In addition to the above, Emacs handles two more kinds of escape
 574    sequences: ISO6429's direction specification and Emacs' private
 575    sequence for specifying character composition.
 576
 577    ISO6429's direction specification takes the following format:
 578         o CSI ']'      -- end of the current direction
 579         o CSI '0' ']'  -- end of the current direction
 580         o CSI '1' ']'  -- start of left-to-right text
 581         o CSI '2' ']'  -- start of right-to-left text
 582    The control character CSI (0x9B: control sequence introducer) is
 583    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 584
 585    Character composition specification takes the following format:
 586         o ESC '0' -- start character composition
 587         o ESC '1' -- end character composition
 588    Since these are not standard escape sequences of any ISO standard, the
 589    use of them with these meanings is restricted to Emacs only.  */
 590
 591 enum iso_code_class_type iso_code_class[256];
 592
 593 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 594    Check if a text is encoded in ISO2022.  If it is, return an
 595    integer in which the appropriate flag bits among:
 596         CODING_CATEGORY_MASK_ISO_7
 597         CODING_CATEGORY_MASK_ISO_8_1
 598         CODING_CATEGORY_MASK_ISO_8_2
 599         CODING_CATEGORY_MASK_ISO_7_ELSE
 600         CODING_CATEGORY_MASK_ISO_8_ELSE
 601    are set.  If a code which should never appear in ISO2022 is found,
 602    return 0.  */
 603
 604 int
 605 detect_coding_iso2022 (src, src_end)
 606      unsigned char *src, *src_end;
 607 {
 608   int mask = (CODING_CATEGORY_MASK_ISO_7
 609               | CODING_CATEGORY_MASK_ISO_8_1
 610               | CODING_CATEGORY_MASK_ISO_8_2
 611               | CODING_CATEGORY_MASK_ISO_7_ELSE
 612               | CODING_CATEGORY_MASK_ISO_8_ELSE
 613               );
 614   int g1 = 0;                   /* 1 iff designating to G1.  */
 615   int c;
 616
 617   while (src < src_end)
 618     {
 619       c = *src++;
 620       switch (c)
 621         {
 622         case ISO_CODE_ESC:
 623           if (src >= src_end)
 624             break;
 625           c = *src++;
 626           if (src < src_end
 627               && ((c >= '(' && c <= '/')
 628                   || (c == '$' && ((*src >= '(' && *src <= '/')
 629                                    || (*src >= '@' && *src <= 'B')))))
 630             {
 631               /* Valid designation sequence.  */
 632               if (c == ')' || (c == '$' && *src == ')'))
 633                 {
 634                   g1 = 1;
 635                   mask &= ~(CODING_CATEGORY_MASK_ISO_7
 636                             | CODING_CATEGORY_MASK_ISO_7_ELSE);
 637                 }
 638               src++;            /* Skip the byte following C.  */
 639               break;
 640             }
 641           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 642             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 643                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 644           break;
 645
 646         case ISO_CODE_SO:
 647           if (g1)
 648             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 649                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 650           break;
 651
 652         case ISO_CODE_CSI:
 653         case ISO_CODE_SS2:
 654         case ISO_CODE_SS3:
 655           mask &= ~(CODING_CATEGORY_MASK_ISO_7
 656                     | CODING_CATEGORY_MASK_ISO_7_ELSE);
 657           break;
 658
 659         default:
 660           if (c < 0x80)
 661             break;
 662           else if (c < 0xA0)
 663             return 0;
 664           else
 665             {
 666               unsigned char *src_begin = src;
 667
 668               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 669                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 670               while (src < src_end && *src >= 0xA0) /* Skip the 0xA0.. run.  */
 671                 src++;
 672               if ((src - src_begin - 1) & 1 && src < src_end)
 673                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2; /* Odd-length run.  */
 674             }
 675           break;
 676         }
 677     }
 678
 679   return mask;
 680 }
 681
 682 /* Decode a character whose charset is CHARSET and whose 1st position
 683    code is C1.  If the dimension of CHARSET is 2, the 2nd position code
 684    is fetched from SRC into the caller's variable `c2'.  If CHARSET is
 685    negative, we are decoding ill-formed text, and all we can do is to
 686    emit C1 as is.  `unification_table' must also be set by the caller.  */
 687
 688 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 689   do {                                                                  \
 690     int c_alt, charset_alt = (charset);                                 \
 691     if (COMPOSING_HEAD_P (coding->composing))                           \
 692       {                                                                 \
 693         *dst++ = LEADING_CODE_COMPOSITION;                              \
 694         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 695           /* To tell composition rules are embedded.  */                \
 696           *dst++ = 0xFF;                                                \
 697         coding->composing += 2;                                         \
 698       }                                                                 \
 699     if ((charset) >= 0)                                                 \
 700       {                                                                 \
 701         if (CHARSET_DIMENSION (charset) == 2)                           \
 702           ONE_MORE_BYTE (c2);                                           \
 703         if (!NILP (unification_table)                                   \
 704             && ((c_alt = unify_char (unification_table,                 \
 705                                      -1, (charset), c1, c2)) >= 0))     \
 706           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 707       }                                                                 \
 708     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 709       DECODE_CHARACTER_ASCII (c1);                                      \
 710     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 711       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 712     else                                                                \
 713       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 714     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 715       /* To tell a composition rule follows.  */                        \
 716       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 717   } while (0)
 718
 719 /* Set designation state into CODING.  Look up the charset for
        DIMENSION, CHARS, and FINAL_CHAR with ISO_CHARSET_TABLE.  If one
        is found (the result is not negative), record it as the charset
        designated to graphic register REG; when `coding->direction' is 1
        and that charset has a reverse charset, the reverse charset is
        recorded instead.  If none is found, the designation state is
        left unchanged.  The expansion refers to a variable `coding' of
        the enclosing scope.  */
 720 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 721   do {                                                                  \
 722     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 723                                      make_number (chars),               \
 724                                      make_number (final_char));         \
 725     if (charset >= 0)                                                   \
 726       {                                                                 \
 727         if (coding->direction == 1                                      \
 728             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 729           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 730         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 731       }                                                                 \
 732   } while (0)
 733
 734 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode the SRC_BYTES bytes of ISO2022 text at SOURCE into
        DESTINATION, whose capacity is DST_BYTES.  Store the number of
        source bytes consumed in *CONSUMED and return the number of bytes
        produced.  Designation, invocation, composition, and direction
        state is read from and written back to CODING.  If the source
        ends in the middle of a multi-byte code or an escape sequence,
        the partial bytes are saved in `coding->carryover' and SRC is
        rewound to the start of that code.  If `coding->last_block' is
        set, whatever source remains after the loop is copied to the
        destination unchanged.  */
 735
 736 int
 737 decode_coding_iso2022 (coding, source, destination,
 738                        src_bytes, dst_bytes, consumed)
 739      struct coding_system *coding;
 740      unsigned char *source, *destination;
 741      int src_bytes, dst_bytes;
 742      int *consumed;
 743 {
 744   unsigned char *src = source;
 745   unsigned char *src_end = source + src_bytes;
 746   unsigned char *dst = destination;
 747   unsigned char *dst_end = destination + dst_bytes;
 748   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 749      from DST_END to assure that overflow checking is necessary only
 750      at the head of loop.  */
 751   unsigned char *adjusted_dst_end = dst_end - 6;
 752   int charset;
 753   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 754   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 755   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 756   Lisp_Object unification_table
 757       = coding->character_unification_table_for_decode;
 758
 759   if (!NILP (Venable_character_unification) && NILP (unification_table))
 760     unification_table = Vstandard_character_unification_table_for_decode;
 761
 762   while (src < src_end && dst < adjusted_dst_end)
 763     {
 764       /* SRC_BASE remembers the start position in source in each loop.
 765          The loop will be exited when there's not enough source text
 766          to analyze long escape sequence or 2-byte code (within macros
 767          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 768          to SRC_BASE before exiting.  */
 769       unsigned char *src_base = src;
 770       int c1 = *src++, c2;
 771
 772       switch (iso_code_class [c1])
 773         {
 774         case ISO_0x20_or_0x7F:
 775           if (!coding->composing
 776               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 777             {
 778               /* This is SPACE or DEL.  */
 779               *dst++ = c1;
 780               break;
 781             }
 782           /* This is a graphic character, we fall down ...  */
 783
 784         case ISO_graphic_plane_0:
 785           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 786             {
 787               /* This is a composition rule.  */
 788               *dst++ = c1 | 0x80;
 789               coding->composing = COMPOSING_WITH_RULE_TAIL;
 790             }
 791           else
 792             DECODE_ISO_CHARACTER (charset0, c1);
 793           break;
 794
 795         case ISO_0xA0_or_0xFF:
 796           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 797             {
 798               /* Invalid code.  */
 799               *dst++ = c1;
 800               break;
 801             }
 802           /* This is a graphic character, we fall down ... */
 803
 804         case ISO_graphic_plane_1:
 805           DECODE_ISO_CHARACTER (charset1, c1);
 806           break;
 807
 808         case ISO_control_code:
 809           /* All ISO2022 control characters in this class have the
 810              same representation in Emacs internal format.  */
 811           *dst++ = c1;
 812           break;
 813
 814         case ISO_carriage_return:
 815           if (coding->eol_type == CODING_EOL_CR)
 816             {
 817               *dst++ = '\n';
 818             }
 819           else if (coding->eol_type == CODING_EOL_CRLF)
 820             {
 821               ONE_MORE_BYTE (c1);
 822               if (c1 == ISO_CODE_LF)
 823                 *dst++ = '\n';
 824               else
 825                 {
                       /* A CR not followed by LF.  C1 now holds the byte
                          after the CR, so push that byte back to be
                          decoded by the next iteration and emit the CR
                          itself (emitting C1 here would drop the CR and
                          output the following byte twice).  */
 826                   src--;
 827                   *dst++ = '\r';
 828                 }
 829             }
 830           else
 831             {
 832               *dst++ = c1;
 833             }
 834           break;
 835
 836         case ISO_shift_out:
 837           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 838             goto label_invalid_escape_sequence;
 839           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 840           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 841           break;
 842
 843         case ISO_shift_in:
 844           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 845           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 846           break;
 847
 848         case ISO_single_shift_2_7:
 849         case ISO_single_shift_2:
 850           /* SS2 is handled as an escape sequence of ESC 'N' */
 851           c1 = 'N';
 852           goto label_escape_sequence;
 853
 854         case ISO_single_shift_3:
 855           /* SS3 is handled as an escape sequence of ESC 'O' */
 856           c1 = 'O';
 857           goto label_escape_sequence;
 858
 859         case ISO_control_sequence_introducer:
 860           /* CSI is handled as an escape sequence of ESC '[' ...  */
 861           c1 = '[';
 862           goto label_escape_sequence;
 863
 864         case ISO_escape:
 865           ONE_MORE_BYTE (c1);
 866         label_escape_sequence:
 867           /* Escape sequences handled by Emacs are invocation,
 868              designation, direction specification, and character
 869              composition specification.  */
 870           switch (c1)
 871             {
 872             case '&':           /* revision of following character set */
 873               ONE_MORE_BYTE (c1);
 874               if (!(c1 >= '@' && c1 <= '~'))
 875                 goto label_invalid_escape_sequence;
 876               ONE_MORE_BYTE (c1);
 877               if (c1 != ISO_CODE_ESC)
 878                 goto label_invalid_escape_sequence;
 879               ONE_MORE_BYTE (c1);
 880               goto label_escape_sequence;
 881
 882             case '$':           /* designation of 2-byte character set */
 883               ONE_MORE_BYTE (c1);
 884               if (c1 >= '@' && c1 <= 'B')
 885                 {       /* designation of JISX0208.1978, GB2312.1980,
 886                                    or JISX0208.1980 */
 887                   DECODE_DESIGNATION (0, 2, 94, c1);
 888                 }
 889               else if (c1 >= 0x28 && c1 <= 0x2B)
 890                 {       /* designation of DIMENSION2_CHARS94 character set */
 891                   ONE_MORE_BYTE (c2);
 892                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 893                 }
 894               else if (c1 >= 0x2C && c1 <= 0x2F)
 895                 {       /* designation of DIMENSION2_CHARS96 character set */
 896                   ONE_MORE_BYTE (c2);
 897                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 898                 }
 899               else
 900                 goto label_invalid_escape_sequence;
 901               break;
 902
 903             case 'n':           /* invocation of locking-shift-2 */
 904               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 905                 goto label_invalid_escape_sequence;
 906               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 907               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 908               break;
 909
 910             case 'o':           /* invocation of locking-shift-3 */
 911               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 912                 goto label_invalid_escape_sequence;
 913               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 914               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 915               break;
 916
 917             case 'N':           /* invocation of single-shift-2 */
 918               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 919                 goto label_invalid_escape_sequence;
 920               ONE_MORE_BYTE (c1);
 921               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 922               DECODE_ISO_CHARACTER (charset, c1);
 923               break;
 924
 925             case 'O':           /* invocation of single-shift-3 */
 926               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 927                 goto label_invalid_escape_sequence;
 928               ONE_MORE_BYTE (c1);
 929               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
 930               DECODE_ISO_CHARACTER (charset, c1);
 931               break;
 932
 933             case '0':           /* start composing without embedded rules */
 934               coding->composing = COMPOSING_NO_RULE_HEAD;
 935               break;
 936
 937             case '1':           /* end composing */
 938               coding->composing = COMPOSING_NO;
 939               break;
 940
 941             case '2':           /* start composing with embedded rules */
 942               coding->composing = COMPOSING_WITH_RULE_HEAD;
 943               break;
 944
 945             case '[':           /* specification of direction */
 946               /* For the moment, nested direction is not supported.
 947                  So, the value of `coding->direction' is 0 or 1: 0
 948                  means left-to-right, 1 means right-to-left.  */
 949               ONE_MORE_BYTE (c1);
 950               switch (c1)
 951                 {
 952                 case ']':       /* end of the current direction */
 953                   coding->direction = 0;
 954                   /* No `break' here: control falls into the next case,
                          which reads one more byte and requires it to be
                          ']'.  NOTE(review): confirm this is intended.  */
 955                 case '0':       /* end of the current direction */
 956                 case '1':       /* start of left-to-right direction */
 957                   ONE_MORE_BYTE (c1);
 958                   if (c1 == ']')
 959                     coding->direction = 0;
 960                   else
 961                     goto label_invalid_escape_sequence;
 962                   break;
 963
 964                 case '2':       /* start of right-to-left direction */
 965                   ONE_MORE_BYTE (c1);
 966                   if (c1 == ']')
 967                     coding->direction= 1;
 968                   else
 969                     goto label_invalid_escape_sequence;
 970                   break;
 971
 972                 default:
 973                   goto label_invalid_escape_sequence;
 974                 }
 975               break;
 976
 977             default:
 978               if (c1 >= 0x28 && c1 <= 0x2B)
 979                 {       /* designation of DIMENSION1_CHARS94 character set */
 980                   ONE_MORE_BYTE (c2);
 981                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
 982                 }
 983               else if (c1 >= 0x2C && c1 <= 0x2F)
 984                 {       /* designation of DIMENSION1_CHARS96 character set */
 985                   ONE_MORE_BYTE (c2);
 986                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
 987                 }
 988               else
 989                 {
 990                   goto label_invalid_escape_sequence;
 991                 }
 992             }
 993           /* We must update these variables now.  */
 994           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 995           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 996           break;
 997
 998         label_invalid_escape_sequence:
               /* Pass the bytes read so far in this iteration (from
                  SRC_BASE up to SRC) through to the output unchanged.  */
 999           {
1000             int length = src - src_base;
1001
1002             bcopy (src_base, dst, length);
1003             dst += length;
1004           }
1005         }
1006       continue;
1007
1008     label_end_of_loop:
           /* Not enough source to finish the current code: save the
              partial bytes as carryover and rewind SRC to SRC_BASE.  */
1009       coding->carryover_size = src - src_base;
1010       bcopy (src_base, coding->carryover, coding->carryover_size);
1011       src = src_base;
1012       break;
1013     }
1014
1015   /* If this is the last block of the text to be decoded, we had
1016      better just flush out all remaining codes in the text although
1017      they are not valid characters.  NOTE(review): this copy is not
         checked against DST_END.  */
1018   if (coding->last_block)
1019     {
1020       bcopy (src, dst, src_end - src);
1021       dst += (src_end - src);
1022       src = src_end;
1023     }
1024   *consumed = src - source;
1025   return dst - destination;
1026 }
1027
1028 /* ISO2022 encoding stuff.  */
1029
1030 /*
1031    It is not enough to say just "ISO2022" on encoding, we have to
1032    specify more details.  In Emacs, each coding-system of ISO2022
1033    variant has the following specifications:
1034         1. Initial designation to G0 thru G3.
1035         2. Allows short-form designation?
1036         3. ASCII should be designated to G0 before control characters?
1037         4. ASCII should be designated to G0 at end of line?
1038         5. 7-bit environment or 8-bit environment?
1039         6. Use locking-shift?
1040         7. Use Single-shift?
1041    And the following two are only for Japanese:
1042         8. Use ASCII in place of JIS0201-1976-Roman?
1043         9. Use JISX0208-1983 in place of JISX0208-1978?
1044    These specifications are encoded in `coding->flags' as flag bits
1045    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1046    details.
1047 */
1048
1049 /* Produce codes (escape sequence) for designating CHARSET to graphic
1050    register REG.  If <final-char> of CHARSET is '@', 'A', or 'B' and
1051    the coding system CODING allows, produce designation sequence of
1052    short-form: the intermediate character is omitted, but only for a
        94-character CHARSET whose dimension is not 1, with REG being 0
        and CODING_FLAG_ISO_SHORT_FORM set in `coding->flags'.
        If CHARSET has an entry in Vcharset_revision_alist, the sequence
        ESC '&' <rev> is produced first, where <rev> is the cdr of that
        entry plus '@'.  The codes are stored through the variable `dst'
        of the enclosing scope, which is advanced, and CHARSET is
        recorded in CODING as the charset designated to REG.  */
1053
1054 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
1055   do {                                                                  \
1056     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
1057     char *intermediate_char_94 = "()*+";                                \
1058     char *intermediate_char_96 = ",-./";                                \
1059     Lisp_Object temp                                                    \
1060       = Fassq (make_number (charset), Vcharset_revision_alist);         \
1061     if (! NILP (temp))                                                  \
1062         {                                                               \
1063         *dst++ = ISO_CODE_ESC;                                          \
1064         *dst++ = '&';                                                   \
1065         *dst++ = XINT (XCONS (temp)->cdr) + '@';                        \
1066       }                                                                 \
1067     *dst++ = ISO_CODE_ESC;                                              \
1068     if (CHARSET_DIMENSION (charset) == 1)                               \
1069       {                                                                 \
1070         if (CHARSET_CHARS (charset) == 94)                              \
1071           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
1072         else                                                            \
1073           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1074       }                                                                 \
1075     else                                                                \
1076       {                                                                 \
1077         *dst++ = '$';                                                   \
1078         if (CHARSET_CHARS (charset) == 94)                              \
1079           {                                                             \
1080             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
1081                 || reg != 0                                             \
1082                 || final_char < '@' || final_char > 'B')                \
1083               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
1084           }                                                             \
1085         else                                                            \
1086           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
1087       }                                                                 \
1088     *dst++ = final_char;                                                \
1089     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
1090   } while (0)
1091
1092 /* The following two macros produce codes (control character or escape
1093    sequence) for ISO2022 single-shift functions (single-shift-2 and
1094    single-shift-3).  When CODING_FLAG_ISO_SEVEN_BITS is set in
        `coding->flags', the two bytes ESC 'N' (or ESC 'O') are produced;
        otherwise the single byte ISO_CODE_SS2 (or ISO_CODE_SS3).  Both
        then set the single-shifting flag of CODING to 1.  They refer to
        the variables `dst' and `coding' of the enclosing scope.  */
1095
1096 #define ENCODE_SINGLE_SHIFT_2                           \
1097   do {                                                  \
1098     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1099       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
1100     else                                                \
1101       *dst++ = ISO_CODE_SS2;                            \
1102     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1103   } while (0)
1104
1105 #define ENCODE_SINGLE_SHIFT_3                           \
1106   do {                                                  \
1107     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1108       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
1109     else                                                \
1110       *dst++ = ISO_CODE_SS3;                            \
1111     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
1112   } while (0)
1113
1114 /* The following four macros produce codes (control character or
1115    escape sequence) for ISO2022 locking-shift functions (shift-in,
1116    shift-out, locking-shift-2, and locking-shift-3).  The codes are
        ISO_CODE_SI, ISO_CODE_SO, ESC 'n', and ESC 'o' respectively.
        Each macro also records in CODING the graphic register now
        invoked to graphic plane 0 (0, 1, 2, and 3 respectively).  They
        refer to the variables `dst' and `coding' of the enclosing
        scope.  */
1117
1118 #define ENCODE_SHIFT_IN                         \
1119   do {                                          \
1120     *dst++ = ISO_CODE_SI;                       \
1121     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
1122   } while (0)
1123
1124 #define ENCODE_SHIFT_OUT                        \
1125   do {                                          \
1126     *dst++ = ISO_CODE_SO;                       \
1127     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
1128   } while (0)
1129
1130 #define ENCODE_LOCKING_SHIFT_2                  \
1131   do {                                          \
1132     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
1133     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
1134   } while (0)
1135
1136 #define ENCODE_LOCKING_SHIFT_3                  \
1137   do {                                          \
1138     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
1139     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
1140   } while (0)
1141
1142 /* Produce codes for a DIMENSION1 character whose character set is
1143    CHARSET and whose position-code is C1.  Designation and invocation
1144    sequences are also produced in advance if necessary.
        The body is a loop that runs until the character is produced.
        If a single-shift is pending in CODING, C1 is produced with the
        8th bit cleared in a 7-bit environment and set otherwise, and
        the pending state is cleared.  Otherwise, C1 is produced with the
        8th bit cleared if CHARSET is invoked to graphic plane 0, or with
        the 8th bit set if it is invoked to graphic plane 1.  If neither
        holds, encode_invocation_designation is called and the checks
        are repeated.  Refers to the variables `dst' and `coding' of the
        enclosing scope.  */
1145
1146
1147 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
1148   do {                                                                  \
1149     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1150       {                                                                 \
1151         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1152           *dst++ = c1 & 0x7F;                                           \
1153         else                                                            \
1154           *dst++ = c1 | 0x80;                                           \
1155         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1156         break;                                                          \
1157       }                                                                 \
1158     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1159       {                                                                 \
1160         *dst++ = c1 & 0x7F;                                             \
1161         break;                                                          \
1162       }                                                                 \
1163     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1164       {                                                                 \
1165         *dst++ = c1 | 0x80;                                             \
1166         break;                                                          \
1167       }                                                                 \
1168     else                                                                \
1169       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1170          must invoke it, or, at first, designate it to some graphic     \
1171          register.  Then repeat the loop to actually produce the        \
1172          character.  */                                                 \
1173       dst = encode_invocation_designation (charset, coding, dst);       \
1174   } while (1)
1175
1176 /* Produce codes for a DIMENSION2 character whose character set is
1177    CHARSET and whose position-codes are C1 and C2.  Designation and
1178    invocation codes are also produced in advance if necessary.
        The body is a loop that runs until the character is produced.
        If a single-shift is pending in CODING, C1 and C2 are produced
        with the 8th bit cleared in a 7-bit environment and set
        otherwise, and the pending state is cleared.  Otherwise, they
        are produced with the 8th bit cleared if CHARSET is invoked to
        graphic plane 0, or with the 8th bit set if it is invoked to
        graphic plane 1.  If neither holds,
        encode_invocation_designation is called and the checks are
        repeated.  Refers to the variables `dst' and `coding' of the
        enclosing scope.  */
1179
1180 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
1181   do {                                                                  \
1182     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
1183       {                                                                 \
1184         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
1185           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
1186         else                                                            \
1187           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
1188         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
1189         break;                                                          \
1190       }                                                                 \
1191     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
1192       {                                                                 \
1193         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
1194         break;                                                          \
1195       }                                                                 \
1196     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
1197       {                                                                 \
1198         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
1199         break;                                                          \
1200       }                                                                 \
1201     else                                                                \
1202       /* Since CHARSET is not yet invoked to any graphic planes, we     \
1203          must invoke it, or, at first, designate it to some graphic     \
1204          register.  Then repeat the loop to actually produce the        \
1205          character.  */                                                 \
1206       dst = encode_invocation_designation (charset, coding, dst);       \
1207   } while (1)
1208
     /* Produce codes for a character whose character set is CHARSET and
        whose position-codes are C1 and C2.  If the variable
        `unification_table' of the enclosing scope is non-nil and
        unify_char returns a non-negative character for it, SPLIT_CHAR
        is applied to that character and the charset and position-codes
        it yields are used instead (presumably SPLIT_CHAR stores into
        its last three arguments, so C1 and C2 must be assignable --
        confirm against its definition).  The character is then produced
        with ENCODE_ISO_CHARACTER_DIMENSION1 or
        ENCODE_ISO_CHARACTER_DIMENSION2 according to the dimension of
        the charset finally chosen.  */
1209 #define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
1210   do {                                                                    \
1211     int c_alt, charset_alt;                                               \
1212     if (!NILP (unification_table)                                         \
1213         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1214             >= 0))                                                        \
1215       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1216     else                                                                  \
1217       charset_alt = charset;                                              \
1218     if (CHARSET_DIMENSION (charset_alt) == 1)                             \
1219       ENCODE_ISO_CHARACTER_DIMENSION1 (charset_alt, c1);                  \
1220     else                                                                  \
1221       ENCODE_ISO_CHARACTER_DIMENSION2 (charset_alt, c1, c2);              \
1222   } while (0)
1223
1224 /* Produce designation and invocation codes at a place pointed by DST
1225    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1226    Return new DST.
        If CHARSET is not designated to any of the four graphic
        registers, a designation sequence is produced for the register
        requested for CHARSET in CODING, or for register 0 when none is
        requested.  Then, if the register holding CHARSET is invoked to
        neither graphic plane 0 nor graphic plane 1, an invocation is
        produced: shift-in for register 0, shift-out for register 1,
        and for registers 2 and 3 a single-shift when
        CODING_FLAG_ISO_SINGLE_SHIFT is set in `coding->flags' or a
        locking-shift otherwise.  */
1227
1228 unsigned char *
1229 encode_invocation_designation (charset, coding, dst)
1230      int charset;
1231      struct coding_system *coding;
1232      unsigned char *dst;
1233 {
1234   int reg;                      /* graphic register number */
1235
1236   /* At first, check designations.  REG ends up as the register
          CHARSET is designated to, or 4 if there is none.  */
1237   for (reg = 0; reg < 4; reg++)
1238     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1239       break;
1240
1241   if (reg >= 4)
1242     {
1243       /* CHARSET is not yet designated to any graphic registers.  */
1244       /* At first check the requested designation.  */
1245       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1246       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1247         /* Since CHARSET requests no special designation, designate it
1248            to graphic register 0.  */
1249         reg = 0;
1250
1251       ENCODE_DESIGNATION (charset, reg, coding);
1252     }
1253
1254   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1255       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1256     {
1257       /* Since the graphic register REG is not invoked to any graphic
1258          planes, invoke it to graphic plane 0.  */
1259       switch (reg)
1260         {
1261         case 0:                 /* graphic register 0 */
1262           ENCODE_SHIFT_IN;
1263           break;
1264
1265         case 1:                 /* graphic register 1 */
1266           ENCODE_SHIFT_OUT;
1267           break;
1268
1269         case 2:                 /* graphic register 2 */
1270           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1271             ENCODE_SINGLE_SHIFT_2;
1272           else
1273             ENCODE_LOCKING_SHIFT_2;
1274           break;
1275
1276         case 3:                 /* graphic register 3 */
1277           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1278             ENCODE_SINGLE_SHIFT_3;
1279           else
1280             ENCODE_LOCKING_SHIFT_3;
1281           break;
1282         }
1283     }
1284   return dst;
1285 }
1286
1287 /* The following three macros produce codes for indicating composition:
        start of composition without embedded rules (ESC '0'), start of
        composition with embedded rules (ESC '2'), and end of composition
        (ESC '1').  They store through the variable `dst' of the
        enclosing scope and advance it.  */
1288 #define ENCODE_COMPOSITION_NO_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '0'
1289 #define ENCODE_COMPOSITION_WITH_RULE_START  *dst++ = ISO_CODE_ESC, *dst++ = '2'
1290 #define ENCODE_COMPOSITION_END    *dst++ = ISO_CODE_ESC, *dst++ = '1'
1291
1292 /* The following three macros produce codes for indicating direction
1293    of text.  ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC '[' when
        CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags' (the flag is
        tested as a bit, as in the single-shift macros) and the single
        byte ISO_CODE_CSI otherwise.  ENCODE_DIRECTION_R2L and
        ENCODE_DIRECTION_L2R produce that introducer followed by "2]"
        and "0]" respectively.  Each macro expands to one
        `do ... while (0)' statement and refers to the variables `dst'
        and `coding' of the enclosing scope.  */
1294 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1295   do {                                                  \
1296     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1297       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1298     else                                                \
1299       *dst++ = ISO_CODE_CSI;                            \
1300   } while (0)
1301
1302 #define ENCODE_DIRECTION_R2L                    \
1303   do {                                          \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
         *dst++ = '2', *dst++ = ']';                 \
       } while (0)
1304
1305 #define ENCODE_DIRECTION_L2R                    \
1306   do {                                          \
         ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
         *dst++ = '0', *dst++ = ']';                 \
       } while (0)
1307
1308 /* Produce codes for designation and invocation to reset the graphic
1309    planes and registers to initial state.  If graphic plane 0 does
        not currently invoke graphic register 0, a shift-in is produced.
        Then, for each of the four graphic registers that has an initial
        designation (a non-negative charset) different from its current
        designation, a designation sequence for the initial charset is
        produced.  Registers without an initial designation are left as
        they are.  Refers to the variables `dst' and `coding' of the
        enclosing scope.  */
1310 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1311   do {                                                                      \
1312     int reg;                                                                \
1313     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1314       ENCODE_SHIFT_IN;                                                      \
1315     for (reg = 0; reg < 4; reg++)                                           \
1316       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1317           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1318               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1319         ENCODE_DESIGNATION                                                  \
1320           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1321   } while (0)
1322
1323 /* Produce designation sequences of charsets in the line started from
1324    SRC to a place pointed by *DSTP, and advance *DSTP past them.
1325    The text from SRC up to the first newline or SRC_END is scanned.
1326    Only charsets that request a specific graphic register in CODING
1327    are considered, and for each register the first such charset found
        is the one designated; a sequence is produced only when the
        register does not already hold that charset.  TABLE, if non-nil,
        is a unification table applied to each character before its
        charset is examined.
        If the current block ends before any end-of-line, we may fail to
        find all the necessary designations.  */
1328 encode_designation_at_bol (coding, table, src, src_end, dstp)
1329      struct coding_system *coding;
1330      Lisp_Object table;
1331      unsigned char *src, *src_end, **dstp;
1332 {
1333   int charset, found = 0, reg;
1334   /* Table of charsets to be designated to each graphic register.
          -1 means no charset has been chosen for the register yet.  */
1335   int r[4];
1336   unsigned char *dst = *dstp;
1337
1338   for (reg = 0; reg < 4; reg++)
1339     r[reg] = -1;
1340
1341   while (src < src_end && *src != '\n' && found < 4)
1342     {
1343       int bytes = BYTES_BY_CHAR_HEAD (*src);
1344
1345       if (NILP (table))
1346         charset = CHARSET_AT (src);
1347       else
1348         {
1349           int c_alt, c1, c2;
1350
1351           SPLIT_STRING(src, bytes, charset, c1, c2);
1352           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1353             charset = CHAR_CHARSET (c_alt);
1354         }
1355
1356       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
           /* REG may be the "no requested designation" value, which is
              not a valid index into R; check it before indexing.  A
              register is claimed only while its slot is still unset
              (negative).  */
1357       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1358         {
1359           found++;
1360           r[reg] = charset;
1361         }
1362
1363       src += bytes;
1364     }
1365
1366   if (found)
1367     {
1368       for (reg = 0; reg < 4; reg++)
1369         if (r[reg] >= 0
1370             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1371           ENCODE_DESIGNATION (r[reg], reg, coding);
1372       *dstp = dst;
1373     }
1374 }
1375
1376 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the SRC_BYTES bytes at SOURCE into DESTINATION (DST_BYTES
        bytes long) using the ISO2022 flags and state kept in CODING.
        Store the number of source bytes read in *CONSUMED and return the
        number of bytes written to DESTINATION.  */
1377
1378 int
1379 encode_coding_iso2022 (coding, source, destination,
1380                        src_bytes, dst_bytes, consumed)
1381      struct coding_system *coding;
1382      unsigned char *source, *destination;
1383      int src_bytes, dst_bytes;
1384      int *consumed;
1385 {
1386   unsigned char *src = source;
1387   unsigned char *src_end = source + src_bytes;
1388   unsigned char *dst = destination;
1389   unsigned char *dst_end = destination + dst_bytes;
1390   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1391      from DST_END to assure overflow checking is necessary only at the
1392      head of loop.  */
1393   unsigned char *adjusted_dst_end = dst_end - 19;
1394   Lisp_Object unification_table
1395       = coding->character_unification_table_for_encode;
1396
       /* CODING has no table of its own: use the standard one, but only
          when character unification is enabled.  */
1397   if (!NILP (Venable_character_unification) && NILP (unification_table))
1398     unification_table = Vstandard_character_unification_table_for_encode;
1399
1400   while (src < src_end && dst < adjusted_dst_end)
1401     {
1402       /* SRC_BASE remembers the start position in source in each loop.
1403          The loop will be exited when there's not enough source text
1404          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1405          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, the
1406          bytes read since SRC_BASE are saved as carryover (see
                label_end_of_loop below); SRC itself is left as is.  */
1407       unsigned char *src_base = src;
1408       int charset, c1, c2, c3, c4;
1409
1410       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1411           && CODING_SPEC_ISO_BOL (coding))
1412         {
1413           /* We have to produce designation sequences if any now.  */
1414           encode_designation_at_bol (coding, unification_table,
1415                                      src, src_end, &dst);
               /* Clear the beginning-of-line flag so that this is done
                  only once per line.  */
1416           CODING_SPEC_ISO_BOL (coding) = 0;
1417         }
1418
1419       c1 = *src++;
1420       /* If we are seeing a component of a composite character, we are
1421          seeing a leading-code specially encoded for composition, or a
1422          composition rule if composing with rule.  We must set C1
1423          to a normal leading-code or an ASCII code.  If we are not at
1424          a composed character, we must reset the composition state.  */
1425       if (COMPOSING_P (coding->composing))
1426         {
1427           if (c1 < 0xA0)
1428             {
1429               /* We are not in a composite character any longer.  */
1430               coding->composing = COMPOSING_NO;
1431               ENCODE_COMPOSITION_END;
1432             }
1433           else
1434             {
1435               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1436                 {
                       /* C1 is a composition rule: emit it with the high
                          bit cleared, then expect a component next.  */
1437                   *dst++ = c1 & 0x7F;
1438                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1439                   continue;
1440                 }
1441               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
                     /* C1 starts a component; a rule byte comes after it.  */
1442                 coding->composing = COMPOSING_WITH_RULE_RULE;
1443               if (c1 == 0xA0)
1444                 {
1445                   /* This is an ASCII component.  */
1446                   ONE_MORE_BYTE (c1);
1447                   c1 &= 0x7F;
1448                 }
1449               else
1450                 /* This is a leading-code of non ASCII component.  */
1451                 c1 -= 0x20;
1452             }
1453         }
1454
1455       /* Now encode one character.  C1 is a control character, an
1456          ASCII character, or a leading-code of multi-byte character.  */
1457       switch (emacs_code_class[c1])
1458         {
1459         case EMACS_ascii_code:
1460           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1461           break;
1462
1463         case EMACS_control_code:
1464           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1465             ENCODE_RESET_PLANE_AND_REGISTER;
1466           *dst++ = c1;
1467           break;
1468
1469         case EMACS_carriage_return_code:
               /* Unless `coding->selective' is set, '\r' is handled like
                  any other control code.  */
1470           if (!coding->selective)
1471             {
1472               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1473                 ENCODE_RESET_PLANE_AND_REGISTER;
1474               *dst++ = c1;
1475               break;
1476             }
1477           /* fall down to treat '\r' as '\n' ...  */
1478
1479         case EMACS_linefeed_code:
1480           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1481             ENCODE_RESET_PLANE_AND_REGISTER;
               /* Restore the initial designations for the next line.  */
1482           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1483             bcopy (coding->spec.iso2022.initial_designation,
1484                    coding->spec.iso2022.current_designation,
1485                    sizeof coding->spec.iso2022.initial_designation);
               /* Produce the end-of-line sequence selected by
                  `coding->eol_type': LF, CR LF, or CR.  */
1486           if (coding->eol_type == CODING_EOL_LF
1487               || coding->eol_type == CODING_EOL_UNDECIDED)
1488             *dst++ = ISO_CODE_LF;
1489           else if (coding->eol_type == CODING_EOL_CRLF)
1490             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1491           else
1492             *dst++ = ISO_CODE_CR;
               /* The next character starts a new line.  */
1493           CODING_SPEC_ISO_BOL (coding) = 1;
1494           break;
1495
1496         case EMACS_leading_code_2:
1497           ONE_MORE_BYTE (c2);
1498           ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1499           break;
1500
1501         case EMACS_leading_code_3:
1502           TWO_MORE_BYTES (c2, c3);
               /* Below LEADING_CODE_PRIVATE_11, C1 itself is passed as the
                  charset; otherwise C2 is (presumably C1 is then a
                  private leading-code -- confirm against charset.h).  */
1503           if (c1 < LEADING_CODE_PRIVATE_11)
1504             ENCODE_ISO_CHARACTER (c1, c2, c3);
1505           else
1506             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1507           break;
1508
1509         case EMACS_leading_code_4:
1510           THREE_MORE_BYTES (c2, c3, c4);
1511           ENCODE_ISO_CHARACTER (c2, c3, c4);
1512           break;
1513
1514         case EMACS_leading_code_composition:
1515           ONE_MORE_BYTE (c1);
               /* A following 0xFF selects composition with rule.  */
1516           if (c1 == 0xFF)
1517             {
1518               coding->composing = COMPOSING_WITH_RULE_HEAD;
1519               ENCODE_COMPOSITION_WITH_RULE_START;
1520             }
1521           else
1522             {
1523               /* Rewind one byte because it is a character code of
1524                  composition elements.  */
1525               src--;
1526               coding->composing = COMPOSING_NO_RULE_HEAD;
1527               ENCODE_COMPOSITION_NO_RULE_START;
1528             }
1529           break;
1530
1531         case EMACS_invalid_code:
               /* Pass an invalid byte through unchanged.  */
1532           *dst++ = c1;
1533           break;
1534         }
1535       continue;
1536     label_end_of_loop:
           /* Save the incomplete character read so far.  NOTE(review):
              SRC is not moved back to SRC_BASE here, so *CONSUMED below
              includes these bytes -- confirm callers expect that.  */
1537       coding->carryover_size = src - src_base;
1538       bcopy (src_base, coding->carryover, coding->carryover_size);
1539       break;
1540     }
1541
1542   /* If this is the last block of the text to be encoded, we must
1543      reset graphic planes and registers to the initial state.  */
1544   if (src >= src_end && coding->last_block)
1545     {
1546       ENCODE_RESET_PLANE_AND_REGISTER;
           /* Flush pending carryover bytes unchanged, provided they are
              shorter than the room left in DESTINATION.  */
1547       if (coding->carryover_size > 0
1548           && coding->carryover_size < (dst_end - dst))
1549         {
1550           bcopy (coding->carryover, dst, coding->carryover_size);
1551           dst += coding->carryover_size;
1552           coding->carryover_size = 0;
1553         }
1554     }
1555   *consumed = src - source;
1556   return dst - destination;
1557 }
1558
1559 \f
1560 /*** 4. SJIS and BIG5 handlers ***/
1561
1562 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1563    quite widely.  So, for the moment, Emacs supports them in the bare
1564    C code.  But, in the future, they may be supported only by CCL.  */
1565
1566 /* SJIS is a coding system encoding three character sets: ASCII, right
1567    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1568    as is.  A character of charset katakana-jisx0201 is encoded by
1569    "position-code + 0x80".  A character of charset japanese-jisx0208
1570    is encoded in two bytes, but its two position-codes are divided and
1571    shifted so that they fit in the ranges below.
1572
1573    --- CODE RANGE of SJIS ---
1574    (character set)      (range)
1575    ASCII                0x00 .. 0x7F
1576    KATAKANA-JISX0201    0xA0 .. 0xDF
1577    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1578             (2nd byte)  0x40 .. 0xFF
1579    -------------------------------
1580
1581 */
1582
1583 /* BIG5 is a coding system encoding two character sets: ASCII and
1584    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1585    character set and is encoded in two-byte.
1586
1587    --- CODE RANGE of BIG5 ---
1588    (character set)      (range)
1589    ASCII                0x00 .. 0x7F
1590    Big5 (1st byte)      0xA1 .. 0xFE
1591         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1592    --------------------------
1593
1594    Since the number of characters in Big5 is larger than maximum
1595    characters in Emacs' charset (96x96), it can't be handled as one
1596    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1597    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1598    contains frequently used characters and the latter contains less
1599    frequently used characters.  */
1600
1601 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1602    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1603    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1604    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1605
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte ranges over 0x40..0x7E (63 codes) and 0xA1..0xFE
   (94 codes).  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2.  CHARSET is set to
   `charset_big5_1' (B1 below 0xC9) or `charset_big5_2', and C1, C2 are
   set to the 1st and 2nd position-codes within that charset.

   B1 and B2 are evaluated exactly once and every argument is
   parenthesized, so arbitrary expressions may be passed.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    int big5_b1_ = (b1);                                                \
    int big5_b2_ = (b2);                                                \
    /* Linear index of the character in the whole Big5 table.  */      \
    unsigned int big5_temp_                                             \
      = (big5_b1_ - 0xA1) * BIG5_SAME_ROW                               \
        + big5_b2_ - (big5_b2_ < 0x7F ? 0x40 : 0x62);                   \
    if (big5_b1_ < 0xC9)                                                \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the start of the 2nd charset.  */ \
        big5_temp_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                    \
      }                                                                 \
    (c1) = big5_temp_ / (0xFF - 0xA1) + 0x21;                           \
    (c2) = big5_temp_ % (0xFF - 0xA1) + 0x21;                           \
  } while (0)

/* Encode the character of CHARSET (`charset_big5_1' or
   `charset_big5_2') whose position-codes are C1 and C2 into the BIG5
   byte pair B1, B2.  This is the inverse of DECODE_BIG5.

   CHARSET, C1 and C2 are evaluated exactly once and every argument is
   parenthesized, so expressions such as `c & 0x7F' may be passed.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int big5_temp_                                             \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    unsigned int big5_low_;                                             \
    if ((charset) == charset_big5_2)                                    \
      big5_temp_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                      \
    (b1) = big5_temp_ / BIG5_SAME_ROW + 0xA1;                           \
    big5_low_ = big5_temp_ % BIG5_SAME_ROW;                             \
    /* The first 0x3F codes of a row map to 0x40..0x7E, the rest to    \
       0xA1..0xFE.  */                                                  \
    (b2) = big5_low_ + (big5_low_ < 0x3F ? 0x40 : 0x62);                \
  } while (0)
1633
/* Produce in Emacs' internal format the character of CHARSET whose
   position-codes are C1 and C2.  When `unification_table' (taken from
   the caller's scope) is non-nil and maps the character, C1, C2 and
   the charset used are replaced by those of the mapped character.
   The result is handed to DECODE_CHARACTER_ASCII,
   DECODE_CHARACTER_DIMENSION1 or DECODE_CHARACTER_DIMENSION2
   depending on the resulting charset.  */
#define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
  do {                                                                  \
    int unified_cs_ = (charset);                                        \
    int unified_ch_;                                                    \
                                                                        \
    if (!NILP (unification_table))                                      \
      {                                                                 \
        unified_ch_ = unify_char (unification_table,                    \
                                  -1, (charset), c1, c2);               \
        if (unified_ch_ >= 0)                                           \
          SPLIT_CHAR (unified_ch_, unified_cs_, c1, c2);                \
      }                                                                 \
                                                                        \
    if (unified_cs_ < 0 || unified_cs_ == CHARSET_ASCII)                \
      DECODE_CHARACTER_ASCII (c1);                                      \
    else if (CHARSET_DIMENSION (unified_cs_) != 1)                      \
      DECODE_CHARACTER_DIMENSION2 (unified_cs_, c1, c2);                \
    else                                                                \
      DECODE_CHARACTER_DIMENSION1 (unified_cs_, c1);                    \
  } while (0)
1648
/* Encode the character of CHARSET whose position-codes are C1 and C2
   and store the result at `dst'.  `unification_table', `sjis_p' and
   `dst' are taken from the caller's scope.  When `unification_table'
   is non-nil and maps the character, the mapped character is encoded
   instead.

   An ASCII character is produced as is.  With `sjis_p' nonzero,
   katakana-jisx0201 is produced as the single byte C1 and jisx0208 is
   converted by ENCODE_SJIS; with `sjis_p' zero, the two Big5 charsets
   are converted by ENCODE_BIG5.  Any other character is produced as
   its charset byte followed by its position-code(s), i.e. up to three
   bytes (for two-dimensional charsets C1 and C2 are first masked
   with 0x7F).

   The expansion has no trailing semicolon, so that
   `ENCODE_SJIS_BIG5_CHARACTER (...);' is exactly one statement and is
   safe as the body of an `if' which has an `else'.  */
#define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
  do {                                                                    \
    int c_alt, charset_alt;                                               \
    if (!NILP (unification_table)                                         \
        && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
            >= 0))                                                        \
      SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
    else                                                                  \
      charset_alt = charset;                                              \
    if (charset_alt == charset_ascii)                                     \
      *dst++ = c1;                                                        \
    else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
      {                                                                   \
        if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
          *dst++ = c1;                                                    \
        else                                                              \
          *dst++ = charset_alt, *dst++ = c1;                              \
      }                                                                   \
    else                                                                  \
      {                                                                   \
        c1 &= 0x7F, c2 &= 0x7F;                                           \
        if (sjis_p && charset_alt == charset_jisx0208)                    \
          {                                                               \
            unsigned char s1, s2;                                         \
                                                                          \
            ENCODE_SJIS (c1, c2, s1, s2);                                 \
            *dst++ = s1, *dst++ = s2;                                     \
          }                                                               \
        else if (!sjis_p                                                  \
                 && (charset_alt == charset_big5_1                        \
                     || charset_alt == charset_big5_2))                   \
          {                                                               \
            unsigned char b1, b2;                                         \
                                                                          \
            ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
            *dst++ = b1, *dst++ = b2;                                     \
          }                                                               \
        else                                                              \
          /* Not representable: charset byte plus both codes.  */        \
          *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
      }                                                                   \
  } while (0)
1690
1691 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1692    Check if a text is encoded in SJIS.  If it is, return
1693    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1694
1695 int
1696 detect_coding_sjis (src, src_end)
1697      unsigned char *src, *src_end;
1698 {
1699   unsigned char c;
1700
1701   while (src < src_end)
1702     {
1703       c = *src++;
1704       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1705         return 0;
1706       if ((c >= 0x80 && c < 0xA0) || c >= 0xE0)
1707         {
1708           if (src < src_end && *src++ < 0x40)
1709             return 0;
1710         }
1711     }
1712   return CODING_CATEGORY_MASK_SJIS;
1713 }
1714
1715 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1716    Check if a text is encoded in BIG5.  If it is, return
1717    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1718
1719 int
1720 detect_coding_big5 (src, src_end)
1721      unsigned char *src, *src_end;
1722 {
1723   unsigned char c;
1724
1725   while (src < src_end)
1726     {
1727       c = *src++;
1728       if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
1729         return 0;
1730       if (c >= 0xA1)
1731         {
1732           if (src >= src_end)
1733             break;
1734           c = *src++;
1735           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
1736             return 0;
1737         }
1738     }
1739   return CODING_CATEGORY_MASK_BIG5;
1740 }
1741
1742 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1743    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
1744
1745 int
1746 decode_coding_sjis_big5 (coding, source, destination,
1747                          src_bytes, dst_bytes, consumed, sjis_p)
1748      struct coding_system *coding;
1749      unsigned char *source, *destination;
1750      int src_bytes, dst_bytes;
1751      int *consumed;
1752      int sjis_p;
1753 {
1754   unsigned char *src = source;
1755   unsigned char *src_end = source + src_bytes;
1756   unsigned char *dst = destination;
1757   unsigned char *dst_end = destination + dst_bytes;
1758   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1759      from DST_END to assure overflow checking is necessary only at the
1760      head of loop.  */
1761   unsigned char *adjusted_dst_end = dst_end - 3;
1762   Lisp_Object unification_table
1763       = coding->character_unification_table_for_decode;
1764
1765   if (!NILP (Venable_character_unification) && NILP (unification_table))
1766     unification_table = Vstandard_character_unification_table_for_decode;
1767
1768   while (src < src_end && dst < adjusted_dst_end)
1769     {
1770       /* SRC_BASE remembers the start position in source in each loop.
1771          The loop will be exited when there's not enough source text
1772          to analyze two-byte character (within macro ONE_MORE_BYTE).
1773          In that case, SRC is reset to SRC_BASE before exiting.  */
1774       unsigned char *src_base = src;
1775       unsigned char c1 = *src++, c2, c3, c4;
1776
1777       if (c1 == '\r')
1778         {
1779           if (coding->eol_type == CODING_EOL_CRLF)
1780             {
1781               ONE_MORE_BYTE (c2);
1782               if (c2 == '\n')
1783                 *dst++ = c2;
1784               else
1785                 /* To process C2 again, SRC is subtracted by 1.  */
1786                 *dst++ = c1, src--;
1787             }
1788           else
1789             *dst++ = c1;
1790         }
1791       else if (c1 < 0x20)
1792         *dst++ = c1;
1793       else if (c1 < 0x80)
1794         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1795       else if (c1 < 0xA0 || c1 >= 0xE0)
1796         {
1797           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1798           if (sjis_p)
1799             {
1800               ONE_MORE_BYTE (c2);
1801               DECODE_SJIS (c1, c2, c3, c4);
1802               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1803             }
1804           else if (c1 >= 0xE0 && c1 < 0xFF)
1805             {
1806               int charset;
1807
1808               ONE_MORE_BYTE (c2);
1809               DECODE_BIG5 (c1, c2, charset, c3, c4);
1810               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1811             }
1812           else                  /* Invalid code */
1813             *dst++ = c1;
1814         }
1815       else
1816         {
1817           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1818           if (sjis_p)
1819             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1820           else
1821             {
1822               int charset;
1823
1824               ONE_MORE_BYTE (c2);
1825               DECODE_BIG5 (c1, c2, charset, c3, c4);
1826               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1827             }
1828         }
1829       continue;
1830
1831     label_end_of_loop:
1832       coding->carryover_size = src - src_base;
1833       bcopy (src_base, coding->carryover, coding->carryover_size);
1834       src = src_base;
1835       break;
1836     }
1837
1838   *consumed = src - source;
1839   return dst - destination;
1840 }
1841
1842 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1843    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1844    `charset_jisx0208', `charset_big5_1', and `charset_big5-2'.  We are
1845    sure that all these charsets are registered as official charset
1846    (i.e. do not have extended leading-codes).  Characters of other
1847    charsets are produced without any encoding.  If SJIS_P is 1, encode
1848    SJIS text, else encode BIG5 text.  */
1849
1850 int
1851 encode_coding_sjis_big5 (coding, source, destination,
1852                          src_bytes, dst_bytes, consumed, sjis_p)
1853      struct coding_system *coding;
1854      unsigned char *source, *destination;
1855      int src_bytes, dst_bytes;
1856      int *consumed;
1857      int sjis_p;
1858 {
1859   unsigned char *src = source;
1860   unsigned char *src_end = source + src_bytes;
1861   unsigned char *dst = destination;
1862   unsigned char *dst_end = destination + dst_bytes;
1863   /* Since the maximum bytes produced by each loop is 2, we subtract 1
1864      from DST_END to assure overflow checking is necessary only at the
1865      head of loop.  */
1866   unsigned char *adjusted_dst_end = dst_end - 1;
1867   Lisp_Object unification_table
1868       = coding->character_unification_table_for_encode;
1869
1870   if (!NILP (Venable_character_unification) && NILP (unification_table))
1871     unification_table = Vstandard_character_unification_table_for_encode;
1872
1873   while (src < src_end && dst < adjusted_dst_end)
1874     {
1875       /* SRC_BASE remembers the start position in source in each loop.
1876          The loop will be exited when there's not enough source text
1877          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
1878          TWO_MORE_BYTES).  In that case, SRC is reset to SRC_BASE
1879          before exiting.  */
1880       unsigned char *src_base = src;
1881       unsigned char c1 = *src++, c2, c3, c4;
1882
1883       if (coding->composing)
1884         {
1885           if (c1 == 0xA0)
1886             {
1887               ONE_MORE_BYTE (c1);
1888               c1 &= 0x7F;
1889             }
1890           else if (c1 >= 0xA0)
1891             c1 -= 0x20;
1892           else
1893             coding->composing = 0;
1894         }
1895
1896       switch (emacs_code_class[c1])
1897         {
1898         case EMACS_ascii_code:
1899           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1900           break;
1901
1902         case EMACS_control_code:
1903           *dst++ = c1;
1904           break;
1905
1906         case EMACS_carriage_return_code:
1907           if (!coding->selective)
1908             {
1909               *dst++ = c1;
1910               break;
1911             }
1912           /* fall down to treat '\r' as '\n' ...  */
1913
1914         case EMACS_linefeed_code:
1915           if (coding->eol_type == CODING_EOL_LF
1916               || coding->eol_type == CODING_EOL_UNDECIDED)
1917             *dst++ = '\n';
1918           else if (coding->eol_type == CODING_EOL_CRLF)
1919             *dst++ = '\r', *dst++ = '\n';
1920           else
1921             *dst++ = '\r';
1922           break;
1923
1924         case EMACS_leading_code_2:
1925           ONE_MORE_BYTE (c2);
1926           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
1927           break;
1928
1929         case EMACS_leading_code_3:
1930           TWO_MORE_BYTES (c2, c3);
1931           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
1932           break;
1933
1934         case EMACS_leading_code_4:
1935           THREE_MORE_BYTES (c2, c3, c4);
1936           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
1937           break;
1938
1939         case EMACS_leading_code_composition:
1940           coding->composing = 1;
1941           break;
1942
1943         default:                /* i.e. case EMACS_invalid_code: */
1944           *dst++ = c1;
1945         }
1946       continue;
1947
1948     label_end_of_loop:
1949       coding->carryover_size = src - src_base;
1950       bcopy (src_base, coding->carryover, coding->carryover_size);
1951       src = src_base;
1952       break;
1953     }
1954
1955   *consumed = src - source;
1956   return dst - destination;
1957 }
1958
1959 \f
1960 /*** 5. End-of-line handlers ***/
1961
1962 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1963    This function is called only when `coding->eol_type' is
1964    CODING_EOL_CRLF or CODING_EOL_CR.  */
1965
1966 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
1967      struct coding_system *coding;
1968      unsigned char *source, *destination;
1969      int src_bytes, dst_bytes;
1970      int *consumed;
1971 {
1972   unsigned char *src = source;
1973   unsigned char *src_end = source + src_bytes;
1974   unsigned char *dst = destination;
1975   unsigned char *dst_end = destination + dst_bytes;
1976   int produced;
1977
1978   switch (coding->eol_type)
1979     {
1980     case CODING_EOL_CRLF:
1981       {
1982         /* Since the maximum bytes produced by each loop is 2, we
1983            subtract 1 from DST_END to assure overflow checking is
1984            necessary only at the head of loop.  */
1985         unsigned char *adjusted_dst_end = dst_end - 1;
1986
1987         while (src < src_end && dst < adjusted_dst_end)
1988           {
1989             unsigned char *src_base = src;
1990             unsigned char c = *src++;
1991             if (c == '\r')
1992               {
1993                 ONE_MORE_BYTE (c);
1994                 if (c != '\n')
1995                   *dst++ = '\r';
1996                 *dst++ = c;
1997               }
1998             else
1999               *dst++ = c;
2000             continue;
2001
2002           label_end_of_loop:
2003             coding->carryover_size = src - src_base;
2004             bcopy (src_base, coding->carryover, coding->carryover_size);
2005             src = src_base;
2006             break;
2007           }
2008         *consumed = src - source;
2009         produced = dst - destination;
2010         break;
2011       }
2012
2013     case CODING_EOL_CR:
2014       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2015       bcopy (source, destination, produced);
2016       dst_end = destination + produced;
2017       while (dst < dst_end)
2018         if (*dst++ == '\r') dst[-1] = '\n';
2019       *consumed = produced;
2020       break;
2021
2022     default:                    /* i.e. case: CODING_EOL_LF */
2023       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2024       bcopy (source, destination, produced);
2025       *consumed = produced;
2026       break;
2027     }
2028
2029   return produced;
2030 }
2031
2032 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2033    format of end-of-line according to `coding->eol_type'.  If
2034    `coding->selective' is 1, code '\r' in source text also means
2035    end-of-line.  */
2036
2037 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2038      struct coding_system *coding;
2039      unsigned char *source, *destination;
2040      int src_bytes, dst_bytes;
2041      int *consumed;
2042 {
2043   unsigned char *src = source;
2044   unsigned char *dst = destination;
2045   int produced;
2046
2047   if (src_bytes <= 0)
2048     return 0;
2049
2050   switch (coding->eol_type)
2051     {
2052     case CODING_EOL_LF:
2053     case CODING_EOL_UNDECIDED:
2054       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2055       bcopy (source, destination, produced);
2056       if (coding->selective)
2057         {
2058           int i = produced;
2059           while (i--)
2060             if (*dst++ == '\r') dst[-1] = '\n';
2061         }
2062       *consumed = produced;
2063
2064     case CODING_EOL_CRLF:
2065       {
2066         unsigned char c;
2067         unsigned char *src_end = source + src_bytes;
2068         unsigned char *dst_end = destination + dst_bytes;
2069         /* Since the maximum bytes produced by each loop is 2, we
2070            subtract 1 from DST_END to assure overflow checking is
2071            necessary only at the head of loop.  */
2072         unsigned char *adjusted_dst_end = dst_end - 1;
2073
2074         while (src < src_end && dst < adjusted_dst_end)
2075           {
2076             c = *src++;
2077             if (c == '\n' || (c == '\r' && coding->selective))
2078               *dst++ = '\r', *dst++ = '\n';
2079             else
2080               *dst++ = c;
2081           }
2082         produced = dst - destination;
2083         *consumed = src - source;
2084         break;
2085       }
2086
2087     default:                    /* i.e. case CODING_EOL_CR: */
2088       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2089       bcopy (source, destination, produced);
2090       {
2091         int i = produced;
2092         while (i--)
2093           if (*dst++ == '\n') dst[-1] = '\r';
2094       }
2095       *consumed = produced;
2096     }
2097
2098   return produced;
2099 }
2100
2101 \f
2102 /*** 6. C library functions ***/
2103
2104 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2105    has a property `coding-system'.  The value of this property is a
2106    vector of length 5 (called a coding-vector).  Among the elements of
2107    this vector, the first (element[0]) and the fifth (element[4])
2108    carry important information for decoding/encoding.  Before
2109    decoding/encoding, this information should be set in fields of a
2110    structure of type `coding_system'.
2111
2112    A value of property `coding-system' can be a symbol of another
2113    subsidiary coding-system.  In that case, Emacs gets coding-vector
2114    from that symbol.
2115
2116    `element[0]' contains information to be set in `coding->type'.  The
2117    value and its meaning is as follows:
2118
2119    0 -- coding_type_emacs_mule
2120    1 -- coding_type_sjis
2121    2 -- coding_type_iso2022
2122    3 -- coding_type_big5
2123    4 -- coding_type_ccl encoder/decoder written in CCL
2124    nil -- coding_type_no_conversion
2125    t -- coding_type_undecided (automatic conversion on decoding,
2126                                no-conversion on encoding)
2127
2128    `element[4]' contains information to be set in `coding->flags' and
2129    `coding->spec'.  The meaning varies by `coding->type'.
2130
2131    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2132    of length 32 (of which the first 13 sub-elements are used now).
2133    Meanings of these sub-elements are:
2134
2135    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2136         If the value is an integer of valid charset, the charset is
2137         assumed to be designated to graphic register N initially.
2138
2139         If the value is negative, it is the negated value of a charset which
2140         reserves graphic register N, which means that the charset is
2141         not designated initially but should be designated to graphic
2142         register N just before encoding a character in that charset.
2143
2144         If the value is nil, graphic register N is never used on
2145         encoding.
2146
2147    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
2148         Each value takes t or nil.  See the section ISO2022 of
2149         `coding.h' for more information.
2150
2151    If `coding->type' is `coding_type_big5', element[4] is t to denote
2152    BIG5-ETen or nil to denote BIG5-HKU.  If it is `coding_type_ccl',
2153    element[4] is a cons of the decoder and encoder CCL program vectors.
2154    If `coding->type' takes any other value, element[4] is ignored.
2155
2156    Emacs Lisp's coding system also carries information about the format
2157    of end-of-line in the value of property `eol-type'.  If the value is
2158    an integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2159    means CODING_EOL_CR.  If it is not an integer, it should be a vector
2160    of subsidiary coding systems, each of whose `eol-type' property has
2161    one of the above values.
2162
2163 */
2164
2165 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2166    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2167    is setup so that no conversion is necessary and return -1, else
2168    return 0.  */
2169
2170 int
2171 setup_coding_system (coding_system, coding)
2172      Lisp_Object coding_system;
2173      struct coding_system *coding;
2174 {
2175   Lisp_Object type, eol_type;
2176
2177   /* At first, set several fields to default values.  */
2178   coding->require_flushing = 0;
2179   coding->last_block = 0;
2180   coding->selective = 0;
2181   coding->composing = 0;
2182   coding->direction = 0;
2183   coding->carryover_size = 0;
2184   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2185   coding->character_unification_table_for_decode = Qnil;
2186   coding->character_unification_table_for_encode = Qnil;
2187
2188   Vlast_coding_system_used = coding->symbol = coding_system;
2189   eol_type = Qnil;
2190   /* Get value of property `coding-system' until we get a vector.
2191      While doing that, also get values of properties
2192      `post-read-conversion', `pre-write-conversion',
2193      `character-unification-table-for-decode',
2194      `character-unification-table-for-encode' and `eol-type'.  */
2195   while (!NILP (coding_system) && SYMBOLP (coding_system))
2196     {
2197       if (NILP (coding->post_read_conversion))
2198         coding->post_read_conversion = Fget (coding_system,
2199                                              Qpost_read_conversion);
2200       if (NILP (coding->pre_write_conversion))
2201         coding->pre_write_conversion = Fget (coding_system,
2202                                              Qpre_write_conversion);
2203       if (!inhibit_eol_conversion && NILP (eol_type))
2204         eol_type = Fget (coding_system, Qeol_type);
2205
2206       if (NILP (coding->character_unification_table_for_decode))
2207         coding->character_unification_table_for_decode
2208           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2209
2210       if (NILP (coding->character_unification_table_for_encode))
2211         coding->character_unification_table_for_encode
2212           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2213
2214       coding_system = Fget (coding_system, Qcoding_system);
2215     }
2216
2217   while (!NILP (coding->character_unification_table_for_decode)
2218          && SYMBOLP (coding->character_unification_table_for_decode))
2219         coding->character_unification_table_for_decode
2220           = Fget (coding->character_unification_table_for_decode,
2221                   Qcharacter_unification_table_for_decode);
2222   if (!NILP (coding->character_unification_table_for_decode)
2223       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2224       coding->character_unification_table_for_decode = Qnil;
2225
2226   while (!NILP (coding->character_unification_table_for_encode)
2227          && SYMBOLP (coding->character_unification_table_for_encode))
2228         coding->character_unification_table_for_encode
2229           = Fget (coding->character_unification_table_for_encode,
2230                   Qcharacter_unification_table_for_encode);
2231   if (!NILP (coding->character_unification_table_for_encode)
2232       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2233       coding->character_unification_table_for_encode = Qnil;
2234
2235   if (!VECTORP (coding_system)
2236       || XVECTOR (coding_system)->size != 5)
2237     goto label_invalid_coding_system;
2238
2239   if (VECTORP (eol_type))
2240     coding->eol_type = CODING_EOL_UNDECIDED;
2241   else if (XFASTINT (eol_type) == 1)
2242     coding->eol_type = CODING_EOL_CRLF;
2243   else if (XFASTINT (eol_type) == 2)
2244     coding->eol_type = CODING_EOL_CR;
2245   else
2246     coding->eol_type = CODING_EOL_LF;
2247
2248   type = XVECTOR (coding_system)->contents[0];
2249   switch (XFASTINT (type))
2250     {
2251     case 0:
2252       coding->type = coding_type_emacs_mule;
2253       break;
2254
2255     case 1:
2256       coding->type = coding_type_sjis;
2257       break;
2258
2259     case 2:
2260       coding->type = coding_type_iso2022;
2261       {
2262         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2263         Lisp_Object *flags;
2264         int i, charset, default_reg_bits = 0;
2265
2266         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2267           goto label_invalid_coding_system;
2268
2269         flags = XVECTOR (val)->contents;
2270         coding->flags
2271           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2272              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2273              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2274              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2275              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2276              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2277              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2278              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2279              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2280              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2281              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL));
2282
2283         /* Invoke graphic register 0 to plane 0.  */
2284         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2285         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2286         CODING_SPEC_ISO_INVOCATION (coding, 1)
2287           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2288         /* Not single shifting at first.  */
2289         CODING_SPEC_ISO_SINGLE_SHIFTING(coding) = 0;
2290         /* Beginning of buffer should also be regarded as bol. */
2291         CODING_SPEC_ISO_BOL(coding) = 1;
2292
2293         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2294            FLAGS[REG] can be one of below:
2295                 integer CHARSET: CHARSET occupies register I,
2296                 t: designate nothing to REG initially, but can be used
2297                   by any charsets,
2298                 list of integer, nil, or t: designate the first
2299                   element (if integer) to REG initially, the remaining
2300                   elements (if integer) is designated to REG on request,
2301                   if an element is t, REG can be used by any charset,
2302                 nil: REG is never used.  */
2303         for (charset = 0; charset <= MAX_CHARSET; charset++)
2304           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2305             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2306         for (i = 0; i < 4; i++)
2307           {
2308             if (INTEGERP (flags[i])
2309                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2310                 || (charset = get_charset_id (flags[i])) >= 0)
2311               {
2312                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2313                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2314               }
2315             else if (EQ (flags[i], Qt))
2316               {
2317                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2318                 default_reg_bits |= 1 << i;
2319               }
2320             else if (CONSP (flags[i]))
2321               {
2322                 Lisp_Object tail = flags[i];
2323
2324                 if (INTEGERP (XCONS (tail)->car)
2325                     && (charset = XINT (XCONS (tail)->car),
2326                         CHARSET_VALID_P (charset))
2327                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2328                   {
2329                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2330                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2331                   }
2332                 else
2333                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2334                 tail = XCONS (tail)->cdr;
2335                 while (CONSP (tail))
2336                   {
2337                     if (INTEGERP (XCONS (tail)->car)
2338                         && (charset = XINT (XCONS (tail)->car),
2339                             CHARSET_VALID_P (charset))
2340                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2341                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2342                         = i;
2343                     else if (EQ (XCONS (tail)->car, Qt))
2344                       default_reg_bits |= 1 << i;
2345                     tail = XCONS (tail)->cdr;
2346                   }
2347               }
2348             else
2349               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2350
2351             CODING_SPEC_ISO_DESIGNATION (coding, i)
2352               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2353           }
2354
2355         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2356           {
2357             /* REG 1 can be used only by locking shift in 7-bit env.  */
2358             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2359               default_reg_bits &= ~2;
2360             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2361               /* Without any shifting, only REG 0 and 1 can be used.  */
2362               default_reg_bits &= 3;
2363           }
2364
2365         for (charset = 0; charset <= MAX_CHARSET; charset++)
2366           if (CHARSET_VALID_P (charset)
2367               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2368                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2369             {
2370               /* We have not yet decided where to designate CHARSET.  */
2371               int reg_bits = default_reg_bits;
2372
2373               if (CHARSET_CHARS (charset) == 96)
2374                 /* A charset of CHARS96 can't be designated to REG 0.  */
2375                 reg_bits &= ~1;
2376
2377               if (reg_bits)
2378                 /* There exist some default graphic register.  */
2379                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2380                   = (reg_bits & 1
2381                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2382               else
2383                 /* We anyway have to designate CHARSET to somewhere.  */
2384                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2385                   = (CHARSET_CHARS (charset) == 94
2386                      ? 0
2387                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2388                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2389                         ? 1
2390                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2391                            ? 2 : 0)));
2392             }
2393       }
2394       coding->require_flushing = 1;
2395       break;
2396
2397     case 3:
2398       coding->type = coding_type_big5;
2399       coding->flags
2400         = (NILP (XVECTOR (coding_system)->contents[4])
2401            ? CODING_FLAG_BIG5_HKU
2402            : CODING_FLAG_BIG5_ETEN);
2403       break;
2404
2405     case 4:
2406       coding->type = coding_type_ccl;
2407       {
2408         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2409         if (CONSP  (val)
2410             && VECTORP (XCONS (val)->car)
2411             && VECTORP (XCONS (val)->cdr))
2412           {
2413             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2414             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2415           }
2416         else
2417           goto label_invalid_coding_system;
2418       }
2419       coding->require_flushing = 1;
2420       break;
2421
2422     default:
2423       if (EQ (type, Qt))
2424         coding->type = coding_type_undecided;
2425       else
2426         coding->type = coding_type_no_conversion;
2427       break;
2428     }
2429   return 0;
2430
2431  label_invalid_coding_system:
2432   coding->type = coding_type_no_conversion;
2433   coding->eol_type = CODING_EOL_LF;
2434   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2435     = Qnil;
2436   return -1;
2437 }
2438
2439 /* Emacs has a mechanism to automatically detect a coding system if it
2440    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2441    it's impossible to distinguish some coding systems accurately
2442    because they use the same range of codes.  So, at first, coding
2443    systems are classified into the following 9 categories:
2444
2445    o coding-category-emacs-mule
2446
2447         The category for a coding system which has the same code range
2448         as Emacs' internal format.  Assigned the coding-system (Lisp
2449         symbol) `emacs-mule' by default.
2450
2451    o coding-category-sjis
2452
2453         The category for a coding system which has the same code range
2454         as SJIS.  Assigned the coding-system (Lisp
2455         symbol) `japanese-shift-jis' by default.
2456
2457    o coding-category-iso-7
2458
2459         The category for a coding system which has the same code range
2460         as ISO2022 of 7-bit environment.  This doesn't use any locking
2461         shift and single shift functions.  Assigned the coding-system
2462         (Lisp symbol) `iso-2022-7bit' by default.
2463
2464    o coding-category-iso-8-1
2465
2466         The category for a coding system which has the same code range
2467         as ISO2022 of 8-bit environment and graphic plane 1 used only
2468         for DIMENSION1 charset.  This doesn't use any locking shift
2469         and single shift functions.  Assigned the coding-system (Lisp
2470         symbol) `iso-latin-1' by default.
2471
2472    o coding-category-iso-8-2
2473
2474         The category for a coding system which has the same code range
2475         as ISO2022 of 8-bit environment and graphic plane 1 used only
2476         for DIMENSION2 charset.  This doesn't use any locking shift
2477         and single shift functions.  Assigned the coding-system (Lisp
2478         symbol) `japanese-iso-8bit' by default.
2479
2480    o coding-category-iso-7-else
2481
2482         The category for a coding system which has the same code range
2483         as ISO2022 of 7-bit environment but uses locking shift or
2484         single shift functions.  Assigned the coding-system (Lisp
2485         symbol) `iso-2022-7bit-lock' by default.
2486
2487    o coding-category-iso-8-else
2488
2489         The category for a coding system which has the same code range
2490         as ISO2022 of 8-bit environment but uses locking shift or
2491         single shift functions.  Assigned the coding-system (Lisp
2492         symbol) `iso-2022-8bit-ss2' by default.
2493
2494    o coding-category-big5
2495
2496         The category for a coding system which has the same code range
2497         as BIG5.  Assigned the coding-system (Lisp symbol)
2498         `cn-big5' by default.
2499
2500    o coding-category-binary
2501
2502         The category for a coding system not categorized in any of the
2503         above.  Assigned the coding-system (Lisp symbol)
2504         `no-conversion' by default.
2505
2506    Each of them is a Lisp symbol whose value is the actual
2507    `coding-system' (also a Lisp symbol) assigned to it by a user.
2508    What Emacs actually does is to detect the category of a coding
2509    system.  Then, it uses the `coding-system' assigned to it.  If Emacs
2510    can't narrow the candidates down to one category, it selects the
2511    category of the highest priority.  Priorities of categories are
2512    also specified by a user in a Lisp variable `coding-category-list'.
2513
2514 */
2515
2516 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2517    If it detects possible coding systems, return an integer in which
2518    appropriate flag bits are set.  Flag bits are defined by macros
2519    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2520
2521 int
2522 detect_coding_mask (src, src_bytes)
2523      unsigned char *src;
2524      int src_bytes;
2525 {
2526   register unsigned char c;
2527   unsigned char *src_end = src + src_bytes;
2528   int mask;
2529
2530   /* At first, skip all ASCII characters and control characters except
2531      for three ISO2022 specific control characters.  */
2532  label_loop_detect_coding:
2533   while (src < src_end)
2534     {
2535       c = *src;
2536       if (c >= 0x80
2537           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2538         break;
2539       src++;
2540     }
2541
2542   if (src >= src_end)
2543     /* We found nothing other than ASCII.  There's nothing to do.  */
2544     return CODING_CATEGORY_MASK_ANY;
2545
2546   /* The text seems to be encoded in some multilingual coding system.
2547      Now, try to find in which coding system the text is encoded.  */
2548   if (c < 0x80)
2549     {
2550       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2551       /* C is an ISO2022 specific control code of C0.  */
2552       mask = detect_coding_iso2022 (src, src_end);
2553       src++;
2554       if (mask == CODING_CATEGORY_MASK_ANY)
2555         /* No valid ISO2022 code follows C.  Try again.  */
2556         goto label_loop_detect_coding;
2557     }
2558   else if (c == ISO_CODE_SS2 || c == ISO_CODE_SS3 || c == ISO_CODE_CSI)
2559     /* C is an ISO2022 specific control code of C1,
2560        or the first byte of SJIS's 2-byte character code,
2561        or a leading code of Emacs.  */
2562     mask = (detect_coding_iso2022 (src, src_end)
2563             | detect_coding_sjis (src, src_end)
2564             | detect_coding_emacs_mule (src, src_end));
2565
2566   else if (c < 0xA0)
2567     /* C is the first byte of SJIS character code,
2568        or a leading-code of Emacs.  */
2569     mask = (detect_coding_sjis (src, src_end)
2570             | detect_coding_emacs_mule (src, src_end));
2571
2572   else
2573     /* C is a character of ISO2022 in graphic plane right,
2574        or a SJIS's 1-byte character code (i.e. JISX0201),
2575        or the first byte of BIG5's 2-byte code.  */
2576     mask = (detect_coding_iso2022 (src, src_end)
2577             | detect_coding_sjis (src, src_end)
2578             | detect_coding_big5 (src, src_end));
2579
2580   return mask;
2581 }
2582
2583 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2584    The information of the detected coding system is set in CODING.  */
2585
2586 void
2587 detect_coding (coding, src, src_bytes)
2588      struct coding_system *coding;
2589      unsigned char *src;
2590      int src_bytes;
2591 {
2592   int mask = detect_coding_mask (src, src_bytes);
2593   int idx;
2594
2595   if (mask == CODING_CATEGORY_MASK_ANY)
2596     /* We found nothing other than ASCII.  There's nothing to do.  */
2597     return;
2598
2599   if (!mask)
2600     /* The source text seems to be encoded in unknown coding system.
2601        Emacs regards the category of such a kind of coding system as
2602        `coding-category-binary'.  We assume that a user has assigned
2603        an appropriate coding system for a `coding-category-binary'.  */
2604     idx = CODING_CATEGORY_IDX_BINARY;
2605   else
2606     {
2607       /* We found some plausible coding systems.  Let's use a coding
2608          system of the highest priority.  */
2609       Lisp_Object val = Vcoding_category_list;
2610
2611       if (CONSP (val))
2612         while (!NILP (val))
2613           {
2614             idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2615             if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2616               break;
2617             val = XCONS (val)->cdr;
2618           }
2619       else
2620         val = Qnil;
2621
2622       if (NILP (val))
2623         {
2624           /* For unknown reason, `Vcoding_category_list' contains none
2625              of found categories.  Let's use any of them.  */
2626           for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2627             if (mask & (1 << idx))
2628               break;
2629         }
2630     }
2631   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2632 }
2633
2634 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2635    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2636    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2637
2638 int
2639 detect_eol_type (src, src_bytes)
2640      unsigned char *src;
2641      int src_bytes;
2642 {
2643   unsigned char *src_end = src + src_bytes;
2644   unsigned char c;
2645
2646   while (src < src_end)
2647     {
2648       c = *src++;
2649       if (c == '\n')
2650         return CODING_EOL_LF;
2651       else if (c == '\r')
2652         {
2653           if (src < src_end && *src == '\n')
2654             return CODING_EOL_CRLF;
2655           else
2656             return CODING_EOL_CR;
2657         }
2658     }
2659   return CODING_EOL_UNDECIDED;
2660 }
2661
2662 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2663    is encoded.  If it detects an appropriate format of end-of-line, it
2664    sets the information in *CODING.  */
2665
2666 void
2667 detect_eol (coding, src, src_bytes)
2668      struct coding_system *coding;
2669      unsigned char *src;
2670      int src_bytes;
2671 {
2672   Lisp_Object val;
2673   int eol_type = detect_eol_type (src, src_bytes);
2674
2675   if (eol_type == CODING_EOL_UNDECIDED)
2676     /*  We found no end-of-line in the source text.  */
2677     return;
2678
2679   val = Fget (coding->symbol, Qeol_type);
2680   if (VECTORP (val) && XVECTOR (val)->size == 3)
2681     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2682 }
2683
2684 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2685    decoding, it may detect coding system and format of end-of-line if
2686    those are not yet decided.  */
2687
2688 int
2689 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2690      struct coding_system *coding;
2691      unsigned char *source, *destination;
2692      int src_bytes, dst_bytes;
2693      int *consumed;
2694 {
2695   int produced;
2696
2697   if (src_bytes <= 0)
2698     {
2699       *consumed = 0;
2700       return 0;
2701     }
2702
2703   if (coding->type == coding_type_undecided)
2704     detect_coding (coding, source, src_bytes);
2705
2706   if (coding->eol_type == CODING_EOL_UNDECIDED)
2707     detect_eol (coding, source, src_bytes);
2708
2709   coding->carryover_size = 0;
2710   switch (coding->type)
2711     {
2712     case coding_type_no_conversion:
2713     label_no_conversion:
2714       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2715       bcopy (source, destination, produced);
2716       *consumed = produced;
2717       break;
2718
2719     case coding_type_emacs_mule:
2720     case coding_type_undecided:
2721       if (coding->eol_type == CODING_EOL_LF
2722           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2723         goto label_no_conversion;
2724       produced = decode_eol (coding, source, destination,
2725                              src_bytes, dst_bytes, consumed);
2726       break;
2727
2728     case coding_type_sjis:
2729       produced = decode_coding_sjis_big5 (coding, source, destination,
2730                                           src_bytes, dst_bytes, consumed,
2731                                           1);
2732       break;
2733
2734     case coding_type_iso2022:
2735       produced = decode_coding_iso2022 (coding, source, destination,
2736                                         src_bytes, dst_bytes, consumed);
2737       break;
2738
2739     case coding_type_big5:
2740       produced = decode_coding_sjis_big5 (coding, source, destination,
2741                                           src_bytes, dst_bytes, consumed,
2742                                           0);
2743       break;
2744
2745     case coding_type_ccl:
2746       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2747                              src_bytes, dst_bytes, consumed);
2748       break;
2749     }
2750
2751   return produced;
2752 }
2753
2754 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2755
2756 int
2757 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2758      struct coding_system *coding;
2759      unsigned char *source, *destination;
2760      int src_bytes, dst_bytes;
2761      int *consumed;
2762 {
2763   int produced;
2764
2765   coding->carryover_size = 0;
2766   switch (coding->type)
2767     {
2768     case coding_type_no_conversion:
2769     label_no_conversion:
2770       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2771       if (produced > 0)
2772         {
2773           bcopy (source, destination, produced);
2774           if (coding->selective)
2775             {
2776               unsigned char *p = destination, *pend = destination + produced;
2777               while (p < pend)
2778                 if (*p++ == '\015') p[-1] = '\n';
2779             }
2780         }
2781       *consumed = produced;
2782       break;
2783
2784     case coding_type_emacs_mule:
2785     case coding_type_undecided:
2786       if (coding->eol_type == CODING_EOL_LF
2787           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2788         goto label_no_conversion;
2789       produced = encode_eol (coding, source, destination,
2790                              src_bytes, dst_bytes, consumed);
2791       break;
2792
2793     case coding_type_sjis:
2794       produced = encode_coding_sjis_big5 (coding, source, destination,
2795                                           src_bytes, dst_bytes, consumed,
2796                                           1);
2797       break;
2798
2799     case coding_type_iso2022:
2800       produced = encode_coding_iso2022 (coding, source, destination,
2801                                         src_bytes, dst_bytes, consumed);
2802       break;
2803
2804     case coding_type_big5:
2805       produced = encode_coding_sjis_big5 (coding, source, destination,
2806                                           src_bytes, dst_bytes, consumed,
2807                                           0);
2808       break;
2809
2810     case coding_type_ccl:
2811       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2812                              src_bytes, dst_bytes, consumed);
2813       break;
2814     }
2815
2816   return produced;
2817 }
2818
2819 #define CONVERSION_BUFFER_EXTRA_ROOM 256
2820
2821 /* Return maximum size (bytes) of a buffer enough for decoding
2822    SRC_BYTES of text encoded in CODING.  */
2823
2824 int
2825 decoding_buffer_size (coding, src_bytes)
2826      struct coding_system *coding;
2827      int src_bytes;
2828 {
2829   int magnification;
2830
2831   if (coding->type == coding_type_iso2022)
2832     magnification = 3;
2833   else if (coding->type == coding_type_ccl)
2834     magnification = coding->spec.ccl.decoder.buf_magnification;
2835   else
2836     magnification = 2;
2837
2838   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2839 }
2840
2841 /* Return maximum size (bytes) of a buffer enough for encoding
2842    SRC_BYTES of text to CODING.  */
2843
2844 int
2845 encoding_buffer_size (coding, src_bytes)
2846      struct coding_system *coding;
2847      int src_bytes;
2848 {
2849   int magnification;
2850
2851   if (coding->type == coding_type_ccl)
2852     magnification = coding->spec.ccl.encoder.buf_magnification;
2853   else
2854     magnification = 3;
2855
2856   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
2857 }
2858
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* The buffer shared by encoding and decoding, and its size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared `conversion_buffer' is returned
   as is when it is already large enough; otherwise it is replaced by
   a bigger one.  The old contents are NOT preserved across such a
   replacement.

   NOTE(review): the result of xmalloc is not checked here, so what
   happens when memory runs out depends entirely on xmalloc -- confirm
   against its definition.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size * 2;

      /* Never start the doubling below from less than the minimum
         size.  In particular a current size of 0 would otherwise stay
         0 forever and the loop would never terminate.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      /* There may be no buffer yet if none has been allocated before.  */
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
2887
2888 \f
2889 #ifdef emacs
2890 /*** 7. Emacs Lisp library functions ***/
2891
2892 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
2893        1, 1, 0,
2894   "Return coding-spec of CODING-SYSTEM.\n\
2895 If CODING-SYSTEM is not a valid coding-system, return nil.")
2896   (obj)
2897      Lisp_Object obj;
2898 {
2899   while (SYMBOLP (obj) && !NILP (obj))
2900     obj = Fget (obj, Qcoding_system);
2901   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
2902           ? Qnil : obj);
2903 }
2904
2905 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
2906   "Return t if OBJECT is nil or a coding-system.\n\
2907 See document of make-coding-system for coding-system object.")
2908   (obj)
2909      Lisp_Object obj;
2910 {
2911   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
2912 }
2913
2914 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
2915        Sread_non_nil_coding_system, 1, 1, 0,
2916   "Read a coding system from the minibuffer, prompting with string PROMPT.")
2917   (prompt)
2918      Lisp_Object prompt;
2919 {
2920   Lisp_Object val;
2921   do
2922     {
2923       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
2924                               Qt, Qnil, Qnil, Qnil);
2925     }
2926   while (XSTRING (val)->size == 0);
2927   return (Fintern (val, Qnil));
2928 }
2929
2930 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
2931   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
2932   (prompt)
2933      Lisp_Object prompt;
2934 {
2935   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
2936                                       Qt, Qnil, Qnil, Qnil);
2937   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
2938 }
2939
2940 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
2941        1, 1, 0,
2942   "Check validity of CODING-SYSTEM.\n\
2943 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
2944 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
2945 The value of property should be a vector of length 5.")
2946   (coding_system)
2947      Lisp_Object coding_system;
2948 {
2949   CHECK_SYMBOL (coding_system, 0);
2950   if (!NILP (Fcoding_system_p (coding_system)))
2951     return coding_system;
2952   while (1)
2953     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
2954 }
2955
2956 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
2957        2, 2, 0,
2958   "Detect coding-system of the text in the region between START and END.\n\
2959 Return a list of possible coding-systems ordered by priority.\n\
2960 If only ASCII characters are found, it returns `undecided'\n\
2961  or its subsidiary coding-system according to a detected end-of-line format.")
2962   (b, e)
2963      Lisp_Object b, e;
2964 {
2965   int coding_mask, eol_type;
2966   Lisp_Object val;
2967   int beg, end;
2968
2969   validate_region (&b, &e);
2970   beg = XINT (b), end = XINT (e);
2971   if (beg < GPT && end >= GPT) move_gap (end);
2972
2973   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
2974   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
2975
2976   if (coding_mask == CODING_CATEGORY_MASK_ANY)
2977     {
2978       val = intern ("undecided");
2979       if (eol_type != CODING_EOL_UNDECIDED)
2980         {
2981           Lisp_Object val2 = Fget (val, Qeol_type);
2982           if (VECTORP (val2))
2983             val = XVECTOR (val2)->contents[eol_type];
2984         }
2985     }
2986   else
2987     {
2988       Lisp_Object val2;
2989
2990       /* At first, gather possible coding-systems in VAL in a reverse
2991          order.  */
2992       val = Qnil;
2993       for (val2 = Vcoding_category_list;
2994            !NILP (val2);
2995            val2 = XCONS (val2)->cdr)
2996         {
2997           int idx
2998             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
2999           if (coding_mask & (1 << idx))
3000             val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3001         }
3002
3003       /* Then, change the order of the list, while getting subsidiary
3004          coding-systems.  */
3005       val2 = val;
3006       val = Qnil;
3007       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3008         {
3009           if (eol_type == CODING_EOL_UNDECIDED)
3010             val = Fcons (XCONS (val2)->car, val);
3011           else
3012             {
3013               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3014               if (VECTORP (val3))
3015                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3016               else
3017                 val = Fcons (XCONS (val2)->car, val);
3018             }
3019         }
3020     }
3021
3022   return val;
3023 }
3024
3025 /* Scan text in the region between *BEGP and *ENDP, skip characters
3026    which we never have to encode to (iff ENCODEP is 1) or decode from
3027    coding system CODING at the head and tail, then set BEGP and ENDP
3028    to the addresses of start and end of the text we actually convert.  */
3029
3030 void
3031 shrink_conversion_area (begp, endp, coding, encodep)
3032      unsigned char **begp, **endp;
3033      struct coding_system *coding;
3034      int encodep;
3035 {
3036   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3037
3038   if (coding->eol_type != CODING_EOL_LF
3039       && coding->eol_type != CODING_EOL_UNDECIDED)
3040     /* Since we anyway have to convert end-of-line format, it is not
3041        worth skipping at most 100 bytes or so.  */
3042     return;
3043
3044   if (encodep)                  /* for encoding */
3045     {
3046       switch (coding->type)
3047         {
3048         case coding_type_no_conversion:
3049         case coding_type_emacs_mule:
3050         case coding_type_undecided:
3051           /* We need no conversion.  */
3052           *begp = *endp;
3053           return;
3054         case coding_type_ccl:
3055           /* We can't skip any data.  */
3056           return;
3057         case coding_type_iso2022:
3058           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3059             {
3060               unsigned char *bol = beg_addr;
3061               while (beg_addr < end_addr && *beg_addr < 0x80)
3062                 {
3063                   beg_addr++;
3064                   if (*(beg_addr - 1) == '\n')
3065                     bol = beg_addr;
3066                 }
3067               beg_addr = bol;
3068               goto label_skip_tail;
3069             }
3070           /* fall down ... */
3071         default:
3072           /* We can skip all ASCII characters at the head and tail.  */
3073           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3074         label_skip_tail:
3075           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3076           break;
3077         }
3078     }
3079   else                          /* for decoding */
3080     {
3081       switch (coding->type)
3082         {
3083         case coding_type_no_conversion:
3084           /* We need no conversion.  */
3085           *begp = *endp;
3086           return;
3087         case coding_type_emacs_mule:
3088           if (coding->eol_type == CODING_EOL_LF)
3089             {
3090               /* We need no conversion.  */
3091               *begp = *endp;
3092               return;
3093             }
3094           /* We can skip all but carriage-return.  */
3095           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3096           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3097           break;
3098         case coding_type_sjis:
3099         case coding_type_big5:
3100           /* We can skip all ASCII characters at the head.  */
3101           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3102           /* We can skip all ASCII characters at the tail except for
3103              the second byte of SJIS or BIG5 code.  */
3104           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3105           if (end_addr != *endp)
3106             end_addr++;
3107           break;
3108         case coding_type_ccl:
3109           /* We can't skip any data.  */
3110           return;
3111         default:                /* i.e. case coding_type_iso2022: */
3112           {
3113             unsigned char c;
3114
3115             /* We can skip all ASCII characters except for a few
3116                control codes at the head.  */
3117             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3118                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3119                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3120               beg_addr++;
3121           }
3122           break;
3123         }
3124     }
3125   *begp = beg_addr;
3126   *endp = end_addr;
3127   return;
3128 }
3129
3130 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3131    text between B and E.  B and E are buffer position.  */
3132
3133 Lisp_Object
3134 code_convert_region (b, e, coding, encodep)
3135      Lisp_Object b, e;
3136      struct coding_system *coding;
3137      int encodep;
3138 {
3139   int beg, end, len, consumed, produced;
3140   char *buf;
3141   unsigned char *begp, *endp;
3142   int pos = PT;
3143
3144   validate_region (&b, &e);
3145   beg = XINT (b), end = XINT (e);
3146   if (beg < GPT && end >= GPT)
3147     move_gap (end);
3148
3149   if (encodep && !NILP (coding->pre_write_conversion))
3150     {
3151       /* We must call a pre-conversion function which may put a new
3152          text to be converted in a new buffer.  */
3153       struct buffer *old = current_buffer, *new;
3154
3155       TEMP_SET_PT (beg);
3156       call2 (coding->pre_write_conversion, b, e);
3157       if (old != current_buffer)
3158         {
3159           /* Replace the original text by the text just generated.  */
3160           len = ZV - BEGV;
3161           new = current_buffer;
3162           set_buffer_internal (old);
3163           del_range (beg, end);
3164           insert_from_buffer (new, 1, len, 0);
3165           end = beg + len;
3166         }
3167     }
3168
3169   /* We may be able to shrink the conversion region.  */
3170   begp = POS_ADDR (beg); endp = begp + (end - beg);
3171   shrink_conversion_area (&begp, &endp, coding, encodep);
3172
3173   if (begp == endp)
3174     /* We need no conversion.  */
3175     len = end - beg;
3176   else
3177     {
3178       beg += begp - POS_ADDR (beg);
3179       end =  beg + (endp - begp);
3180
3181       if (encodep)
3182         len = encoding_buffer_size (coding, end - beg);
3183       else
3184         len = decoding_buffer_size (coding, end - beg);
3185       buf = get_conversion_buffer (len);
3186
3187       coding->last_block = 1;
3188       produced = (encodep
3189                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3190                                    &consumed)
3191                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3192                                    &consumed));
3193
3194       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3195
3196       TEMP_SET_PT (beg);
3197       insert (buf, produced);
3198       del_range (PT, PT + end - beg);
3199       if (pos >= end)
3200         pos = PT + (pos - end);
3201       else if (pos > beg)
3202         pos = beg;
3203       TEMP_SET_PT (pos);
3204   }
3205
3206   if (!encodep && !NILP (coding->post_read_conversion))
3207     {
3208       /* We must call a post-conversion function which may alter
3209          the text just converted.  */
3210       Lisp_Object insval;
3211
3212       beg = XINT (b);
3213       TEMP_SET_PT (beg);
3214       insval = call1 (coding->post_read_conversion, make_number (len));
3215       CHECK_NUMBER (insval, 0);
3216       len = XINT (insval);
3217     }
3218
3219   return make_number (len);
3220 }
3221
3222 Lisp_Object
3223 code_convert_string (str, coding, encodep, nocopy)
3224      Lisp_Object str, nocopy;
3225      struct coding_system *coding;
3226      int encodep;
3227 {
3228   int len, consumed, produced;
3229   char *buf;
3230   unsigned char *begp, *endp;
3231   int head_skip, tail_skip;
3232   struct gcpro gcpro1;
3233
3234   if (encodep && !NILP (coding->pre_write_conversion)
3235       || !encodep && !NILP (coding->post_read_conversion))
3236     {
3237       /* Since we have to call Lisp functions which assume target text
3238          is in a buffer, after setting a temporary buffer, call
3239          code_convert_region.  */
3240       int count = specpdl_ptr - specpdl;
3241       int len = XSTRING (str)->size;
3242       Lisp_Object result;
3243       struct buffer *old = current_buffer;
3244
3245       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3246       temp_output_buffer_setup (" *code-converting-work*");
3247       set_buffer_internal (XBUFFER (Vstandard_output));
3248       insert_from_string (str, 0, len, 0);
3249       code_convert_region (make_number (BEGV), make_number (ZV),
3250                            coding, encodep);
3251       result = make_buffer_string (BEGV, ZV, 0);
3252       set_buffer_internal (old);
3253       return unbind_to (count, result);
3254     }
3255
3256   /* We may be able to shrink the conversion region.  */
3257   begp = XSTRING (str)->data;
3258   endp = begp + XSTRING (str)->size;
3259   shrink_conversion_area (&begp, &endp, coding, encodep);
3260
3261   if (begp == endp)
3262     /* We need no conversion.  */
3263     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3264
3265   head_skip = begp - XSTRING (str)->data;
3266   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3267
3268   GCPRO1 (str);
3269
3270   if (encodep)
3271     len = encoding_buffer_size (coding, endp - begp);
3272   else
3273     len = decoding_buffer_size (coding, endp - begp);
3274   buf = get_conversion_buffer (len + head_skip + tail_skip);
3275
3276   bcopy (XSTRING (str)->data, buf, head_skip);
3277   coding->last_block = 1;
3278   produced = (encodep
3279               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3280                                buf + head_skip, endp - begp, len, &consumed)
3281               : decode_coding (coding, XSTRING (str)->data + head_skip,
3282                                buf + head_skip, endp - begp, len, &consumed));
3283   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3284          buf + head_skip + produced,
3285          tail_skip);
3286
3287   UNGCPRO;
3288
3289   return make_string (buf, head_skip + produced + tail_skip);
3290 }
3291
3292 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3293        3, 3, "r\nzCoding system: ",
3294   "Decode current region by specified coding system.\n\
3295 When called from a program, takes three arguments:\n\
3296 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3297 Return length of decoded text.")
3298   (b, e, coding_system)
3299      Lisp_Object b, e, coding_system;
3300 {
3301   struct coding_system coding;
3302
3303   CHECK_NUMBER_COERCE_MARKER (b, 0);
3304   CHECK_NUMBER_COERCE_MARKER (e, 1);
3305   CHECK_SYMBOL (coding_system, 2);
3306
3307   if (NILP (coding_system))
3308     return make_number (XFASTINT (e) - XFASTINT (b));
3309   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3310     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3311
3312   return code_convert_region (b, e, &coding, 0);
3313 }
3314
3315 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3316        3, 3, "r\nzCoding system: ",
3317   "Encode current region by specified coding system.\n\
3318 When called from a program, takes three arguments:\n\
3319 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3320 Return length of encoded text.")
3321   (b, e, coding_system)
3322      Lisp_Object b, e, coding_system;
3323 {
3324   struct coding_system coding;
3325
3326   CHECK_NUMBER_COERCE_MARKER (b, 0);
3327   CHECK_NUMBER_COERCE_MARKER (e, 1);
3328   CHECK_SYMBOL (coding_system, 2);
3329
3330   if (NILP (coding_system))
3331     return make_number (XFASTINT (e) - XFASTINT (b));
3332   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3333     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3334
3335   return code_convert_region (b, e, &coding, 1);
3336 }
3337
3338 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3339        2, 3, 0,
3340   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3341 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3342 of decoding.")
3343   (string, coding_system, nocopy)
3344      Lisp_Object string, coding_system, nocopy;
3345 {
3346   struct coding_system coding;
3347
3348   CHECK_STRING (string, 0);
3349   CHECK_SYMBOL (coding_system, 1);
3350
3351   if (NILP (coding_system))
3352     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3353   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3354     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3355
3356   return code_convert_string (string, &coding, 0, nocopy);
3357 }
3358
3359 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3360        2, 3, 0,
3361   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3362 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3363 of encoding.")
3364   (string, coding_system, nocopy)
3365      Lisp_Object string, coding_system, nocopy;
3366 {
3367   struct coding_system coding;
3368
3369   CHECK_STRING (string, 0);
3370   CHECK_SYMBOL (coding_system, 1);
3371
3372   if (NILP (coding_system))
3373     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3374   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3375     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3376
3377   return code_convert_string (string, &coding, 1, nocopy);
3378 }
3379
3380 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3381   "Decode a JISX0208 character of shift-jis encoding.\n\
3382 CODE is the character code in SJIS.\n\
3383 Return the corresponding character.")
3384   (code)
3385      Lisp_Object code;
3386 {
3387   unsigned char c1, c2, s1, s2;
3388   Lisp_Object val;
3389
3390   CHECK_NUMBER (code, 0);
3391   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3392   DECODE_SJIS (s1, s2, c1, c2);
3393   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3394   return val;
3395 }
3396
3397 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3398   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3399 Return the corresponding character code in SJIS.")
3400   (ch)
3401      Lisp_Object ch;
3402 {
3403   int charset, c1, c2, s1, s2;
3404   Lisp_Object val;
3405
3406   CHECK_NUMBER (ch, 0);
3407   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3408   if (charset == charset_jisx0208)
3409     {
3410       ENCODE_SJIS (c1, c2, s1, s2);
3411       XSETFASTINT (val, (s1 << 8) | s2);
3412     }
3413   else
3414     XSETFASTINT (val, 0);
3415   return val;
3416 }
3417
3418 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3419   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3420 CODE is the character code in BIG5.\n\
3421 Return the corresponding character.")
3422   (code)
3423      Lisp_Object code;
3424 {
3425   int charset;
3426   unsigned char b1, b2, c1, c2;
3427   Lisp_Object val;
3428
3429   CHECK_NUMBER (code, 0);
3430   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3431   DECODE_BIG5 (b1, b2, charset, c1, c2);
3432   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3433   return val;
3434 }
3435
3436 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3437   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3438 Return the corresponding character code in Big5.")
3439   (ch)
3440      Lisp_Object ch;
3441 {
3442   int charset, c1, c2, b1, b2;
3443   Lisp_Object val;
3444
3445   CHECK_NUMBER (ch, 0);
3446   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3447   if (charset == charset_big5_1 || charset == charset_big5_2)
3448     {
3449       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3450       XSETFASTINT (val, (b1 << 8) | b2);
3451     }
3452   else
3453     XSETFASTINT (val, 0);
3454   return val;
3455 }
3456
3457 DEFUN ("set-terminal-coding-system-internal",
3458        Fset_terminal_coding_system_internal,
3459        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3460   (coding_system)
3461      Lisp_Object coding_system;
3462 {
3463   CHECK_SYMBOL (coding_system, 0);
3464   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3465   return Qnil;
3466 }
3467
3468 DEFUN ("terminal-coding-system",
3469        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3470   "Return coding-system of your terminal.")
3471   ()
3472 {
3473   return terminal_coding.symbol;
3474 }
3475
3476 DEFUN ("set-keyboard-coding-system-internal",
3477        Fset_keyboard_coding_system_internal,
3478        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3479   (coding_system)
3480      Lisp_Object coding_system;
3481 {
3482   CHECK_SYMBOL (coding_system, 0);
3483   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3484   return Qnil;
3485 }
3486
3487 DEFUN ("keyboard-coding-system",
3488        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3489   "Return coding-system of what is sent from terminal keyboard.")
3490   ()
3491 {
3492   return keyboard_coding.symbol;
3493 }
3494
3495 \f
3496 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3497        Sfind_operation_coding_system,  1, MANY, 0,
3498   "Choose a coding system for an operation based on the target name.\n\
3499 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3500 DECODING-SYSTEM is the coding system to use for decoding\n\
3501 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3502 for encoding (in case OPERATION does encoding).\n\
3503 \n\
3504 The first argument OPERATION specifies an I/O primitive:\n\
3505   For file I/O, `insert-file-contents' or `write-region'.\n\
3506   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3507   For network I/O, `open-network-stream'.\n\
3508 \n\
3509 The remaining arguments should be the same arguments that were passed\n\
3510 to the primitive.  Depending on which primitive, one of those arguments\n\
3511 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3512 whichever argument specifies the file name is TARGET.\n\
3513 \n\
3514 TARGET has a meaning which depends on OPERATION:\n\
3515   For file I/O, TARGET is a file name.\n\
3516   For process I/O, TARGET is a process name.\n\
3517   For network I/O, TARGET is a service name or a port number\n\
3518 \n\
3519 This function looks up what specified for TARGET in,\n\
3520 `file-coding-system-alist', `process-coding-system-alist',\n\
3521 or `network-coding-system-alist' depending on OPERATION.\n\
3522 They may specify a coding system, a cons of coding systems,\n\
3523 or a function symbol to call.\n\
3524 In the last case, we call the function with one argument,\n\
3525 which is a list of all the arguments given to this function.")
3526   (nargs, args)
3527      int nargs;
3528      Lisp_Object *args;
3529 {
3530   Lisp_Object operation, target_idx, target, val;
3531   register Lisp_Object chain;
3532
3533   if (nargs < 2)
3534     error ("Too few arguments");
3535   operation = args[0];
3536   if (!SYMBOLP (operation)
3537       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3538     error ("Invalid first arguement");
3539   if (nargs < 1 + XINT (target_idx))
3540     error ("Too few arguments for operation: %s",
3541            XSYMBOL (operation)->name->data);
3542   target = args[XINT (target_idx) + 1];
3543   if (!(STRINGP (target)
3544         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3545     error ("Invalid %dth argument", XINT (target_idx) + 1);
3546
3547   chain = ((EQ (operation, Qinsert_file_contents)
3548             || EQ (operation, Qwrite_region))
3549            ? Vfile_coding_system_alist
3550            : (EQ (operation, Qopen_network_stream)
3551               ? Vnetwork_coding_system_alist
3552               : Vprocess_coding_system_alist));
3553   if (NILP (chain))
3554     return Qnil;
3555
3556   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3557     {
3558       Lisp_Object elt = XCONS (chain)->car;
3559
3560       if (CONSP (elt)
3561           && ((STRINGP (target)
3562                && STRINGP (XCONS (elt)->car)
3563                && fast_string_match (XCONS (elt)->car, target) >= 0)
3564               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3565         {
3566           val = XCONS (elt)->cdr;
3567           if (CONSP (val))
3568             return val;
3569           if (! SYMBOLP (val))
3570             return Qnil;
3571           if (! NILP (Fcoding_system_p (val)))
3572             return Fcons (val, val);
3573           if (!NILP (Fboundp (val)))
3574             return call1 (val, Flist (nargs, args));
3575           return Qnil;
3576         }
3577     }
3578   return Qnil;
3579 }
3580
3581 #endif /* emacs */
3582
3583 \f
3584 /*** 8. Post-amble ***/
3585
3586 init_coding_once ()
3587 {
3588   int i;
3589
3590   /* Emacs' internal format specific initialize routine.  */
3591   for (i = 0; i <= 0x20; i++)
3592     emacs_code_class[i] = EMACS_control_code;
3593   emacs_code_class[0x0A] = EMACS_linefeed_code;
3594   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3595   for (i = 0x21 ; i < 0x7F; i++)
3596     emacs_code_class[i] = EMACS_ascii_code;
3597   emacs_code_class[0x7F] = EMACS_control_code;
3598   emacs_code_class[0x80] = EMACS_leading_code_composition;
3599   for (i = 0x81; i < 0xFF; i++)
3600     emacs_code_class[i] = EMACS_invalid_code;
3601   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3602   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3603   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3604   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3605
3606   /* ISO2022 specific initialize routine.  */
3607   for (i = 0; i < 0x20; i++)
3608     iso_code_class[i] = ISO_control_code;
3609   for (i = 0x21; i < 0x7F; i++)
3610     iso_code_class[i] = ISO_graphic_plane_0;
3611   for (i = 0x80; i < 0xA0; i++)
3612     iso_code_class[i] = ISO_control_code;
3613   for (i = 0xA1; i < 0xFF; i++)
3614     iso_code_class[i] = ISO_graphic_plane_1;
3615   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3616   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3617   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3618   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3619   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3620   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3621   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3622   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3623   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3624   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3625
3626   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3627   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3628
3629   setup_coding_system (Qnil, &keyboard_coding);
3630   setup_coding_system (Qnil, &terminal_coding);
3631
3632 #if defined (MSDOS) || defined (WINDOWSNT)
3633   system_eol_type = CODING_EOL_CRLF;
3634 #else
3635   system_eol_type = CODING_EOL_LF;
3636 #endif
3637 }
3638
3639 #ifdef emacs
3640
3641 syms_of_coding ()
3642 {
3643   Qtarget_idx = intern ("target-idx");
3644   staticpro (&Qtarget_idx);
3645
3646   /* Target FILENAME is the first argument.  */
3647   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3648   /* Target FILENAME is the third argument.  */
3649   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3650
3651   Qcall_process = intern ("call-process");
3652   staticpro (&Qcall_process);
3653   /* Target PROGRAM is the first argument.  */
3654   Fput (Qcall_process, Qtarget_idx, make_number (0));
3655
3656   Qcall_process_region = intern ("call-process-region");
3657   staticpro (&Qcall_process_region);
3658   /* Target PROGRAM is the third argument.  */
3659   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3660
3661   Qstart_process = intern ("start-process");
3662   staticpro (&Qstart_process);
3663   /* Target PROGRAM is the third argument.  */
3664   Fput (Qstart_process, Qtarget_idx, make_number (2));
3665
3666   Qopen_network_stream = intern ("open-network-stream");
3667   staticpro (&Qopen_network_stream);
3668   /* Target SERVICE is the fourth argument.  */
3669   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3670
3671   Qcoding_system = intern ("coding-system");
3672   staticpro (&Qcoding_system);
3673
3674   Qeol_type = intern ("eol-type");
3675   staticpro (&Qeol_type);
3676
3677   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3678   staticpro (&Qbuffer_file_coding_system);
3679
3680   Qpost_read_conversion = intern ("post-read-conversion");
3681   staticpro (&Qpost_read_conversion);
3682
3683   Qpre_write_conversion = intern ("pre-write-conversion");
3684   staticpro (&Qpre_write_conversion);
3685
3686   Qcoding_system_spec = intern ("coding-system-spec");
3687   staticpro (&Qcoding_system_spec);
3688
3689   Qcoding_system_p = intern ("coding-system-p");
3690   staticpro (&Qcoding_system_p);
3691
3692   Qcoding_system_error = intern ("coding-system-error");
3693   staticpro (&Qcoding_system_error);
3694
3695   Fput (Qcoding_system_error, Qerror_conditions,
3696         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3697   Fput (Qcoding_system_error, Qerror_message,
3698         build_string ("Invalid coding system"));
3699
3700   Qcoding_category_index = intern ("coding-category-index");
3701   staticpro (&Qcoding_category_index);
3702
3703   {
3704     int i;
3705     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3706       {
3707         coding_category_table[i] = intern (coding_category_name[i]);
3708         staticpro (&coding_category_table[i]);
3709         Fput (coding_category_table[i], Qcoding_category_index,
3710               make_number (i));
3711       }
3712   }
3713
3714   Qcharacter_unification_table = intern ("character-unification-table");
3715   staticpro (&Qcharacter_unification_table);
3716   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3717         make_number (0));
3718
3719   Qcharacter_unification_table_for_decode
3720     = intern ("character-unification-table-for-decode");
3721   staticpro (&Qcharacter_unification_table_for_decode);
3722
3723   Qcharacter_unification_table_for_encode
3724     = intern ("character-unification-table-for-encode");
3725   staticpro (&Qcharacter_unification_table_for_encode);
3726
3727   Qemacs_mule = intern ("emacs-mule");
3728   staticpro (&Qemacs_mule);
3729
3730   defsubr (&Scoding_system_spec);
3731   defsubr (&Scoding_system_p);
3732   defsubr (&Sread_coding_system);
3733   defsubr (&Sread_non_nil_coding_system);
3734   defsubr (&Scheck_coding_system);
3735   defsubr (&Sdetect_coding_region);
3736   defsubr (&Sdecode_coding_region);
3737   defsubr (&Sencode_coding_region);
3738   defsubr (&Sdecode_coding_string);
3739   defsubr (&Sencode_coding_string);
3740   defsubr (&Sdecode_sjis_char);
3741   defsubr (&Sencode_sjis_char);
3742   defsubr (&Sdecode_big5_char);
3743   defsubr (&Sencode_big5_char);
3744   defsubr (&Sset_terminal_coding_system_internal);
3745   defsubr (&Sterminal_coding_system);
3746   defsubr (&Sset_keyboard_coding_system_internal);
3747   defsubr (&Skeyboard_coding_system);
3748   defsubr (&Sfind_operation_coding_system);
3749
3750   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3751     "List of coding-categories (symbols) ordered by priority.");
3752   {
3753     int i;
3754
3755     Vcoding_category_list = Qnil;
3756     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3757       Vcoding_category_list
3758         = Fcons (coding_category_table[i], Vcoding_category_list);
3759   }
3760
3761   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3762     "A variable of internal use only.\n\
3763 If the value is a coding system, it is used for decoding on read operation.\n\
3764 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3765   Vcoding_system_for_read = Qnil;
3766
3767   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3768     "A variable of internal use only.\n\
3769 If the value is a coding system, it is used for encoding on write operation.\n\
3770 If not, an appropriate element in `coding-system-alist' (which see) is used.");
3771   Vcoding_system_for_write = Qnil;
3772
3773   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
3774     "Coding-system used in the latest file or process I/O.");
3775   Vlast_coding_system_used = Qnil;
3776
3777   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
3778     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
3779   inhibit_eol_conversion = 0;
3780
3781   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
3782     "Alist to decide a coding system to use for a file I/O operation.\n\
3783 The format is ((PATTERN . VAL) ...),\n\
3784 where PATTERN is a regular expression matching a file name,\n\
3785 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3786 If VAL is a coding system, it is used for both decoding and encoding\n\
3787 the file contents.\n\
3788 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3789 and the cdr part is used for encoding.\n\
3790 If VAL is a function symbol, the function must return a coding system\n\
3791 or a cons of coding systems which are used as above.\n\
3792 \n\
3793 See also the function `find-operation-coding-system'.");
3794   Vfile_coding_system_alist = Qnil;
3795
3796   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
3797     "Alist to decide a coding system to use for a process I/O operation.\n\
3798 The format is ((PATTERN . VAL) ...),\n\
3799 where PATTERN is a regular expression matching a program name,\n\
3800 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3801 If VAL is a coding system, it is used for both decoding what received\n\
3802 from the program and encoding what sent to the program.\n\
3803 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3804 and the cdr part is used for encoding.\n\
3805 If VAL is a function symbol, the function must return a coding system\n\
3806 or a cons of coding systems which are used as above.\n\
3807 \n\
3808 See also the function `find-operation-coding-system'.");
3809   Vprocess_coding_system_alist = Qnil;
3810
3811   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
3812     "Alist to decide a coding system to use for a network I/O operation.\n\
3813 The format is ((PATTERN . VAL) ...),\n\
3814 where PATTERN is a regular expression matching a network service name\n\
3815 or is a port number to connect to,\n\
3816 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
3817 If VAL is a coding system, it is used for both decoding what received\n\
3818 from the network stream and encoding what sent to the network stream.\n\
3819 If VAL is a cons of coding systems, the car part is used for decoding,\n\
3820 and the cdr part is used for encoding.\n\
3821 If VAL is a function symbol, the function must return a coding system\n\
3822 or a cons of coding systems which are used as above.\n\
3823 \n\
3824 See also the function `find-operation-coding-system'.");
3825   Vnetwork_coding_system_alist = Qnil;
3826
3827   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
3828     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
3829   eol_mnemonic_unix = ':';
3830
3831   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
3832     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
3833   eol_mnemonic_dos = '\\';
3834
3835   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
3836     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
3837   eol_mnemonic_mac = '/';
3838
3839   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
3840     "Mnemonic character indicating end-of-line format is not yet decided.");
3841   eol_mnemonic_undecided = ':';
3842
3843   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
3844     "Non-nil means ISO 2022 encoder/decoder do character unification.");
3845   Venable_character_unification = Qt;
3846
3847   DEFVAR_LISP ("standard-character-unification-table-for-decode",
3848     &Vstandard_character_unification_table_for_decode,
3849     "Table for unifying characters when reading.");
3850   Vstandard_character_unification_table_for_decode = Qnil;
3851
3852   DEFVAR_LISP ("standard-character-unification-table-for-encode",
3853     &Vstandard_character_unification_table_for_encode,
3854     "Table for unifying characters when writing.");
3855   Vstandard_character_unification_table_for_encode = Qnil;
3856
3857   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
3858     "Alist of charsets vs revision numbers.\n\
3859 While encoding, if a charset (car part of an element) is found,\n\
3860 designate it with the escape sequence identifing revision (cdr part of the element).");
3861   Vcharset_revision_alist = Qnil;
3862
3863   DEFVAR_LISP ("default-process-coding-system",
3864                &Vdefault_process_coding_system,
3865     "Cons of coding systems used for process I/O by default.\n\
3866 The car part is used for decoding a process output,\n\
3867 the cdr part is used for encoding a text to be sent to a process.");
3868   Vdefault_process_coding_system = Qnil;
3869 }
3870
3871 #endif /* emacs */