src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   1. Preamble
  25   2. Emacs' internal format (emacs-mule) handlers
  26   3. ISO2022 handlers
  27   4. Shift-JIS and BIG5 handlers
  28   5. End-of-line handlers
  29   6. C library functions
  30   7. Emacs Lisp library functions
  31   8. Post-amble
  32
  33 */
  34
  35 /*** GENERAL NOTE on CODING SYSTEM ***
  36
  37   Coding system is an encoding mechanism of one or more character
  38   sets.  Here's a list of coding systems which Emacs can handle.  When
  39   we say "decode", it means converting some other coding system to
  40   Emacs' internal format (emacs-mule), and when we say "encode",
  41   it means converting the coding system emacs-mule to some other
  42   coding system.
  43
  44   0. Emacs' internal format (emacs-mule)
  45
  46   Emacs itself holds a multi-lingual character in a buffer and a string
  47   in a special format.  Details are described in section 2.
  48
  49   1. ISO2022
  50
  51   The most famous coding system for multiple character sets.  X's
  52   Compound Text, various EUCs (Extended Unix Code), and coding
  53   systems used in Internet communication such as ISO-2022-JP are
  54   all variants of ISO2022.  Details are described in section 3.
  55
  56   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  57
  58   A coding system to encode character sets: ASCII, JISX0201, and
  59   JISX0208.  Widely used for PC's in Japan.  Details are described in
  60   section 4.
  61
  62   3. BIG5
  63
  64   A coding system to encode character sets: ASCII and Big5.  Widely
  65   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  66   described in section 4.  In this file, when we write "BIG5"
  67   (all uppercase), we mean the coding system, and when we write
  68   "Big5" (capitalized), we mean the character set.
  69
  70   4. Raw text
  71
  72   A coding system for a text containing random 8-bit code.  Emacs
  73   does no code conversion on such a text except for end-of-line
  74   format.
  75
  76   5. Other
  77
  78   If a user wants to read/write a text encoded in a coding system not
  79   listed above, he can supply a decoder and an encoder for it in CCL
  80   (Code Conversion Language) programs.  Emacs executes the CCL program
  81   while reading/writing.
  82
  83   Emacs represents a coding-system by a Lisp symbol that has a property
  84   `coding-system'.  But, before actually using the coding-system, the
  85   information about it is set in a structure of type `struct
  86   coding_system' for rapid processing.  See section 6 for more details.
  87
  88 */
  89
  90 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  91
  92   How end-of-line of a text is encoded depends on a system.  For
  93   instance, Unix's format is just one byte of `line-feed' code,
  94   whereas DOS's format is two-byte sequence of `carriage-return' and
  95   `line-feed' codes.  MacOS's format is one byte of `carriage-return'.
  96
  97   Since text characters encoding and end-of-line encoding are
  98   independent, any coding system described above can take
  99   any format of end-of-line.  So, Emacs has information of format of
 100   end-of-line in each coding-system.  See section 6 for more details.
 101
 102 */
 103
 104 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 105
 106   These functions check if a text between SRC and SRC_END is encoded
 107   in the coding system category XXX.  Each returns an integer value in
 108   which appropriate flag bits for the category XXX are set.  The flag
 109   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 110   template of these functions.  */
 111 #if 0
 112 int
 113 detect_coding_emacs_mule (src, src_end)
 114      unsigned char *src, *src_end;
 115 {
 116   ...
 117 }
 118 #endif
 119
 120 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 121
 122   These functions decode SRC_BYTES length text at SOURCE encoded in
 123   CODING to Emacs' internal format (emacs-mule).  The resulting text
 124   goes to a place pointed to by DESTINATION, the length of which should
 125   not exceed DST_BYTES.  The number of bytes actually processed is
 126   returned as *CONSUMED.  The return value is the length of the decoded
 127   text.  Below is a template of these functions.  */
 128 #if 0
 129 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 130      struct coding_system *coding;
 131      unsigned char *source, *destination;
 132      int src_bytes, dst_bytes;
 133      int *consumed;
 134 {
 135   ...
 136 }
 137 #endif
 138
 139 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 140
 141   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 142   internal format (emacs-mule) to CODING.  The resulting text goes to
 143   a place pointed to by DESTINATION, the length of which should not
 144   exceed DST_BYTES.  The number of bytes actually processed is
 145   returned as *CONSUMED.  The return value is the length of the
 146   encoded text.  Below is a template of these functions.  */
 147 #if 0
 148 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes, consumed)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152      int *consumed;
 153 {
 154   ...
 155 }
 156 #endif
 157
 158 /*** COMMONLY USED MACROS ***/
 159
 160 /* The following three macros ONE_MORE_BYTE, TWO_MORE_BYTES, and
 161    THREE_MORE_BYTES safely get one, two, and three bytes from the
 162    source text respectively.  If there are not enough bytes in the
 163    source, they jump to `label_end_of_loop'.  The caller should set
 164    variables `src' and `src_end' to appropriate areas in advance.  */
 165
 166 #define ONE_MORE_BYTE(c1)       \
 167   do {                          \
 168     if (src < src_end)          \
 169       c1 = *src++;              \
 170     else                        \
 171       goto label_end_of_loop;   \
 172   } while (0)
 173
 174 #define TWO_MORE_BYTES(c1, c2)  \
 175   do {                          \
 176     if (src + 1 < src_end)      \
 177       c1 = *src++, c2 = *src++; \
 178     else                        \
 179       goto label_end_of_loop;   \
 180   } while (0)
 181
 182 #define THREE_MORE_BYTES(c1, c2, c3)            \
 183   do {                                          \
 184     if (src + 2 < src_end)                      \
 185       c1 = *src++, c2 = *src++, c3 = *src++;    \
 186     else                                        \
 187       goto label_end_of_loop;                   \
 188   } while (0)
 189
 190 /* The following three macros DECODE_CHARACTER_ASCII,
 191    DECODE_CHARACTER_DIMENSION1, and DECODE_CHARACTER_DIMENSION2 put
 192    the multi-byte form of a character of each class at the place
 193    pointed by `dst'.  The caller should set the variable `dst' to
 194    point to an appropriate area and the variable `coding' to point to
 195    the coding-system of the currently decoding text in advance.  */
 196
 197 /* Decode one ASCII character C.  */
 198
 199 #define DECODE_CHARACTER_ASCII(c)                               \
 200   do {                                                          \
 201     if (COMPOSING_P (coding->composing))                        \
 202       *dst++ = 0xA0, *dst++ = (c) | 0x80;                       \
 203     else                                                        \
 204       *dst++ = (c);                                             \
 205   } while (0)
 206
 207 /* Decode one DIMENSION1 character whose charset is CHARSET and whose
 208    position-code is C.  */
 209
 210 #define DECODE_CHARACTER_DIMENSION1(charset, c)                         \
 211   do {                                                                  \
 212     unsigned char leading_code = CHARSET_LEADING_CODE_BASE (charset);   \
 213     if (COMPOSING_P (coding->composing))                                \
 214       *dst++ = leading_code + 0x20;                                     \
 215     else                                                                \
 216       *dst++ = leading_code;                                            \
 217     if (leading_code = CHARSET_LEADING_CODE_EXT (charset))              \
 218       *dst++ = leading_code;                                            \
 219     *dst++ = (c) | 0x80;                                                \
 220   } while (0)
 221
 222 /* Decode one DIMENSION2 character whose charset is CHARSET and whose
 223    position-codes are C1 and C2.  */
 224
 225 #define DECODE_CHARACTER_DIMENSION2(charset, c1, c2)    \
 226   do {                                                  \
 227     DECODE_CHARACTER_DIMENSION1 (charset, c1);          \
 228     *dst++ = (c2) | 0x80;                               \
 229   } while (0)
 230
 231 \f
 232 /*** 1. Preamble ***/
 233
 234 #include <stdio.h>
 235
 236 #ifdef emacs
 237
 238 #include <config.h>
 239 #include "lisp.h"
 240 #include "buffer.h"
 241 #include "charset.h"
 242 #include "ccl.h"
 243 #include "coding.h"
 244 #include "window.h"
 245
 246 #else  /* not emacs */
 247
 248 #include "mulelib.h"
 249
 250 #endif /* not emacs */
 251
 252 Lisp_Object Qcoding_system, Qeol_type;
 253 Lisp_Object Qbuffer_file_coding_system;
 254 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 255 Lisp_Object Qno_conversion, Qundecided;
 256
 257 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 258 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 259 Lisp_Object Qstart_process, Qopen_network_stream;
 260 Lisp_Object Qtarget_idx;
 261
 262 /* Mnemonic character of each format of end-of-line.  */
 263 int eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 264 /* Mnemonic character to indicate format of end-of-line is not yet
 265    decided.  */
 266 int eol_mnemonic_undecided;
 267
 268 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 269    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 270 int system_eol_type;
 271
 272 #ifdef emacs
 273
 274 Lisp_Object Qcoding_system_spec, Qcoding_system_p, Qcoding_system_error;
 275
 276 /* Coding system emacs-mule is for converting only end-of-line format.  */
 277 Lisp_Object Qemacs_mule;
 278
 279 /* Coding-systems are handed between Emacs Lisp programs and C internal
 280    routines by the following three variables.  */
 281 /* Coding-system for reading files and receiving data from process.  */
 282 Lisp_Object Vcoding_system_for_read;
 283 /* Coding-system for writing files and sending data to process.  */
 284 Lisp_Object Vcoding_system_for_write;
 285 /* Coding-system actually used in the latest I/O.  */
 286 Lisp_Object Vlast_coding_system_used;
 287
 288 /* A vector of length 256 which contains information about special
 289    Latin codes (espepcially for dealing with Microsoft code).  */
 290 Lisp_Object Vlatin_extra_code_table;
 291
 292 /* Flag to inhibit code conversion of end-of-line format.  */
 293 int inhibit_eol_conversion;
 294
 295 /* Coding system to be used to encode text for terminal display.  */
 296 struct coding_system terminal_coding;
 297
 298 /* Coding system to be used to encode text for terminal display when
 299    terminal coding system is nil.  */
 300 struct coding_system safe_terminal_coding;
 301
 302 /* Coding system of what is sent from terminal keyboard.  */
 303 struct coding_system keyboard_coding;
 304
 305 Lisp_Object Vfile_coding_system_alist;
 306 Lisp_Object Vprocess_coding_system_alist;
 307 Lisp_Object Vnetwork_coding_system_alist;
 308
 309 #endif /* emacs */
 310
/* Lisp object related to coding-category indices; presumably the
   symbol `coding-category-index' -- its initialization is not in this
   part of the file.  */
Lisp_Object Qcoding_category_index;

/* List of symbols `coding-category-xxx' ordered by priority.  */
Lisp_Object Vcoding_category_list;

/* Table of coding-systems currently assigned to each coding-category,
   with CODING_CATEGORY_IDX_MAX entries.  detect_coding_iso2022 ()
   reads the entries at CODING_CATEGORY_IDX_ISO_8_1 and
   CODING_CATEGORY_IDX_ISO_8_2 as symbols and uses their values.  */
Lisp_Object coding_category_table[CODING_CATEGORY_IDX_MAX];
 318
 319 /* Table of names of symbol for each coding-category.  */
 320 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 321   "coding-category-emacs-mule",
 322   "coding-category-sjis",
 323   "coding-category-iso-7",
 324   "coding-category-iso-8-1",
 325   "coding-category-iso-8-2",
 326   "coding-category-iso-7-else",
 327   "coding-category-iso-8-else",
 328   "coding-category-big5",
 329   "coding-category-raw-text",
 330   "coding-category-binary"
 331 };
 332
/* Flag to tell whether the unification table is looked up on
   character code conversion.  */
Lisp_Object Venable_character_unification;
/* Standard unification table to look up on decoding (reading).  */
Lisp_Object Vstandard_character_unification_table_for_decode;
/* Standard unification table to look up on encoding (writing).  */
Lisp_Object Vstandard_character_unification_table_for_encode;

/* Lisp objects related to unification tables; presumably the symbols
   their names suggest -- their initialization is not in this part of
   the file.  */
Lisp_Object Qcharacter_unification_table;
Lisp_Object Qcharacter_unification_table_for_decode;
Lisp_Object Qcharacter_unification_table_for_encode;

/* Alist of charsets vs. revision numbers.  */
Lisp_Object Vcharset_revision_alist;

/* Default coding systems used for process I/O.  */
Lisp_Object Vdefault_process_coding_system;
 350
 351 \f
 352 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 353
 354 /* Emacs' internal format for encoding multiple character sets is a
 355    kind of multi-byte encoding, i.e. characters are encoded by
 356    variable-length sequences of one-byte codes.  ASCII characters
 357    and control characters (e.g. `tab', `newline') are represented by
 358    one-byte sequences which are their ASCII codes, in the range 0x00
 359    through 0x7F.  The other characters are represented by a sequence
 360    of `base leading-code', optional `extended leading-code', and one
 361    or two `position-code's.  The length of the sequence is determined
 362    by the base leading-code.  Leading-code takes the range 0x80
 363    through 0x9F, whereas extended leading-code and position-code take
 364    the range 0xA0 through 0xFF.  See `charset.h' for more details
 365    about leading-code and position-code.
 366
 367    There's one exception to this rule.  Special leading-code
 368    `leading-code-composition' denotes that the following several
 369    characters should be composed into one character.  Leading-codes of
 370    components (except for ASCII) are added 0x20.  An ASCII character
 371    component is represented by a 2-byte sequence of `0xA0' and
 372    `ASCII-code + 0x80'.  See also the comments in `charset.h' for the
 373    details of composite character.  Hence, we can summarize the code
 374    range as follows:
 375
 376    --- CODE RANGE of Emacs' internal format ---
 377    (character set)      (range)
 378    ASCII                0x00 .. 0x7F
 379    ELSE (1st byte)      0x80 .. 0x9F
 380         (rest bytes)    0xA0 .. 0xFF
 381    ---------------------------------------------
 382
 383   */
 384
/* Class of each of the 256 possible byte values, indexed by the byte
   value itself.  detect_coding_emacs_mule () switches on these
   entries to decide how each byte of the text must be followed.  */
enum emacs_code_class_type emacs_code_class[256];
 386
 387 /* Go to the next statement only if *SRC is accessible and the code is
 388    greater than 0xA0.  */
 389 #define CHECK_CODE_RANGE_A0_FF  \
 390   do {                          \
 391     if (src >= src_end)         \
 392       goto label_end_of_switch; \
 393     else if (*src++ < 0xA0)     \
 394       return 0;                 \
 395   } while (0)
 396
 397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 398    Check if a text is encoded in Emacs' internal format.  If it is,
 399    return CODING_CATEGORY_MASK_EMASC_MULE, else return 0.  */
 400
 401 int
 402 detect_coding_emacs_mule (src, src_end)
 403      unsigned char *src, *src_end;
 404 {
 405   unsigned char c;
 406   int composing = 0;
 407
 408   while (src < src_end)
 409     {
 410       c = *src++;
 411
 412       if (composing)
 413         {
 414           if (c < 0xA0)
 415             composing = 0;
 416           else
 417             c -= 0x20;
 418         }
 419
 420       switch (emacs_code_class[c])
 421         {
 422         case EMACS_ascii_code:
 423         case EMACS_linefeed_code:
 424           break;
 425
 426         case EMACS_control_code:
 427           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 428             return 0;
 429           break;
 430
 431         case EMACS_invalid_code:
 432           return 0;
 433
 434         case EMACS_leading_code_composition: /* c == 0x80 */
 435           if (composing)
 436             CHECK_CODE_RANGE_A0_FF;
 437           else
 438             composing = 1;
 439           break;
 440
 441         case EMACS_leading_code_4:
 442           CHECK_CODE_RANGE_A0_FF;
 443           /* fall down to check it two more times ...  */
 444
 445         case EMACS_leading_code_3:
 446           CHECK_CODE_RANGE_A0_FF;
 447           /* fall down to check it one more time ...  */
 448
 449         case EMACS_leading_code_2:
 450           CHECK_CODE_RANGE_A0_FF;
 451           break;
 452
 453         default:
 454         label_end_of_switch:
 455           break;
 456         }
 457     }
 458   return CODING_CATEGORY_MASK_EMACS_MULE;
 459 }
 460
 461 \f
 462 /*** 3. ISO2022 handlers ***/
 463
 464 /* The following note describes the coding system ISO2022 briefly.
 465    Since the intention of this note is to help in understanding of
 466    the programs in this file, some parts are NOT ACCURATE or OVERLY
 467    SIMPLIFIED.  For the thorough understanding, please refer to the
 468    original document of ISO2022.
 469
 470    ISO2022 provides many mechanisms to encode several character sets
 471    in 7-bit and 8-bit environment.  If one chooses 7-bit environment,
 472    all text is encoded by codes of less than 128.  This may make the
 473    encoded text a little bit longer, but the text gets more stability
 474    to pass through several gateways (some of them strip off the MSB).
 475
 476    There are two kinds of character set: control character set and
 477    graphic character set.  The former contains control characters such
 478    as `newline' and `escape' to provide control functions (control
 479    functions are provided also by escape sequences).  The latter
 480    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 481    two control character sets and many graphic character sets.
 482
 483    Graphic character sets are classified into one of the following
 484    four classes, DIMENSION1_CHARS94, DIMENSION1_CHARS96,
 485    DIMENSION2_CHARS94, DIMENSION2_CHARS96 according to the number of
 486    bytes (DIMENSION) and the number of characters in one dimension
 487    (CHARS) of the set.  In addition, each character set is assigned an
 488    identification tag (called "final character" and denoted as <F>
 489    here after) which is unique in each class.  <F> of each character
 490    set is decided by ECMA(*) when it is registered in ISO.  Code range
 491    of <F> is 0x30..0x7F (0x30..0x3F are for private use only).
 492
 493    Note (*): ECMA = European Computer Manufacturers Association
 494
 495    Here are examples of graphic character set [NAME(<F>)]:
 496         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 497         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 498         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 499         o DIMENSION2_CHARS96 -- none for the moment
 500
 501    A code area (1byte=8bits) is divided into 4 areas, C0, GL, C1, and GR.
 502         C0 [0x00..0x1F] -- control character plane 0
 503         GL [0x20..0x7F] -- graphic character plane 0
 504         C1 [0x80..0x9F] -- control character plane 1
 505         GR [0xA0..0xFF] -- graphic character plane 1
 506
 507    A control character set is directly designated and invoked to C0 or
 508    C1 by an escape sequence.  The most common case is that ISO646's
 509    control character set is designated/invoked to C0 and ISO6429's
 510    control character set is designated/invoked to C1, and usually
 511    these designations/invocations are omitted in a coded text.  With
 512    7-bit environment, only C0 can be used, and a control character for
 513    C1 is encoded by an appropriate escape sequence to fit in the
 514    environment.  All control characters for C1 have
 515    corresponding escape sequences defined.
 516
 517    A graphic character set is at first designated to one of four
 518    graphic registers (G0 through G3), then these graphic registers are
 519    invoked to GL or GR.  These designations and invocations can be
 520    done independently.  The most common case is that G0 is invoked to
 521    GL, G1 is invoked to GR, and ASCII is designated to G0, and usually
 522    these invocations and designations are omitted in a coded text.
 523    With 7-bit environment, only GL can be used.
 524
 525    When a graphic character set of CHARS94 is invoked to GL, code 0x20
 526    and 0x7F of GL area work as control characters SPACE and DEL
 527    respectively, and code 0xA0 and 0xFF of GR area should not be used.
 528
 529    There are two ways of invocation: locking-shift and single-shift.
 530    With locking-shift, the invocation lasts until the next different
 531    invocation, whereas with single-shift, the invocation works only
 532    for the following character and doesn't affect locking-shift.
 533    Invocations are done by the following control characters or escape
 534    sequences.
 535
 536    ----------------------------------------------------------------------
 537    function             control char    escape sequence description
 538    ----------------------------------------------------------------------
 539    SI  (shift-in)               0x0F    none            invoke G0 to GL
 540    SO  (shift-out)              0x0E    none            invoke G1 to GL
 541    LS2 (locking-shift-2)        none    ESC 'n'         invoke G2 into GL
 542    LS3 (locking-shift-3)        none    ESC 'o'         invoke G3 into GL
 543    SS2 (single-shift-2)         0x8E    ESC 'N'         invoke G2 into GL
 544    SS3 (single-shift-3)         0x8F    ESC 'O'         invoke G3 into GL
 545    ----------------------------------------------------------------------
 546    The first four are for locking-shift.  Control characters for these
 547    functions are defined by macros ISO_CODE_XXX in `coding.h'.
 548
 549    Designations are done by the following escape sequences.
 550    ----------------------------------------------------------------------
 551    escape sequence      description
 552    ----------------------------------------------------------------------
 553    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 554    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 555    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 556    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 557    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 558    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 559    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 560    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 561    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 562    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 563    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 564    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 565    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 566    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 567    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 568    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 569    ----------------------------------------------------------------------
 570
 571    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 572    of dimension 1, chars 94, and final character <F>, and etc.
 573
 574    Note (*): Although these designations are not allowed in ISO2022,
 575    Emacs accepts them on decoding, and produces them on encoding
 576    CHARS96 character set in a coding system which is characterized as
 577    7-bit environment, non-locking-shift, and non-single-shift.
 578
 579    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 580    '(' can be omitted.  We call this "short-form" hereafter.
 581
 582    Now you may notice that there are a lot of ways for encoding the
 583    same multilingual text in ISO2022.  Actually, there exist many
 584    coding systems such as Compound Text (used in X's inter-client
 585    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
 586    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
 587    localized platforms), and all of these are variants of ISO2022.
 588
 589    In addition to the above, Emacs handles two more kinds of escape
 590    sequences: ISO6429's direction specification and Emacs' private
 591    sequence for specifying character composition.
 592
 593    ISO6429's direction specification takes the following format:
 594         o CSI ']'      -- end of the current direction
 595         o CSI '0' ']'  -- end of the current direction
 596         o CSI '1' ']'  -- start of left-to-right text
 597         o CSI '2' ']'  -- start of right-to-left text
 598    The control character CSI (0x9B: control sequence introducer) is
 599    abbreviated to the escape sequence ESC '[' in 7-bit environment.
 600
 601    Character composition specification takes the following format:
 602         o ESC '0' -- start character composition
 603         o ESC '1' -- end character composition
 604    Since these are not standard escape sequences of any ISO, the use
 605    of them for these meaning is restricted to Emacs only.  */
 606
/* Table of 256 entries of type `enum iso_code_class_type'.
   Presumably it gives the ISO2022 class of each byte value, by
   analogy with `emacs_code_class' -- its initialization and its uses
   are not in this part of the file.  */
enum iso_code_class_type iso_code_class[256];
 608
 609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 610    Check if a text is encoded in ISO2022.  If it is, returns an
 611    integer in which appropriate flag bits any of:
 612         CODING_CATEGORY_MASK_ISO_7
 613         CODING_CATEGORY_MASK_ISO_8_1
 614         CODING_CATEGORY_MASK_ISO_8_2
 615         CODING_CATEGORY_MASK_ISO_7_ELSE
 616         CODING_CATEGORY_MASK_ISO_8_ELSE
 617    are set.  If a code which should never appear in ISO2022 is found,
 618    returns 0.  */
 619
 620 int
 621 detect_coding_iso2022 (src, src_end)
 622      unsigned char *src, *src_end;
 623 {
 624   int mask = (CODING_CATEGORY_MASK_ISO_7
 625               | CODING_CATEGORY_MASK_ISO_8_1
 626               | CODING_CATEGORY_MASK_ISO_8_2
 627               | CODING_CATEGORY_MASK_ISO_7_ELSE
 628               | CODING_CATEGORY_MASK_ISO_8_ELSE
 629               );
 630   int g1 = 0;                   /* 1 iff designating to G1.  */
 631   int c, i;
 632   struct coding_system coding_iso_8_1, coding_iso_8_2;
 633
 634   /* Coding systems of these categories may accept latin extra codes.  */
 635   setup_coding_system
 636     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_1])->value,
 637      &coding_iso_8_1);
 638   setup_coding_system
 639     (XSYMBOL (coding_category_table[CODING_CATEGORY_IDX_ISO_8_2])->value,
 640      &coding_iso_8_2);
 641
 642   while (mask && src < src_end)
 643     {
 644       c = *src++;
 645       switch (c)
 646         {
 647         case ISO_CODE_ESC:
 648           if (src >= src_end)
 649             break;
 650           c = *src++;
 651           if ((c >= '(' && c <= '/'))
 652             {
 653               /* Designation sequence for a charset of dimension 1.  */
 654               if (src >= src_end)
 655                 break;
 656               c = *src++;
 657               if (c < ' ' || c >= 0x80)
 658                 /* Invalid designation sequence.  */
 659                 return 0;
 660             }
 661           else if (c == '$')
 662             {
 663               /* Designation sequence for a charset of dimension 2.  */
 664               if (src >= src_end)
 665                 break;
 666               c = *src++;
 667               if (c >= '@' && c <= 'B')
 668                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 669                 ;
 670               else if (c >= '(' && c <= '/')
 671                 {
 672                   if (src >= src_end)
 673                     break;
 674                   c = *src++;
 675                   if (c < ' ' || c >= 0x80)
 676                     /* Invalid designation sequence.  */
 677                     return 0;
 678                 }
 679               else
 680                 /* Invalid designation sequence.  */
 681                 return 0;
 682             }
 683           else if (c == 'N' || c == 'O' || c == 'n' || c == 'o')
 684             /* Locking shift.  */
 685             mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 686                      | CODING_CATEGORY_MASK_ISO_8_ELSE);
 687           else if (c == '0' || c == '1' || c == '2')
 688             /* Start/end composition.  */
 689             ;
 690           else
 691             /* Invalid escape sequence.  */
 692             return 0;
 693           break;
 694
 695         case ISO_CODE_SO:
 696           mask &= (CODING_CATEGORY_MASK_ISO_7_ELSE
 697                    | CODING_CATEGORY_MASK_ISO_8_ELSE);
 698           break;
 699
 700         case ISO_CODE_CSI:
 701         case ISO_CODE_SS2:
 702         case ISO_CODE_SS3:
 703           {
 704             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
 705
 706             if (VECTORP (Vlatin_extra_code_table)
 707                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 708               {
 709                 if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 710                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 711                 if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 712                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 713               }
 714             mask &= newmask;
 715           }
 716           break;
 717
 718         default:
 719           if (c < 0x80)
 720             break;
 721           else if (c < 0xA0)
 722             {
 723               if (VECTORP (Vlatin_extra_code_table)
 724                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
 725                 {
 726                   int newmask = 0;
 727
 728                   if (coding_iso_8_1.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 729                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
 730                   if (coding_iso_8_2.flags & CODING_FLAG_ISO_LATIN_EXTRA)
 731                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
 732                   mask &= newmask;
 733                 }
 734               else
 735                 return 0;
 736             }
 737           else
 738             {
 739               unsigned char *src_begin = src;
 740
 741               mask &= ~(CODING_CATEGORY_MASK_ISO_7
 742                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
 743               while (src < src_end && *src >= 0xA0)
 744                 src++;
 745               if ((src - src_begin - 1) & 1 && src < src_end)
 746                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
 747             }
 748           break;
 749         }
 750     }
 751
 752   return mask;
 753 }
 754
 755 /* Decode a character of which charset is CHARSET and the 1st position
 756    code is C1.  If dimension of CHARSET is 2, the 2nd position code is
 757    fetched from SRC and set to C2.  If CHARSET is negative, it means
 758    that we are decoding ill formed text, and what we can do is just to
 759    read C1 as is.  */
 760
 761 #define DECODE_ISO_CHARACTER(charset, c1)                               \
 762   do {                                                                  \
 763     int c_alt, charset_alt = (charset);                                 \
 764     if (COMPOSING_HEAD_P (coding->composing))                           \
 765       {                                                                 \
 766         *dst++ = LEADING_CODE_COMPOSITION;                              \
 767         if (COMPOSING_WITH_RULE_P (coding->composing))                  \
 768           /* To tell composition rules are embeded.  */                 \
 769           *dst++ = 0xFF;                                                \
 770         coding->composing += 2;                                         \
 771       }                                                                 \
 772     if ((charset) >= 0)                                                 \
 773       {                                                                 \
 774         if (CHARSET_DIMENSION (charset) == 2)                           \
 775           ONE_MORE_BYTE (c2);                                           \
 776         if (!NILP (unification_table)                                   \
 777             && ((c_alt = unify_char (unification_table,                 \
 778                                      -1, (charset), c1, c2)) >= 0))     \
 779           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
 780       }                                                                 \
 781     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
 782       DECODE_CHARACTER_ASCII (c1);                                      \
 783     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
 784       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
 785     else                                                                \
 786       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
 787     if (COMPOSING_WITH_RULE_P (coding->composing))                      \
 788       /* To tell a composition rule follows.  */                        \
 789       coding->composing = COMPOSING_WITH_RULE_RULE;                     \
 790   } while (0)
 791
 792 /* Set designation state into CODING.  */
 793 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)           \
 794   do {                                                                  \
 795     int charset = ISO_CHARSET_TABLE (make_number (dimension),           \
 796                                      make_number (chars),               \
 797                                      make_number (final_char));         \
 798     if (charset >= 0)                                                   \
 799       {                                                                 \
 800         if (coding->direction == 1                                      \
 801             && CHARSET_REVERSE_CHARSET (charset) >= 0)                  \
 802           charset = CHARSET_REVERSE_CHARSET (charset);                  \
 803         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;            \
 804       }                                                                 \
 805   } while (0)
 806
 807 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 808
 809 int
 810 decode_coding_iso2022 (coding, source, destination,
 811                        src_bytes, dst_bytes, consumed)
 812      struct coding_system *coding;
 813      unsigned char *source, *destination;
 814      int src_bytes, dst_bytes;
 815      int *consumed;
 816 {
 817   unsigned char *src = source;
 818   unsigned char *src_end = source + src_bytes;
 819   unsigned char *dst = destination;
 820   unsigned char *dst_end = destination + dst_bytes;
 821   /* Since the maximum bytes produced by each loop is 7, we subtract 6
 822      from DST_END to assure that overflow checking is necessary only
 823      at the head of loop.  */
 824   unsigned char *adjusted_dst_end = dst_end - 6;
 825   int charset;
 826   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
 827   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 828   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
 829   Lisp_Object unification_table
 830       = coding->character_unification_table_for_decode;
 831
 832   if (!NILP (Venable_character_unification) && NILP (unification_table))
 833     unification_table = Vstandard_character_unification_table_for_decode;
 834
 835   while (src < src_end && dst < adjusted_dst_end)
 836     {
 837       /* SRC_BASE remembers the start position in source in each loop.
 838          The loop will be exited when there's not enough source text
 839          to analyze long escape sequence or 2-byte code (within macros
 840          ONE_MORE_BYTE or TWO_MORE_BYTES).  In that case, SRC is reset
 841          to SRC_BASE before exiting.  */
 842       unsigned char *src_base = src;
 843       int c1 = *src++, c2;
 844
 845       switch (iso_code_class [c1])
 846         {
 847         case ISO_0x20_or_0x7F:
 848           if (!coding->composing
 849               && (charset0 < 0 || CHARSET_CHARS (charset0) == 94))
 850             {
 851               /* This is SPACE or DEL.  */
 852               *dst++ = c1;
 853               break;
 854             }
 855           /* This is a graphic character, we fall down ...  */
 856
 857         case ISO_graphic_plane_0:
 858           if (coding->composing == COMPOSING_WITH_RULE_RULE)
 859             {
 860               /* This is a composition rule.  */
 861               *dst++ = c1 | 0x80;
 862               coding->composing = COMPOSING_WITH_RULE_TAIL;
 863             }
 864           else
 865             DECODE_ISO_CHARACTER (charset0, c1);
 866           break;
 867
 868         case ISO_0xA0_or_0xFF:
 869           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94)
 870             {
 871               /* Invalid code.  */
 872               *dst++ = c1;
 873               break;
 874             }
 875           /* This is a graphic character, we fall down ... */
 876
 877         case ISO_graphic_plane_1:
 878           DECODE_ISO_CHARACTER (charset1, c1);
 879           break;
 880
 881         case ISO_control_code:
 882           /* All ISO2022 control characters in this class have the
 883              same representation in Emacs internal format.  */
 884           *dst++ = c1;
 885           break;
 886
 887         case ISO_carriage_return:
 888           if (coding->eol_type == CODING_EOL_CR)
 889             {
 890               *dst++ = '\n';
 891             }
 892           else if (coding->eol_type == CODING_EOL_CRLF)
 893             {
 894               ONE_MORE_BYTE (c1);
 895               if (c1 == ISO_CODE_LF)
 896                 *dst++ = '\n';
 897               else
 898                 {
 899                   src--;
 900                   *dst++ = c1;
 901                 }
 902             }
 903           else
 904             {
 905               *dst++ = c1;
 906             }
 907           break;
 908
 909         case ISO_shift_out:
 910           if (CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
 911             goto label_invalid_escape_sequence;
 912           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
 913           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 914           break;
 915
 916         case ISO_shift_in:
 917           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
 918           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 919           break;
 920
 921         case ISO_single_shift_2_7:
 922         case ISO_single_shift_2:
 923           /* SS2 is handled as an escape sequence of ESC 'N' */
 924           c1 = 'N';
 925           goto label_escape_sequence;
 926
 927         case ISO_single_shift_3:
 928           /* SS2 is handled as an escape sequence of ESC 'O' */
 929           c1 = 'O';
 930           goto label_escape_sequence;
 931
 932         case ISO_control_sequence_introducer:
 933           /* CSI is handled as an escape sequence of ESC '[' ...  */
 934           c1 = '[';
 935           goto label_escape_sequence;
 936
 937         case ISO_escape:
 938           ONE_MORE_BYTE (c1);
 939         label_escape_sequence:
 940           /* Escape sequences handled by Emacs are invocation,
 941              designation, direction specification, and character
 942              composition specification.  */
 943           switch (c1)
 944             {
 945             case '&':           /* revision of following character set */
 946               ONE_MORE_BYTE (c1);
 947               if (!(c1 >= '@' && c1 <= '~'))
 948                 goto label_invalid_escape_sequence;
 949               ONE_MORE_BYTE (c1);
 950               if (c1 != ISO_CODE_ESC)
 951                 goto label_invalid_escape_sequence;
 952               ONE_MORE_BYTE (c1);
 953               goto label_escape_sequence;
 954
 955             case '$':           /* designation of 2-byte character set */
 956               ONE_MORE_BYTE (c1);
 957               if (c1 >= '@' && c1 <= 'B')
 958                 {       /* designation of JISX0208.1978, GB2312.1980,
 959                                    or JISX0208.1980 */
 960                   DECODE_DESIGNATION (0, 2, 94, c1);
 961                 }
 962               else if (c1 >= 0x28 && c1 <= 0x2B)
 963                 {       /* designation of DIMENSION2_CHARS94 character set */
 964                   ONE_MORE_BYTE (c2);
 965                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
 966                 }
 967               else if (c1 >= 0x2C && c1 <= 0x2F)
 968                 {       /* designation of DIMENSION2_CHARS96 character set */
 969                   ONE_MORE_BYTE (c2);
 970                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
 971                 }
 972               else
 973                 goto label_invalid_escape_sequence;
 974               break;
 975
 976             case 'n':           /* invocation of locking-shift-2 */
 977               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 978                 goto label_invalid_escape_sequence;
 979               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
 980               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 981               break;
 982
 983             case 'o':           /* invocation of locking-shift-3 */
 984               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
 985                 goto label_invalid_escape_sequence;
 986               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
 987               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
 988               break;
 989
 990             case 'N':           /* invocation of single-shift-2 */
 991               if (CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
 992                 goto label_invalid_escape_sequence;
 993               ONE_MORE_BYTE (c1);
 994               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
 995               DECODE_ISO_CHARACTER (charset, c1);
 996               break;
 997
 998             case 'O':           /* invocation of single-shift-3 */
 999               if (CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1000                 goto label_invalid_escape_sequence;
1001               ONE_MORE_BYTE (c1);
1002               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1003               DECODE_ISO_CHARACTER (charset, c1);
1004               break;
1005
1006             case '0':           /* start composing without embeded rules */
1007               coding->composing = COMPOSING_NO_RULE_HEAD;
1008               break;
1009
1010             case '1':           /* end composing */
1011               coding->composing = COMPOSING_NO;
1012               break;
1013
1014             case '2':           /* start composing with embeded rules */
1015               coding->composing = COMPOSING_WITH_RULE_HEAD;
1016               break;
1017
1018             case '[':           /* specification of direction */
1019               /* For the moment, nested direction is not supported.
1020                  So, the value of `coding->direction' is 0 or 1: 0
1021                  means left-to-right, 1 means right-to-left.  */
1022               ONE_MORE_BYTE (c1);
1023               switch (c1)
1024                 {
1025                 case ']':       /* end of the current direction */
1026                   coding->direction = 0;
1027
1028                 case '0':       /* end of the current direction */
1029                 case '1':       /* start of left-to-right direction */
1030                   ONE_MORE_BYTE (c1);
1031                   if (c1 == ']')
1032                     coding->direction = 0;
1033                   else
1034                     goto label_invalid_escape_sequence;
1035                   break;
1036
1037                 case '2':       /* start of right-to-left direction */
1038                   ONE_MORE_BYTE (c1);
1039                   if (c1 == ']')
1040                     coding->direction= 1;
1041                   else
1042                     goto label_invalid_escape_sequence;
1043                   break;
1044
1045                 default:
1046                   goto label_invalid_escape_sequence;
1047                 }
1048               break;
1049
1050             default:
1051               if (c1 >= 0x28 && c1 <= 0x2B)
1052                 {       /* designation of DIMENSION1_CHARS94 character set */
1053                   ONE_MORE_BYTE (c2);
1054                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1055                 }
1056               else if (c1 >= 0x2C && c1 <= 0x2F)
1057                 {       /* designation of DIMENSION1_CHARS96 character set */
1058                   ONE_MORE_BYTE (c2);
1059                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1060                 }
1061               else
1062                 {
1063                   goto label_invalid_escape_sequence;
1064                 }
1065             }
1066           /* We must update these variables now.  */
1067           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1068           charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1069           break;
1070
1071         label_invalid_escape_sequence:
1072           {
1073             int length = src - src_base;
1074
1075             bcopy (src_base, dst, length);
1076             dst += length;
1077           }
1078         }
1079       continue;
1080
1081     label_end_of_loop:
1082       coding->carryover_size = src - src_base;
1083       bcopy (src_base, coding->carryover, coding->carryover_size);
1084       src = src_base;
1085       break;
1086     }
1087
1088   /* If this is the last block of the text to be decoded, we had
1089      better just flush out all remaining codes in the text although
1090      they are not valid characters.  */
1091   if (coding->last_block)
1092     {
1093       bcopy (src, dst, src_end - src);
1094       dst += (src_end - src);
1095       src = src_end;
1096     }
1097   *consumed = src - source;
1098   return dst - destination;
1099 }
1100
1101 /* ISO2022 encoding stuff.  */
1102
1103 /*
1104    It is not enough to say just "ISO2022" on encoding, we have to
1105    specify more details.  In Emacs, each coding-system of ISO2022
1106    variant has the following specifications:
1107         1. Initial designation to G0 thru G3.
1108         2. Allows short-form designation?
1109         3. ASCII should be designated to G0 before control characters?
1110         4. ASCII should be designated to G0 at end of line?
1111         5. 7-bit environment or 8-bit environment?
1112         6. Use locking-shift?
1113         7. Use Single-shift?
1114    And the following two are only for Japanese:
1115         8. Use ASCII in place of JIS0201-1976-Roman?
1116         9. Use JISX0208-1983 in place of JISX0208-1978?
1117    These specifications are encoded in `coding->flags' as flag bits
1118    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1119    details.
1120 */
1121
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the new designation in CODING.

   When CHARSET has an entry in `Vcharset_revision_alist', the
   sequence is preceded by ESC '&' and the revision code.  For a
   two-byte 94-character set whose <final-char> is '@', 'A', or 'B',
   designated to register 0, the intermediate character is omitted
   (short form) if CODING has CODING_FLAG_ISO_SHORT_FORM set.
   Expects `dst' to be in scope.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    Lisp_Object revision                                                \
      = Fassq (make_number (charset), Vcharset_revision_alist);         \
    int chars94_p = (CHARSET_CHARS (charset) == 94);                    \
    /* Intermediate characters, indexed by graphic register.  */        \
    char *intermediates = chars94_p ? "()*+" : ",-./";                  \
    if (! NILP (revision))                                              \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = XINT (XCONS (revision)->cdr) + '@';                    \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      *dst++ = (unsigned char) (intermediates[reg]);                    \
    else                                                                \
      {                                                                 \
        *dst++ = '$';                                                   \
        /* Only the short form drops the intermediate character.  */    \
        if (! chars94_p                                                 \
            || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)           \
            || reg != 0                                                 \
            || final_char < '@' || final_char > 'B')                    \
          *dst++ = (unsigned char) (intermediates[reg]);                \
      }                                                                 \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1164
/* The following two macros emit at DST the code for the ISO2022
   single-shift functions (single-shift-2 and single-shift-3): the
   two-byte escape sequence when CODING has CODING_FLAG_ISO_SEVEN_BITS
   set, the single control character otherwise.  Both then mark in
   CODING that a single shift is pending.  */

#define ENCODE_SINGLE_SHIFT_2                           \
  do {                                                  \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))  \
      *dst++ = ISO_CODE_SS2;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'N';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                           \
  do {                                                  \
    if (!(coding->flags & CODING_FLAG_ISO_SEVEN_BITS))  \
      *dst++ = ISO_CODE_SS3;                            \
    else                                                \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = 'O';                                   \
      }                                                 \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
  } while (0)
1186
/* The following four macros emit at DST the code (control character
   or escape sequence) for the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3), and
   record in CODING which graphic register (0 thru 3) is now invoked
   to graphic plane 0.  */

#define ENCODE_SHIFT_IN                         \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
    *dst++ = ISO_CODE_SI;                       \
  } while (0)

#define ENCODE_SHIFT_OUT                        \
  do {                                          \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
    *dst++ = ISO_CODE_SO;                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                  \
  do {                                          \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'n';                               \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                  \
  do {                                          \
    *dst++ = ISO_CODE_ESC;                      \
    *dst++ = 'o';                               \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
  } while (0)
1214
/* Emit at DST the code for a DIMENSION1 character of CHARSET whose
   position-code is C1.  If CHARSET is not yet invoked to a graphic
   plane, the necessary designation and invocation sequences are
   emitted first.

   The body is a loop that is left with `break' as soon as the
   character (or its substitute) has been written; the only path that
   repeats is the one that has just emitted a designation/invocation
   for CHARSET.  Expects `coding' and `dst' to be in scope.  */


#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single shift is pending; it applies to this character      \
           only.  */                                                    \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        *dst++ = c1 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        *dst++ = c1 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])        \
      {                                                                 \
        /* This character must not be encoded; write one substitution   \
           character per column of its width instead.  */               \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: designate       \
       and/or invoke it, then go around again to write the              \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1258
/* Emit at DST the codes for a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2.  If CHARSET is not yet invoked to a
   graphic plane, the necessary designation and invocation sequences
   are emitted first.

   The body is a loop that is left with `break' as soon as the
   character (or its substitute) has been written; the only path that
   repeats is the one that has just emitted a designation/invocation
   for CHARSET.  Expects `coding' and `dst' to be in scope.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* A single shift is pending; it applies to this character      \
           only.  */                                                    \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))           \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
        break;                                                          \
      }                                                                 \
    if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))           \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
        break;                                                          \
      }                                                                 \
    if ((coding->flags & CODING_FLAG_ISO_SAFE)                          \
        && !CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset])        \
      {                                                                 \
        /* This character must not be encoded; write one substitution   \
           character per column of its width instead.  */               \
        *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;                 \
        if (CHARSET_WIDTH (charset) == 2)                               \
          *dst++ = CODING_INHIBIT_CHARACTER_SUBSTITUTION;               \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to any graphic plane yet: designate       \
       and/or invoke it, then go around again to write the              \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst);         \
  } while (1)
1301
/* Emit at DST the ISO2022 code for the character of CHARSET whose
   position-codes are C1 and C2.  If `unification_table' is non-nil
   and unify_char maps the character to another one, C1 and C2 are
   overwritten and that character is encoded instead.  The
   one- or two-byte encoder is chosen by the dimension of the
   resulting charset.  Expects `coding', `dst' and `unification_table'
   to be in scope.  */
#define ENCODE_ISO_CHARACTER(charset, c1, c2)                             \
  do {                                                                    \
    int unified_char;                                                     \
    int final_charset = charset;                                          \
    if (!NILP (unification_table))                                        \
      {                                                                   \
        unified_char                                                      \
          = unify_char (unification_table, -1, charset, c1, c2);          \
        if (unified_char >= 0)                                            \
          {                                                               \
            SPLIT_CHAR (unified_char, final_charset, c1, c2);             \
          }                                                               \
      }                                                                   \
    if (CHARSET_DIMENSION (final_charset) != 1)                           \
      ENCODE_ISO_CHARACTER_DIMENSION2 (final_charset, c1, c2);            \
    else                                                                  \
      ENCODE_ISO_CHARACTER_DIMENSION1 (final_charset, c1);                \
  } while (0)
1316
1317 /* Write at DST the designation and invocation codes that must precede
1318    a character of CHARSET.  The element `spec.iso2022' of *CODING is
1319    updated accordingly.  Return new DST.  */
1320
1321 unsigned char *
1322 encode_invocation_designation (charset, coding, dst)
1323      int charset;
1324      struct coding_system *coding;
1325      unsigned char *dst;
1326 {
1327   int reg = 0;                  /* graphic register number */
1328
1329   /* First see whether some graphic register already holds CHARSET.  */
1330   while (reg < 4
1331          && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
1332     reg++;
1333
1334   if (reg >= 4)
1335     {
1336       /* No graphic register holds CHARSET yet, so designate it now:
1337          to the register requested for it or, when it requests no
1338          special designation, to graphic register 0.  */
1339       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1340       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1341         reg = 0;
1342
1343       ENCODE_DESIGNATION (charset, reg, coding);
1344     }
1345
1346   /* Nothing more to do if REG is already invoked to graphic plane 0
1347      or 1.  */
1348   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
1349       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
1350     return dst;
1351
1352   /* Otherwise invoke REG: shift-in or shift-out for registers 0 and
1353      1; for registers 2 and 3 a single shift when the coding system
1354      has CODING_FLAG_ISO_SINGLE_SHIFT set, else a locking shift.  */
1355   if (reg == 0)
1356     {
1357       ENCODE_SHIFT_IN;
1358     }
1359   else if (reg == 1)
1360     {
1361       ENCODE_SHIFT_OUT;
1362     }
1363   else if (reg == 2)
1364     {
1365       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1366         ENCODE_SINGLE_SHIFT_2;
1367       else
1368         ENCODE_LOCKING_SHIFT_2;
1369     }
1370   else if (reg == 3)
1371     {
1372       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1373         ENCODE_SINGLE_SHIFT_3;
1374       else
1375         ENCODE_LOCKING_SHIFT_3;
1376     }
1377   return dst;
1378 }
1379
1380 /* The following three macros produce codes for indicating composition.  */
1381 #define ENCODE_COMPOSITION_NO_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '0')
1382 #define ENCODE_COMPOSITION_WITH_RULE_START  (*dst++ = ISO_CODE_ESC, *dst++ = '2')
1383 #define ENCODE_COMPOSITION_END    (*dst++ = ISO_CODE_ESC, *dst++ = '1')
1384
1385 /* The following three macros produce codes for indicating direction
1386    of text; the introducer is `ESC [' under 7-bit coding, else CSI.  */
1387 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
1388   do {                                                  \
1389     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
1390       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
1391     else                                                \
1392       *dst++ = ISO_CODE_CSI;                            \
1393   } while (0)
1394
1395 #define ENCODE_DIRECTION_R2L    \
1396   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2', *dst++ = ']'; } while (0)
1397
1398 #define ENCODE_DIRECTION_L2R    \
1399   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0', *dst++ = ']'; } while (0)
1400
1401 /* Produce codes to reset the graphic planes and registers to initial
1402    state: shift-in if needed, then redo each changed designation.  */
1403 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
1404   do {                                                                      \
1405     int reg;                    /* graphic register number */               \
1406     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
1407       ENCODE_SHIFT_IN;                                                      \
1408     for (reg = 0; reg < 4; reg++)                                           \
1409       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
1410           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
1411               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
1412         ENCODE_DESIGNATION                                                  \
1413           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
1414   } while (0)
1415
1416 /* Produce designation sequences of charsets in the line started from
1417    *SRC to a place pointed by DSTP, and update *DSTP.  Only the first
1418    charset requesting each graphic register is designated; charsets
1419    requesting no register are skipped.  If the current block ends before
1420    any end-of-line, we may fail to find all the necessary designations.  */
1421 encode_designation_at_bol (coding, table, src, src_end, dstp)
1422      struct coding_system *coding;
1423      Lisp_Object table;
1424      unsigned char *src, *src_end, **dstp;
1425 {
1426   int charset, c, found = 0, reg;
1427   /* Table of charsets to be designated to each graphic register.  */
1428   int r[4];
1429   unsigned char *dst = *dstp;
1430
1431   for (reg = 0; reg < 4; reg++)
1432     r[reg] = -1;
1433
1434   while (src < src_end && *src != '\n' && found < 4)
1435     {
1436       int bytes = BYTES_BY_CHAR_HEAD (*src);
1437
1438       if (NILP (table))
1439         charset = CHARSET_AT (src);
1440       else
1441         {
1442           int c_alt, c1, c2;
1443
1444           SPLIT_STRING(src, bytes, charset, c1, c2);
1445           if ((c_alt = unify_char (table, -1, charset, c1, c2)) >= 0)
1446             charset = CHAR_CHARSET (c_alt);
1447         }
1448       /* A charset requesting no register claims no slot of R.  */
1449       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1450       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
1451         {
1452           found++;
1453           r[reg] = charset;
1454         }
1455
1456       src += bytes;
1457     }
1458
1459   if (found)
1460     {
1461       for (reg = 0; reg < 4; reg++)
1462         if (r[reg] >= 0
1463             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
1464           ENCODE_DESIGNATION (r[reg], reg, coding);
1465       *dstp = dst;
1466     }
1467 }
1468
1469 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1470    Encode text in Emacs' internal format into ISO2022.  */
1471 int
1472 encode_coding_iso2022 (coding, source, destination,
1473                        src_bytes, dst_bytes, consumed)
1474      struct coding_system *coding;
1475      unsigned char *source, *destination;
1476      int src_bytes, dst_bytes;
1477      int *consumed;
1478 {
1479   unsigned char *src = source;
1480   unsigned char *src_end = source + src_bytes;
1481   unsigned char *dst = destination;
1482   unsigned char *dst_end = destination + dst_bytes;
1483   /* Since the maximum bytes produced by each loop is 20, we subtract 19
1484      from DST_END to assure overflow checking is necessary only at the
1485      head of loop.  */
1486   unsigned char *adjusted_dst_end = dst_end - 19;
1487   Lisp_Object unification_table
1488       = coding->character_unification_table_for_encode;
1489
1490   if (!NILP (Venable_character_unification) && NILP (unification_table))
1491     unification_table = Vstandard_character_unification_table_for_encode;
1492
1493   while (src < src_end && dst < adjusted_dst_end)
1494     {
1495       /* SRC_BASE remembers the start position in source in each loop.
1496          The loop will be exited when there's not enough source text
1497          to analyze multi-byte codes (within macros ONE_MORE_BYTE,
1498          TWO_MORE_BYTES, and THREE_MORE_BYTES).  In that case, the text
1499          from SRC_BASE on is saved as carryover before exiting.  */
1500       unsigned char *src_base = src;
1501       int charset, c1, c2, c3, c4;
1502
1503       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
1504           && CODING_SPEC_ISO_BOL (coding))
1505         {
1506           /* We have to produce designation sequences if any now.  */
1507           encode_designation_at_bol (coding, unification_table,
1508                                      src, src_end, &dst);
1509           CODING_SPEC_ISO_BOL (coding) = 0;
1510         }
1511
1512       c1 = *src++;
1513       /* If we are seeing a component of a composite character, we are
1514          seeing a leading-code specially encoded for composition, or a
1515          composition rule if composing with rule.  We must set C1
1516          to a normal leading-code or an ASCII code.  If we are not at
1517          a composed character, we must reset the composition state.  */
1518       if (COMPOSING_P (coding->composing))
1519         {
1520           if (c1 < 0xA0)
1521             {
1522               /* We are not in a composite character any longer.  */
1523               coding->composing = COMPOSING_NO;
1524               ENCODE_COMPOSITION_END;
1525             }
1526           else
1527             {
1528               if (coding->composing == COMPOSING_WITH_RULE_RULE)
1529                 {
1530                   *dst++ = c1 & 0x7F;
1531                   coding->composing = COMPOSING_WITH_RULE_HEAD;
1532                   continue;
1533                 }
1534               else if (coding->composing == COMPOSING_WITH_RULE_HEAD)
1535                 coding->composing = COMPOSING_WITH_RULE_RULE;
1536               if (c1 == 0xA0)
1537                 {
1538                   /* This is an ASCII component.  */
1539                   ONE_MORE_BYTE (c1);
1540                   c1 &= 0x7F;
1541                 }
1542               else
1543                 /* This is a leading-code of non ASCII component.  */
1544                 c1 -= 0x20;
1545             }
1546         }
1547
1548       /* Now encode one character.  C1 is a control character, an
1549          ASCII character, or a leading-code of multi-byte character.  */
1550       switch (emacs_code_class[c1])
1551         {
1552         case EMACS_ascii_code:
1553           ENCODE_ISO_CHARACTER (CHARSET_ASCII, c1, /* dummy */ c2);
1554           break;
1555
1556         case EMACS_control_code:
1557           if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1558             ENCODE_RESET_PLANE_AND_REGISTER;
1559           *dst++ = c1;
1560           break;
1561
1562         case EMACS_carriage_return_code:
1563           if (!coding->selective)
1564             {
1565               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
1566                 ENCODE_RESET_PLANE_AND_REGISTER;
1567               *dst++ = c1;
1568               break;
1569             }
1570           /* fall down to treat '\r' as '\n' ...  */
1571
1572         case EMACS_linefeed_code:
1573           if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
1574             ENCODE_RESET_PLANE_AND_REGISTER;
1575           if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
1576             bcopy (coding->spec.iso2022.initial_designation,
1577                    coding->spec.iso2022.current_designation,
1578                    sizeof coding->spec.iso2022.initial_designation);
1579           if (coding->eol_type == CODING_EOL_LF
1580               || coding->eol_type == CODING_EOL_UNDECIDED)
1581             *dst++ = ISO_CODE_LF;
1582           else if (coding->eol_type == CODING_EOL_CRLF)
1583             *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
1584           else
1585             *dst++ = ISO_CODE_CR;
1586           CODING_SPEC_ISO_BOL (coding) = 1;
1587           break;
1588
1589         case EMACS_leading_code_2:
1590           ONE_MORE_BYTE (c2);
1591           if (c2 < 0xA0)
1592             {
1593               /* invalid sequence */
1594               *dst++ = c1;
1595               *dst++ = c2;
1596             }
1597           else
1598             ENCODE_ISO_CHARACTER (c1, c2, /* dummy */ c3);
1599           break;
1600
1601         case EMACS_leading_code_3:
1602           TWO_MORE_BYTES (c2, c3);
1603           if (c2 < 0xA0 || c3 < 0xA0)
1604             {
1605               /* invalid sequence */
1606               *dst++ = c1;
1607               *dst++ = c2;
1608               *dst++ = c3;
1609             }
1610           else if (c1 < LEADING_CODE_PRIVATE_11)
1611             ENCODE_ISO_CHARACTER (c1, c2, c3);
1612           else
1613             ENCODE_ISO_CHARACTER (c2, c3, /* dummy */ c4);
1614           break;
1615
1616         case EMACS_leading_code_4:
1617           THREE_MORE_BYTES (c2, c3, c4);
1618           if (c2 < 0xA0 || c3 < 0xA0 || c4 < 0xA0)
1619             {
1620               /* invalid sequence */
1621               *dst++ = c1;
1622               *dst++ = c2;
1623               *dst++ = c3;
1624               *dst++ = c4;
1625             }
1626           else
1627             ENCODE_ISO_CHARACTER (c2, c3, c4);
1628           break;
1629
1630         case EMACS_leading_code_composition:
1631           ONE_MORE_BYTE (c2);
1632           if (c2 < 0xA0)
1633             {
1634               /* invalid sequence */
1635               *dst++ = c1;
1636               *dst++ = c2;
1637             }
1638           else if (c2 == 0xFF)
1639             {
1640               coding->composing = COMPOSING_WITH_RULE_HEAD;
1641               ENCODE_COMPOSITION_WITH_RULE_START;
1642             }
1643           else
1644             {
1645               /* Rewind one byte because it is a character code of
1646                  composition elements.  */
1647               src--;
1648               coding->composing = COMPOSING_NO_RULE_HEAD;
1649               ENCODE_COMPOSITION_NO_RULE_START;
1650             }
1651           break;
1652
1653         case EMACS_invalid_code:
1654           *dst++ = c1;
1655           break;
1656         }
1657       continue;
1658     label_end_of_loop:
1659       /* We reach here because the source data ends not at character
1660          boundary.  */
1661       coding->carryover_size = src_end - src_base;
1662       bcopy (src_base, coding->carryover, coding->carryover_size);
1663       src = src_end;
1664       break;
1665     }
1666
1667   /* If this is the last block of the text to be encoded, we must
1668      reset graphic planes and registers to the initial state.  */
1669   if (src >= src_end && coding->last_block)
1670     {
1671       ENCODE_RESET_PLANE_AND_REGISTER;
1672       if (coding->carryover_size > 0
1673           && coding->carryover_size < (dst_end - dst))
1674         {
1675           bcopy (coding->carryover, dst, coding->carryover_size);
1676           dst += coding->carryover_size;
1677           coding->carryover_size = 0;
1678         }
1679     }
1680   *consumed = src - source;
1681   return dst - destination;
1682 }
1683
1684 \f
1685 /*** 4. SJIS and BIG5 handlers ***/
1686
1687 /* Although SJIS and BIG5 are not ISO's coding system, they are used
1688    quite widely.  So, for the moment, Emacs supports them in the bare
1689    C code.  But, in the future, they may be supported only by CCL.  */
1690
1691 /* SJIS is a coding system encoding three character sets: ASCII, right
1692    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
1693    as is.  A character of charset katakana-jisx0201 is encoded by
1694    "position-code + 0x80".  A character of charset japanese-jisx0208
1695    is encoded in two bytes, its two position-codes being divided and
1696    shifted so that they fit in the ranges below.
1697
1698    --- CODE RANGE of SJIS ---
1699    (character set)      (range)
1700    ASCII                0x00 .. 0x7F
1701    KATAKANA-JISX0201    0xA0 .. 0xDF
1702    JISX0208 (1st byte)  0x80 .. 0x9F and 0xE0 .. 0xFF
1703             (2nd byte)  0x40 .. 0xFF
1704    -------------------------------
1705
1706 */
1707
1708 /* BIG5 is a coding system encoding two character sets: ASCII and
1709    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
1710    character set and is encoded in two-byte.
1711
1712    --- CODE RANGE of BIG5 ---
1713    (character set)      (range)
1714    ASCII                0x00 .. 0x7F
1715    Big5 (1st byte)      0xA1 .. 0xFE
1716         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
1717    --------------------------
1718
1719    Since the number of characters in Big5 is larger than maximum
1720    characters in Emacs' charset (96x96), it can't be handled as one
1721    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
1722    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
1723    contains frequently used characters and the latter contains less
1724    frequently used characters.  */
1725
1726 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
1727    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
1728    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
1729    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
1730
1731 /* Number of Big5 characters which have the same code in 1st byte.  */
1732 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
1733
1734 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
1735   do {                                                                  \
1736     unsigned int temp                                                   \
1737       = ((b1) - 0xA1) * BIG5_SAME_ROW + (b2) - ((b2) < 0x7F ? 0x40 : 0x62); \
1738     if ((b1) < 0xC9)                                                    \
1739       (charset) = charset_big5_1;                                       \
1740     else                                                                \
1741       {                                                                 \
1742         (charset) = charset_big5_2;                                     \
1743         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
1744       }                                                                 \
1745     (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
1746     (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
1747   } while (0)
1748
1749 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
1750   do {                                                                  \
1751     unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
1752     if ((charset) == charset_big5_2)                                    \
1753       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
1754     (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
1755     (b2) = temp % BIG5_SAME_ROW;                                        \
1756     (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
1757   } while (0)
1758 /* Decode (CHARSET, C1, C2), applying `unification_table' first if non-nil.  */
1759 #define DECODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                     \
1760   do {                                                                  \
1761     int c_alt, charset_alt = (charset);                                 \
1762     if (!NILP (unification_table)                                       \
1763         && ((c_alt = unify_char (unification_table,                     \
1764                                  -1, (charset), c1, c2)) >= 0))         \
1765           SPLIT_CHAR (c_alt, charset_alt, c1, c2);                      \
1766     if (charset_alt == CHARSET_ASCII || charset_alt < 0)                \
1767       DECODE_CHARACTER_ASCII (c1);                                      \
1768     else if (CHARSET_DIMENSION (charset_alt) == 1)                      \
1769       DECODE_CHARACTER_DIMENSION1 (charset_alt, c1);                    \
1770     else                                                                \
1771       DECODE_CHARACTER_DIMENSION2 (charset_alt, c1, c2);                \
1772   } while (0)
1773 /* Encode (CHARSET, C1, C2) in SJIS if `sjis_p', else BIG5, after unification.  */
1774 #define ENCODE_SJIS_BIG5_CHARACTER(charset, c1, c2)                       \
1775   do {                                                                    \
1776     int c_alt, charset_alt;                                               \
1777     if (!NILP (unification_table)                                         \
1778         && ((c_alt = unify_char (unification_table, -1, charset, c1, c2)) \
1779             >= 0))                                                        \
1780       SPLIT_CHAR (c_alt, charset_alt, c1, c2);                            \
1781     else                                                                  \
1782       charset_alt = charset;                                              \
1783     if (charset_alt == charset_ascii)                                     \
1784       *dst++ = c1;                                                        \
1785     else if (CHARSET_DIMENSION (charset_alt) == 1)                        \
1786       {                                                                   \
1787         if (sjis_p && charset_alt == charset_katakana_jisx0201)           \
1788           *dst++ = c1;                                                    \
1789         else                                                              \
1790           *dst++ = charset_alt, *dst++ = c1;                              \
1791       }                                                                   \
1792     else                                                                  \
1793       {                                                                   \
1794         c1 &= 0x7F, c2 &= 0x7F;                                           \
1795         if (sjis_p && charset_alt == charset_jisx0208)                    \
1796           {                                                               \
1797             unsigned char s1, s2;                                         \
1798                                                                           \
1799             ENCODE_SJIS (c1, c2, s1, s2);                                 \
1800             *dst++ = s1, *dst++ = s2;                                     \
1801           }                                                               \
1802         else if (!sjis_p                                                  \
1803                  && (charset_alt == charset_big5_1                        \
1804                      || charset_alt == charset_big5_2))                   \
1805           {                                                               \
1806             unsigned char b1, b2;                                         \
1807                                                                           \
1808             ENCODE_BIG5 (charset_alt, c1, c2, b1, b2);                    \
1809             *dst++ = b1, *dst++ = b2;                                     \
1810           }                                                               \
1811         else                                                              \
1812           *dst++ = charset_alt, *dst++ = c1, *dst++ = c2;                 \
1813       }                                                                   \
1814   } while (0)
1815
1816 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1817    Check if a text is encoded in SJIS.  If it is, return
1818    CODING_CATEGORY_MASK_SJIS, else return 0.  */
1819
1820 int
1821 detect_coding_sjis (src, src_end)
1822      unsigned char *src, *src_end;
1823 {
1824   while (src < src_end)
1825     {
1826       unsigned char lead = *src++;
1827
1828       /* ESC, SI, and SO disqualify the text as SJIS.  */
1829       if (lead == ISO_CODE_ESC || lead == ISO_CODE_SI || lead == ISO_CODE_SO)
1830         return 0;
1831       /* Only a JISX0208 1st byte has a 2nd byte to check, if present.  */
1832       if (lead < 0x80 || (lead >= 0xA0 && lead < 0xE0) || src >= src_end)
1833         continue;
1834       if (*src++ < 0x40)
1835         return 0;
1836     }
1837   return CODING_CATEGORY_MASK_SJIS;
1838 }
1839
1840 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1841    Check if a text is encoded in BIG5.  If it is, return
1842    CODING_CATEGORY_MASK_BIG5, else return 0.  */
1843
1844 int
1845 detect_coding_big5 (src, src_end)
1846      unsigned char *src, *src_end;
1847 {
1848   while (src < src_end)
1849     {
1850       unsigned char lead = *src++, trail;
1851
1852       /* ESC, SI, and SO disqualify the text as BIG5.  */
1853       if (lead == ISO_CODE_ESC || lead == ISO_CODE_SI || lead == ISO_CODE_SO)
1854         return 0;
1855       if (lead < 0xA1)
1856         continue;
1857       /* A 1st byte ending the text is accepted without a 2nd byte.  */
1858       if (src >= src_end)
1859         break;
1860       trail = *src++;
1861       if (trail < 0x40 || (trail >= 0x7F && trail <= 0xA0))
1862         return 0;
1863     }
1864   return CODING_CATEGORY_MASK_BIG5;
1865 }
1866
1867 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
1868    If SJIS_P is 1, decode SJIS text, else decode BIG5 text.  */
1869
1870 int
1871 decode_coding_sjis_big5 (coding, source, destination,
1872                          src_bytes, dst_bytes, consumed, sjis_p)
1873      struct coding_system *coding;
1874      unsigned char *source, *destination;
1875      int src_bytes, dst_bytes;
1876      int *consumed;
1877      int sjis_p;
1878 {
1879   unsigned char *src = source;
1880   unsigned char *src_end = source + src_bytes;
1881   unsigned char *dst = destination;
1882   unsigned char *dst_end = destination + dst_bytes;
1883   /* Since the maximum bytes produced by each loop is 4, we subtract 3
1884      from DST_END to assure overflow checking is necessary only at the
1885      head of loop.  */
1886   unsigned char *adjusted_dst_end = dst_end - 3;
1887   Lisp_Object unification_table
1888       = coding->character_unification_table_for_decode;
1889
1890   if (!NILP (Venable_character_unification) && NILP (unification_table))
1891     unification_table = Vstandard_character_unification_table_for_decode;
1892
1893   while (src < src_end && dst < adjusted_dst_end)
1894     {
1895       /* SRC_BASE remembers the start position in source in each loop.
1896          The loop will be exited when there's not enough source text
1897          to analyze two-byte character (within macro ONE_MORE_BYTE).
1898          In that case, SRC is reset to SRC_BASE before exiting.  */
1899       unsigned char *src_base = src;
1900       unsigned char c1 = *src++, c2, c3, c4;
1901
1902       if (c1 == '\r')
1903         {
1904           if (coding->eol_type == CODING_EOL_CRLF)
1905             {
1906               ONE_MORE_BYTE (c2);
1907               if (c2 == '\n')
1908                 *dst++ = c2;
1909               else
1910                 /* To process C2 again, SRC is subtracted by 1.  */
1911                 *dst++ = c1, src--;
1912             }
1913           else
1914             *dst++ = c1;
1915         }
1916       else if (c1 < 0x20)
1917         *dst++ = c1;
1918       else if (c1 < 0x80)
1919         DECODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
1920       else if (c1 < 0xA0 || c1 >= 0xE0)
1921         {
1922           /* SJIS -> JISX0208, BIG5 -> Big5 (only if 0xE0 <= c1 < 0xFF) */
1923           if (sjis_p)
1924             {
1925               ONE_MORE_BYTE (c2);
1926               DECODE_SJIS (c1, c2, c3, c4);
1927               DECODE_SJIS_BIG5_CHARACTER (charset_jisx0208, c3, c4);
1928             }
1929           else if (c1 >= 0xE0 && c1 < 0xFF)
1930             {
1931               int charset;
1932
1933               ONE_MORE_BYTE (c2);
1934               DECODE_BIG5 (c1, c2, charset, c3, c4);
1935               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1936             }
1937           else                  /* Invalid code */
1938             *dst++ = c1;
1939         }
1940       else
1941         {
1942           /* SJIS -> JISX0201-Kana, BIG5 -> Big5 */
1943           if (sjis_p)
1944             DECODE_SJIS_BIG5_CHARACTER (charset_katakana_jisx0201, c1, /* dummy */ c2);
1945           else
1946             {
1947               int charset;
1948
1949               ONE_MORE_BYTE (c2);
1950               DECODE_BIG5 (c1, c2, charset, c3, c4);
1951               DECODE_SJIS_BIG5_CHARACTER (charset, c3, c4);
1952             }
1953         }
1954       continue;
1955       /* Reached (via ONE_MORE_BYTE, presumably) when the source runs out.  */
1956     label_end_of_loop:
1957       coding->carryover_size = src - src_base;
1958       bcopy (src_base, coding->carryover, coding->carryover_size);
1959       src = src_base;
1960       break;
1961     }
1962
1963   *consumed = src - source;
1964   return dst - destination;
1965 }
1966
1967 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
1968    This function can encode `charset_ascii', `charset_katakana_jisx0201',
1969    `charset_jisx0208', `charset_big5_1', and `charset_big5_2'.  We are
1970    sure that all these charsets are registered as official charset
1971    (i.e. do not have extended leading-codes).  Characters of other
1972    charsets are produced without any encoding.  If SJIS_P is 1, encode
1973    SJIS text, else encode BIG5 text.  */
1974
1975 int
1976 encode_coding_sjis_big5 (coding, source, destination,
1977                          src_bytes, dst_bytes, consumed, sjis_p)
1978      struct coding_system *coding;
1979      unsigned char *source, *destination;
1980      int src_bytes, dst_bytes;
1981      int *consumed;
1982      int sjis_p;
1983 {
1984   unsigned char *src = source;
1985   unsigned char *src_end = source + src_bytes;
1986   unsigned char *dst = destination;
1987   unsigned char *dst_end = destination + dst_bytes;
1988   /* Since the maximum bytes produced by each loop is 3 (a character of
1989      an unsupported two-dimensional charset is produced as is), we
1990      subtract 2 from DST_END so overflow is checked only at loop head.  */
1991   unsigned char *adjusted_dst_end = dst_end - 2;
1992   Lisp_Object unification_table
1993       = coding->character_unification_table_for_encode;
1994
1995   if (!NILP (Venable_character_unification) && NILP (unification_table))
1996     unification_table = Vstandard_character_unification_table_for_encode;
1997
1998   while (src < src_end && dst < adjusted_dst_end)
1999     {
2000       /* SRC_BASE remembers the start position in source in each loop.
2001          The loop will be exited when there's not enough source text
2002          to analyze multi-byte codes (within macros ONE_MORE_BYTE and
2003          TWO_MORE_BYTES).  In that case, the text from SRC_BASE on is
2004          saved as carryover before exiting.  */
2005       unsigned char *src_base = src;
2006       unsigned char c1 = *src++, c2, c3, c4;
2007
2008       if (coding->composing)
2009         {
2010           if (c1 == 0xA0)
2011             {
2012               ONE_MORE_BYTE (c1);
2013               c1 &= 0x7F;
2014             }
2015           else if (c1 >= 0xA0)
2016             c1 -= 0x20;
2017           else
2018             coding->composing = 0;
2019         }
2020
2021       switch (emacs_code_class[c1])
2022         {
2023         case EMACS_ascii_code:
2024           ENCODE_SJIS_BIG5_CHARACTER (charset_ascii, c1, /* dummy */ c2);
2025           break;
2026
2027         case EMACS_control_code:
2028           *dst++ = c1;
2029           break;
2030
2031         case EMACS_carriage_return_code:
2032           if (!coding->selective)
2033             {
2034               *dst++ = c1;
2035               break;
2036             }
2037           /* fall down to treat '\r' as '\n' ...  */
2038
2039         case EMACS_linefeed_code:
2040           if (coding->eol_type == CODING_EOL_LF
2041               || coding->eol_type == CODING_EOL_UNDECIDED)
2042             *dst++ = '\n';
2043           else if (coding->eol_type == CODING_EOL_CRLF)
2044             *dst++ = '\r', *dst++ = '\n';
2045           else
2046             *dst++ = '\r';
2047           break;
2048
2049         case EMACS_leading_code_2:
2050           ONE_MORE_BYTE (c2);
2051           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, /* dummy */ c3);
2052           break;
2053
2054         case EMACS_leading_code_3:
2055           TWO_MORE_BYTES (c2, c3);
2056           ENCODE_SJIS_BIG5_CHARACTER (c1, c2, c3);
2057           break;
2058
2059         case EMACS_leading_code_4:
2060           THREE_MORE_BYTES (c2, c3, c4);
2061           ENCODE_SJIS_BIG5_CHARACTER (c2, c3, c4);
2062           break;
2063
2064         case EMACS_leading_code_composition:
2065           coding->composing = 1;
2066           break;
2067
2068         default:                /* i.e. case EMACS_invalid_code: */
2069           *dst++ = c1;
2070         }
2071       continue;
2072
2073     label_end_of_loop:
2074       coding->carryover_size = src_end - src_base;
2075       bcopy (src_base, coding->carryover, coding->carryover_size);
2076       src = src_end;
2077       break;
2078     }
2079
2080   *consumed = src - source;
2081   return dst - destination;
2082 }
2083
2084 \f
2085 /*** 5. End-of-line handlers ***/
2086
2087 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2088    Decode the end-of-line format given by `coding->eol_type'.  This
2089    function is called only when it is CODING_EOL_CRLF or CODING_EOL_CR.  */
2090
2091 decode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2092      struct coding_system *coding;
2093      unsigned char *source, *destination;
2094      int src_bytes, dst_bytes;
2095      int *consumed;
2096 {
2097   unsigned char *src = source;
2098   unsigned char *src_end = source + src_bytes;
2099   unsigned char *dst = destination;
2100   unsigned char *dst_end = destination + dst_bytes;
2101   int produced;                 /* bytes stored in DESTINATION; returned */
2102
2103   switch (coding->eol_type)
2104     {
2105     case CODING_EOL_CRLF:
2106       {
2107         /* Since the maximum bytes produced by each loop is 2, we
2108            subtract 1 from DST_END to assure overflow checking is
2109            necessary only at the head of loop.  */
2110         unsigned char *adjusted_dst_end = dst_end - 1;
2111
2112         while (src < src_end && dst < adjusted_dst_end)
2113           {
2114             unsigned char *src_base = src;
2115             unsigned char c = *src++;
2116             if (c == '\r')
2117               {
2118                 ONE_MORE_BYTE (c);
2119                 if (c != '\n')
2120                   *dst++ = '\r';
2121                 *dst++ = c;
2122               }
2123             else
2124               *dst++ = c;
2125             continue;
2126             /* Reached (via ONE_MORE_BYTE, presumably) when CR ends the source.  */
2127           label_end_of_loop:
2128             coding->carryover_size = src - src_base;
2129             bcopy (src_base, coding->carryover, coding->carryover_size);
2130             src = src_base;
2131             break;
2132           }
2133         *consumed = src - source;
2134         produced = dst - destination;
2135         break;
2136       }
2137
2138     case CODING_EOL_CR:         /* copy what fits, then turn each CR into LF */
2139       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2140       bcopy (source, destination, produced);
2141       dst_end = destination + produced;
2142       while (dst < dst_end)
2143         if (*dst++ == '\r') dst[-1] = '\n';
2144       *consumed = produced;
2145       break;
2146
2147     default:                    /* i.e. case: CODING_EOL_LF */
2148       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2149       bcopy (source, destination, produced);
2150       *consumed = produced;
2151       break;
2152     }
2153
2154   return produced;
2155 }
2156
2157 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
2158    format of end-of-line according to `coding->eol_type'.  If
2159    `coding->selective' is 1, code '\r' in source text also means
2160    end-of-line.  */
2161
2162 encode_eol (coding, source, destination, src_bytes, dst_bytes, consumed)
2163      struct coding_system *coding;
2164      unsigned char *source, *destination;
2165      int src_bytes, dst_bytes;
2166      int *consumed;
2167 {
2168   unsigned char *src = source;
2169   unsigned char *dst = destination;
2170   int produced;
2171
2172   if (src_bytes <= 0)
2173     return 0;
2174
2175   switch (coding->eol_type)
2176     {
2177     case CODING_EOL_LF:
2178     case CODING_EOL_UNDECIDED:
2179       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2180       bcopy (source, destination, produced);
2181       if (coding->selective)
2182         {
2183           int i = produced;
2184           while (i--)
2185             if (*dst++ == '\r') dst[-1] = '\n';
2186         }
2187       *consumed = produced;
2188
2189     case CODING_EOL_CRLF:
2190       {
2191         unsigned char c;
2192         unsigned char *src_end = source + src_bytes;
2193         unsigned char *dst_end = destination + dst_bytes;
2194         /* Since the maximum bytes produced by each loop is 2, we
2195            subtract 1 from DST_END to assure overflow checking is
2196            necessary only at the head of loop.  */
2197         unsigned char *adjusted_dst_end = dst_end - 1;
2198
2199         while (src < src_end && dst < adjusted_dst_end)
2200           {
2201             c = *src++;
2202             if (c == '\n' || (c == '\r' && coding->selective))
2203               *dst++ = '\r', *dst++ = '\n';
2204             else
2205               *dst++ = c;
2206           }
2207         produced = dst - destination;
2208         *consumed = src - source;
2209         break;
2210       }
2211
2212     default:                    /* i.e. case CODING_EOL_CR: */
2213       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2214       bcopy (source, destination, produced);
2215       {
2216         int i = produced;
2217         while (i--)
2218           if (*dst++ == '\n') dst[-1] = '\r';
2219       }
2220       *consumed = produced;
2221     }
2222
2223   return produced;
2224 }
2225
2226 \f
2227 /*** 6. C library functions ***/
2228
2229 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2230    has a property `coding-system'.  The value of this property is a
2231    vector of length 5 (called as coding-vector).  Among elements of
2232    this vector, the first (element[0]) and the fifth (element[4])
2233    carry important information for decoding/encoding.  Before
2234    decoding/encoding, this information should be set in fields of a
2235    structure of type `coding_system'.
2236
2237    A value of property `coding-system' can be a symbol of another
2238    subsidiary coding-system.  In that case, Emacs gets coding-vector
2239    from that symbol.
2240
2241    `element[0]' contains information to be set in `coding->type'.  The
2242    value and its meaning is as follows:
2243
2244    0 -- coding_type_emacs_mule
2245    1 -- coding_type_sjis
2246    2 -- coding_type_iso2022
2247    3 -- coding_type_big5
   4 -- coding_type_ccl encoder/decoder written in CCL
   5 -- coding_type_raw_text
2249    nil -- coding_type_no_conversion
2250    t -- coding_type_undecided (automatic conversion on decoding,
2251                                no-conversion on encoding)
2252
2253    `element[4]' contains information to be set in `coding->flags' and
2254    `coding->spec'.  The meaning varies by `coding->type'.
2255
   If `coding->type' is `coding_type_iso2022', element[4] is a vector
   of length 32 (of which the first 17 sub-elements are used now).
2258    Meanings of these sub-elements are:
2259
2260    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2261         If the value is an integer of valid charset, the charset is
2262         assumed to be designated to graphic register N initially.
2263
2264         If the value is minus, it is a minus value of charset which
2265         reserves graphic register N, which means that the charset is
2266         not designated initially but should be designated to graphic
2267         register N just before encoding a character in that charset.
2268
2269         If the value is nil, graphic register N is never used on
2270         encoding.
2271
   sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2273         Each value takes t or nil.  See the section ISO2022 of
2274         `coding.h' for more information.
2275
2276    If `coding->type' is `coding_type_big5', element[4] is t to denote
2277    BIG5-ETen or nil to denote BIG5-HKU.
2278
2279    If `coding->type' takes the other value, element[4] is ignored.
2280
2281    Emacs Lisp's coding system also carries information about format of
2282    end-of-line in a value of property `eol-type'.  If the value is
2283    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2284    means CODING_EOL_CR.  If it is not integer, it should be a vector
2285    of subsidiary coding systems of which property `eol-type' has one
2286    of above values.
2287
2288 */
2289
2290 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2291    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2292    is setup so that no conversion is necessary and return -1, else
2293    return 0.  */
2294
2295 int
2296 setup_coding_system (coding_system, coding)
2297      Lisp_Object coding_system;
2298      struct coding_system *coding;
2299 {
2300   Lisp_Object type, eol_type;
2301
2302   /* At first, set several fields to default values.  */
2303   coding->require_flushing = 0;
2304   coding->last_block = 0;
2305   coding->selective = 0;
2306   coding->composing = 0;
2307   coding->direction = 0;
2308   coding->carryover_size = 0;
2309   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2310   coding->character_unification_table_for_decode = Qnil;
2311   coding->character_unification_table_for_encode = Qnil;
2312
2313   Vlast_coding_system_used = coding->symbol = coding_system;
2314   eol_type = Qnil;
2315   /* Get value of property `coding-system' until we get a vector.
2316      While doing that, also get values of properties
2317      `post-read-conversion', `pre-write-conversion',
2318      `character-unification-table-for-decode',
2319      `character-unification-table-for-encode' and `eol-type'.  */
2320   while (!NILP (coding_system) && SYMBOLP (coding_system))
2321     {
2322       if (NILP (coding->post_read_conversion))
2323         coding->post_read_conversion = Fget (coding_system,
2324                                              Qpost_read_conversion);
2325       if (NILP (coding->pre_write_conversion))
2326         coding->pre_write_conversion = Fget (coding_system,
2327                                              Qpre_write_conversion);
2328       if (!inhibit_eol_conversion && NILP (eol_type))
2329         eol_type = Fget (coding_system, Qeol_type);
2330
2331       if (NILP (coding->character_unification_table_for_decode))
2332         coding->character_unification_table_for_decode
2333           = Fget (coding_system, Qcharacter_unification_table_for_decode);
2334
2335       if (NILP (coding->character_unification_table_for_encode))
2336         coding->character_unification_table_for_encode
2337           = Fget (coding_system, Qcharacter_unification_table_for_encode);
2338
2339       coding_system = Fget (coding_system, Qcoding_system);
2340     }
2341
2342   while (!NILP (coding->character_unification_table_for_decode)
2343          && SYMBOLP (coding->character_unification_table_for_decode))
2344         coding->character_unification_table_for_decode
2345           = Fget (coding->character_unification_table_for_decode,
2346                   Qcharacter_unification_table_for_decode);
2347   if (!NILP (coding->character_unification_table_for_decode)
2348       && !CHAR_TABLE_P (coding->character_unification_table_for_decode))
2349       coding->character_unification_table_for_decode = Qnil;
2350
2351   while (!NILP (coding->character_unification_table_for_encode)
2352          && SYMBOLP (coding->character_unification_table_for_encode))
2353         coding->character_unification_table_for_encode
2354           = Fget (coding->character_unification_table_for_encode,
2355                   Qcharacter_unification_table_for_encode);
2356   if (!NILP (coding->character_unification_table_for_encode)
2357       && !CHAR_TABLE_P (coding->character_unification_table_for_encode))
2358       coding->character_unification_table_for_encode = Qnil;
2359
2360   if (!VECTORP (coding_system)
2361       || XVECTOR (coding_system)->size != 5)
2362     goto label_invalid_coding_system;
2363
2364   if (VECTORP (eol_type))
2365     coding->eol_type = CODING_EOL_UNDECIDED;
2366   else if (XFASTINT (eol_type) == 1)
2367     coding->eol_type = CODING_EOL_CRLF;
2368   else if (XFASTINT (eol_type) == 2)
2369     coding->eol_type = CODING_EOL_CR;
2370   else
2371     coding->eol_type = CODING_EOL_LF;
2372
2373   type = XVECTOR (coding_system)->contents[0];
2374   switch (XFASTINT (type))
2375     {
2376     case 0:
2377       coding->type = coding_type_emacs_mule;
2378       break;
2379
2380     case 1:
2381       coding->type = coding_type_sjis;
2382       break;
2383
2384     case 2:
2385       coding->type = coding_type_iso2022;
2386       {
2387         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2388         Lisp_Object *flags;
2389         int i, charset, default_reg_bits = 0;
2390
2391         if (!VECTORP (val) || XVECTOR (val)->size != 32)
2392           goto label_invalid_coding_system;
2393
2394         flags = XVECTOR (val)->contents;
2395         coding->flags
2396           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
2397              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
2398              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
2399              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
2400              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
2401              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
2402              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
2403              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
2404              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
2405              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
2406              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
2407              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
2408              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
2409              );
2410
2411         /* Invoke graphic register 0 to plane 0.  */
2412         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
2413         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
2414         CODING_SPEC_ISO_INVOCATION (coding, 1)
2415           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
2416         /* Not single shifting at first.  */
2417         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
2418         /* Beginning of buffer should also be regarded as bol. */
2419         CODING_SPEC_ISO_BOL (coding) = 1;
2420
2421         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
2422            FLAGS[REG] can be one of below:
2423                 integer CHARSET: CHARSET occupies register I,
2424                 t: designate nothing to REG initially, but can be used
2425                   by any charsets,
2426                 list of integer, nil, or t: designate the first
2427                   element (if integer) to REG initially, the remaining
2428                   elements (if integer) is designated to REG on request,
2429                   if an element is t, REG can be used by any charset,
2430                 nil: REG is never used.  */
2431         for (charset = 0; charset <= MAX_CHARSET; charset++)
2432           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2433             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
2434         bzero (CODING_SPEC_ISO_EXPECTED_CHARSETS (coding), MAX_CHARSET + 1);
2435         for (i = 0; i < 4; i++)
2436           {
2437             if (INTEGERP (flags[i])
2438                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
2439                 || (charset = get_charset_id (flags[i])) >= 0)
2440               {
2441                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2442                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
2443                 CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2444               }
2445             else if (EQ (flags[i], Qt))
2446               {
2447                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2448                 default_reg_bits |= 1 << i;
2449               }
2450             else if (CONSP (flags[i]))
2451               {
2452                 Lisp_Object tail = flags[i];
2453
2454                 if (INTEGERP (XCONS (tail)->car)
2455                     && (charset = XINT (XCONS (tail)->car),
2456                         CHARSET_VALID_P (charset))
2457                     || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2458                   {
2459                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
2460                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
2461                     CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset] = 1;
2462                   }
2463                 else
2464                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2465                 tail = XCONS (tail)->cdr;
2466                 while (CONSP (tail))
2467                   {
2468                     if (INTEGERP (XCONS (tail)->car)
2469                         && (charset = XINT (XCONS (tail)->car),
2470                             CHARSET_VALID_P (charset))
2471                         || (charset = get_charset_id (XCONS (tail)->car)) >= 0)
2472                       {
2473                         CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2474                           = i;
2475                         CODING_SPEC_ISO_EXPECTED_CHARSETS (coding)[charset]
2476                           = 1;
2477                       }
2478                     else if (EQ (XCONS (tail)->car, Qt))
2479                       default_reg_bits |= 1 << i;
2480                     tail = XCONS (tail)->cdr;
2481                   }
2482               }
2483             else
2484               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
2485
2486             CODING_SPEC_ISO_DESIGNATION (coding, i)
2487               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
2488           }
2489
2490         if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
2491           {
2492             /* REG 1 can be used only by locking shift in 7-bit env.  */
2493             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2494               default_reg_bits &= ~2;
2495             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
2496               /* Without any shifting, only REG 0 and 1 can be used.  */
2497               default_reg_bits &= 3;
2498           }
2499
2500         for (charset = 0; charset <= MAX_CHARSET; charset++)
2501           if (CHARSET_VALID_P (charset)
2502               && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2503                   == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
2504             {
2505               /* We have not yet decided where to designate CHARSET.  */
2506               int reg_bits = default_reg_bits;
2507
2508               if (CHARSET_CHARS (charset) == 96)
2509                 /* A charset of CHARS96 can't be designated to REG 0.  */
2510                 reg_bits &= ~1;
2511
2512               if (reg_bits)
2513                 /* There exist some default graphic register.  */
2514                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2515                   = (reg_bits & 1
2516                      ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
2517               else
2518                 /* We anyway have to designate CHARSET to somewhere.  */
2519                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
2520                   = (CHARSET_CHARS (charset) == 94
2521                      ? 0
2522                      : ((coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT
2523                          || ! coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
2524                         ? 1
2525                         : (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT
2526                            ? 2 : 0)));
2527             }
2528       }
2529       coding->require_flushing = 1;
2530       break;
2531
2532     case 3:
2533       coding->type = coding_type_big5;
2534       coding->flags
2535         = (NILP (XVECTOR (coding_system)->contents[4])
2536            ? CODING_FLAG_BIG5_HKU
2537            : CODING_FLAG_BIG5_ETEN);
2538       break;
2539
2540     case 4:
2541       coding->type = coding_type_ccl;
2542       {
2543         Lisp_Object val = XVECTOR (coding_system)->contents[4];
2544         if (CONSP  (val)
2545             && VECTORP (XCONS (val)->car)
2546             && VECTORP (XCONS (val)->cdr))
2547           {
2548             setup_ccl_program (&(coding->spec.ccl.decoder), XCONS (val)->car);
2549             setup_ccl_program (&(coding->spec.ccl.encoder), XCONS (val)->cdr);
2550           }
2551         else
2552           goto label_invalid_coding_system;
2553       }
2554       coding->require_flushing = 1;
2555       break;
2556
2557     case 5:
2558       coding->type = coding_type_raw_text;
2559       break;
2560
2561     default:
2562       if (EQ (type, Qt))
2563         coding->type = coding_type_undecided;
2564       else
2565         coding->type = coding_type_no_conversion;
2566       break;
2567     }
2568   return 0;
2569
2570  label_invalid_coding_system:
2571   coding->type = coding_type_no_conversion;
2572   coding->eol_type = CODING_EOL_LF;
2573   coding->symbol = coding->pre_write_conversion = coding->post_read_conversion
2574     = Qnil;
2575   return -1;
2576 }
2577
2578 /* Emacs has a mechanism to automatically detect a coding system if it
2579    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
2580    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
2583
2584    o coding-category-emacs-mule
2585
2586         The category for a coding system which has the same code range
2587         as Emacs' internal format.  Assigned the coding-system (Lisp
2588         symbol) `emacs-mule' by default.
2589
2590    o coding-category-sjis
2591
2592         The category for a coding system which has the same code range
2593         as SJIS.  Assigned the coding-system (Lisp
2594         symbol) `japanese-shift-jis' by default.
2595
2596    o coding-category-iso-7
2597
2598         The category for a coding system which has the same code range
2599         as ISO2022 of 7-bit environment.  This doesn't use any locking
2600         shift and single shift functions.  Assigned the coding-system
2601         (Lisp symbol) `iso-2022-7bit' by default.
2602
2603    o coding-category-iso-8-1
2604
2605         The category for a coding system which has the same code range
2606         as ISO2022 of 8-bit environment and graphic plane 1 used only
2607         for DIMENSION1 charset.  This doesn't use any locking shift
2608         and single shift functions.  Assigned the coding-system (Lisp
2609         symbol) `iso-latin-1' by default.
2610
2611    o coding-category-iso-8-2
2612
2613         The category for a coding system which has the same code range
2614         as ISO2022 of 8-bit environment and graphic plane 1 used only
2615         for DIMENSION2 charset.  This doesn't use any locking shift
2616         and single shift functions.  Assigned the coding-system (Lisp
2617         symbol) `japanese-iso-8bit' by default.
2618
2619    o coding-category-iso-7-else
2620
2621         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
2623         single shift functions.  Assigned the coding-system (Lisp
2624         symbol) `iso-2022-7bit-lock' by default.
2625
2626    o coding-category-iso-8-else
2627
2628         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
2630         single shift functions.  Assigned the coding-system (Lisp
2631         symbol) `iso-2022-8bit-ss2' by default.
2632
2633    o coding-category-big5
2634
2635         The category for a coding system which has the same code range
2636         as BIG5.  Assigned the coding-system (Lisp symbol)
2637         `cn-big5' by default.
2638
2639    o coding-category-binary
2640
2641         The category for a coding system not categorized in any of the
2642         above.  Assigned the coding-system (Lisp symbol)
2643         `no-conversion' by default.
2644
2645    Each of them is a Lisp symbol and the value is an actual
2646    `coding-system's (this is also a Lisp symbol) assigned by a user.
2647    What Emacs does actually is to detect a category of coding system.
2648    Then, it uses a `coding-system' assigned to it.  If Emacs can't
2649    decide only one possible category, it selects a category of the
2650    highest priority.  Priorities of categories are also specified by a
2651    user in a Lisp variable `coding-category-list'.
2652
2653 */
2654
2655 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2656    If it detects possible coding systems, return an integer in which
2657    appropriate flag bits are set.  Flag bits are defined by macros
2658    CODING_CATEGORY_MASK_XXX in `coding.h'.  */
2659
2660 int
2661 detect_coding_mask (src, src_bytes)
2662      unsigned char *src;
2663      int src_bytes;
2664 {
2665   register unsigned char c;
2666   unsigned char *src_end = src + src_bytes;
2667   int mask;
2668
2669   /* At first, skip all ASCII characters and control characters except
2670      for three ISO2022 specific control characters.  */
2671  label_loop_detect_coding:
2672   while (src < src_end)
2673     {
2674       c = *src;
2675       if (c >= 0x80
2676           || (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2677         break;
2678       src++;
2679     }
2680
2681   if (src >= src_end)
2682     /* We found nothing other than ASCII.  There's nothing to do.  */
2683     return CODING_CATEGORY_MASK_ANY;
2684
2685   /* The text seems to be encoded in some multilingual coding system.
2686      Now, try to find in which coding system the text is encoded.  */
2687   if (c < 0x80)
2688     {
2689       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
2690       /* C is an ISO2022 specific control code of C0.  */
2691       mask = detect_coding_iso2022 (src, src_end);
2692       src++;
2693       if (mask == CODING_CATEGORY_MASK_ANY)
2694         /* No valid ISO2022 code follows C.  Try again.  */
2695         goto label_loop_detect_coding;
2696     }
2697   else if (c < 0xA0)
2698     {
2699       /* If C is a special latin extra code,
2700          or is an ISO2022 specific control code of C1 (SS2 or SS3),
2701          or is an ISO2022 control-sequence-introducer (CSI),
2702          we should also consider the possibility of ISO2022 codings.  */
2703       if ((VECTORP (Vlatin_extra_code_table)
2704            && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2705           || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
2706           || (c == ISO_CODE_CSI
2707               && (src < src_end
2708                   && (*src == ']'
2709                       || (src + 1 < src_end
2710                           && src[1] == ']'
2711                           && (*src == '0' || *src == '1' || *src == '2'))))))
2712         mask = (detect_coding_iso2022 (src, src_end)
2713                 | detect_coding_sjis (src, src_end)
2714                 | detect_coding_emacs_mule (src, src_end)
2715                 | CODING_CATEGORY_MASK_RAW_TEXT);
2716
2717       else
2718         /* C is the first byte of SJIS character code,
2719            or a leading-code of Emacs' internal format (emacs-mule).  */
2720         mask = (detect_coding_sjis (src, src_end)
2721                 | detect_coding_emacs_mule (src, src_end)
2722                 | CODING_CATEGORY_MASK_RAW_TEXT);
2723     }
2724   else
2725     /* C is a character of ISO2022 in graphic plane right,
2726        or a SJIS's 1-byte character code (i.e. JISX0201),
2727        or the first byte of BIG5's 2-byte code.  */
2728     mask = (detect_coding_iso2022 (src, src_end)
2729             | detect_coding_sjis (src, src_end)
2730             | detect_coding_big5 (src, src_end)
2731             | CODING_CATEGORY_MASK_RAW_TEXT);
2732
2733   return mask;
2734 }
2735
2736 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
2737    The information of the detected coding system is set in CODING.  */
2738
2739 void
2740 detect_coding (coding, src, src_bytes)
2741      struct coding_system *coding;
2742      unsigned char *src;
2743      int src_bytes;
2744 {
2745   int mask = detect_coding_mask (src, src_bytes);
2746   int idx;
2747   Lisp_Object val = Vcoding_category_list;
2748
2749   if (mask == CODING_CATEGORY_MASK_ANY)
2750     /* We found nothing other than ASCII.  There's nothing to do.  */
2751     return;
2752
2753   /* We found some plausible coding systems.  Let's use a coding
2754      system of the highest priority.  */
2755
2756   if (CONSP (val))
2757     while (!NILP (val))
2758       {
2759         idx = XFASTINT (Fget (XCONS (val)->car, Qcoding_category_index));
2760         if ((idx < CODING_CATEGORY_IDX_MAX) && (mask & (1 << idx)))
2761           break;
2762         val = XCONS (val)->cdr;
2763       }
2764   else
2765     val = Qnil;
2766
2767   if (NILP (val))
2768     {
2769       /* For unknown reason, `Vcoding_category_list' contains none of
2770          found categories.  Let's use any of them.  */
2771       for (idx = 0; idx < CODING_CATEGORY_IDX_MAX; idx++)
2772         if (mask & (1 << idx))
2773           break;
2774     }
2775   setup_coding_system (XSYMBOL (coding_category_table[idx])->value, coding);
2776 }
2777
2778 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2779    is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
2780    CODING_EOL_CR, and CODING_EOL_UNDECIDED.  */
2781
2782 #define MAX_EOL_CHECK_COUNT 3
2783
2784 int
2785 detect_eol_type (src, src_bytes)
2786      unsigned char *src;
2787      int src_bytes;
2788 {
2789   unsigned char *src_end = src + src_bytes;
2790   unsigned char c;
2791   int total = 0;                /* How many end-of-lines are found so far.  */
2792   int eol_type = CODING_EOL_UNDECIDED;
2793   int this_eol_type;
2794
2795   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
2796     {
2797       c = *src++;
2798       if (c == '\n' || c == '\r')
2799         {
2800           total++;
2801           if (c == '\n')
2802             this_eol_type = CODING_EOL_LF;
2803           else if (src >= src_end || *src != '\n')
2804             this_eol_type = CODING_EOL_CR;
2805           else
2806             this_eol_type = CODING_EOL_CRLF, src++;
2807
2808           if (eol_type == CODING_EOL_UNDECIDED)
2809             /* This is the first end-of-line.  */
2810             eol_type = this_eol_type;
2811           else if (eol_type != this_eol_type)
2812             /* The found type is different from what found before.
2813                Let's notice the caller about this inconsistency.  */
2814             return CODING_EOL_INCONSISTENT;
2815         }
2816     }
2817
2818   return eol_type;
2819 }
2820
2821 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
2822    is encoded.  If it detects an appropriate format of end-of-line, it
2823    sets the information in *CODING.  */
2824
2825 void
2826 detect_eol (coding, src, src_bytes)
2827      struct coding_system *coding;
2828      unsigned char *src;
2829      int src_bytes;
2830 {
2831   Lisp_Object val, coding_system;
2832   int eol_type = detect_eol_type (src, src_bytes);
2833
2834   if (eol_type == CODING_EOL_UNDECIDED)
2835     /*  We found no end-of-line in the source text.  */
2836     return;
2837
2838   if (eol_type == CODING_EOL_INCONSISTENT)
2839     {
2840 #if 0
2841       /* This code is suppressed until we find a better way to
2842          distinguish raw text file and binary file.  */
2843
2844       /* If we have already detected that the coding is raw-text, the
2845          coding should actually be no-conversion.  */
2846       if (coding->type == coding_type_raw_text)
2847         {
2848           setup_coding_system (Qno_conversion, coding);
2849           return;
2850         }
2851       /* Else, let's decode only text code anyway.  */
2852 #endif /* 0 */
2853       eol_type == CODING_EOL_LF;
2854     }
2855
2856   coding_system = coding->symbol;
2857   while (!NILP (coding_system)
2858          && NILP (val = Fget (coding_system, Qeol_type)))
2859     coding_system = Fget (coding_system, Qcoding_system);
2860   if (VECTORP (val) && XVECTOR (val)->size == 3)
2861     setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
2862 }
2863
2864 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
2865    decoding, it may detect coding system and format of end-of-line if
2866    those are not yet decided.  */
2867
2868 int
2869 decode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2870      struct coding_system *coding;
2871      unsigned char *source, *destination;
2872      int src_bytes, dst_bytes;
2873      int *consumed;
2874 {
2875   int produced;
2876
2877   if (src_bytes <= 0)
2878     {
2879       *consumed = 0;
2880       return 0;
2881     }
2882
2883   if (coding->type == coding_type_undecided)
2884     detect_coding (coding, source, src_bytes);
2885
2886   if (coding->eol_type == CODING_EOL_UNDECIDED)
2887     detect_eol (coding, source, src_bytes);
2888
2889   coding->carryover_size = 0;
2890   switch (coding->type)
2891     {
2892     case coding_type_no_conversion:
2893     label_no_conversion:
2894       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2895       bcopy (source, destination, produced);
2896       *consumed = produced;
2897       break;
2898
2899     case coding_type_emacs_mule:
2900     case coding_type_undecided:
2901     case coding_type_raw_text:
2902       if (coding->eol_type == CODING_EOL_LF
2903           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2904         goto label_no_conversion;
2905       produced = decode_eol (coding, source, destination,
2906                              src_bytes, dst_bytes, consumed);
2907       break;
2908
2909     case coding_type_sjis:
2910       produced = decode_coding_sjis_big5 (coding, source, destination,
2911                                           src_bytes, dst_bytes, consumed,
2912                                           1);
2913       break;
2914
2915     case coding_type_iso2022:
2916       produced = decode_coding_iso2022 (coding, source, destination,
2917                                         src_bytes, dst_bytes, consumed);
2918       break;
2919
2920     case coding_type_big5:
2921       produced = decode_coding_sjis_big5 (coding, source, destination,
2922                                           src_bytes, dst_bytes, consumed,
2923                                           0);
2924       break;
2925
2926     case coding_type_ccl:
2927       produced = ccl_driver (&coding->spec.ccl.decoder, source, destination,
2928                              src_bytes, dst_bytes, consumed);
2929       break;
2930     }
2931
2932   return produced;
2933 }
2934
2935 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  */
2936
2937 int
2938 encode_coding (coding, source, destination, src_bytes, dst_bytes, consumed)
2939      struct coding_system *coding;
2940      unsigned char *source, *destination;
2941      int src_bytes, dst_bytes;
2942      int *consumed;
2943 {
2944   int produced;
2945
2946   switch (coding->type)
2947     {
2948     case coding_type_no_conversion:
2949     label_no_conversion:
2950       produced = (src_bytes > dst_bytes) ? dst_bytes : src_bytes;
2951       if (produced > 0)
2952         {
2953           bcopy (source, destination, produced);
2954           if (coding->selective)
2955             {
2956               unsigned char *p = destination, *pend = destination + produced;
2957               while (p < pend)
2958                 if (*p++ == '\015') p[-1] = '\n';
2959             }
2960         }
2961       *consumed = produced;
2962       break;
2963
2964     case coding_type_emacs_mule:
2965     case coding_type_undecided:
2966     case coding_type_raw_text:
2967       if (coding->eol_type == CODING_EOL_LF
2968           ||  coding->eol_type == CODING_EOL_UNDECIDED)
2969         goto label_no_conversion;
2970       produced = encode_eol (coding, source, destination,
2971                              src_bytes, dst_bytes, consumed);
2972       break;
2973
2974     case coding_type_sjis:
2975       produced = encode_coding_sjis_big5 (coding, source, destination,
2976                                           src_bytes, dst_bytes, consumed,
2977                                           1);
2978       break;
2979
2980     case coding_type_iso2022:
2981       produced = encode_coding_iso2022 (coding, source, destination,
2982                                         src_bytes, dst_bytes, consumed);
2983       break;
2984
2985     case coding_type_big5:
2986       produced = encode_coding_sjis_big5 (coding, source, destination,
2987                                           src_bytes, dst_bytes, consumed,
2988                                           0);
2989       break;
2990
2991     case coding_type_ccl:
2992       produced = ccl_driver (&coding->spec.ccl.encoder, source, destination,
2993                              src_bytes, dst_bytes, consumed);
2994       break;
2995     }
2996
2997   return produced;
2998 }
2999
3000 #define CONVERSION_BUFFER_EXTRA_ROOM 256
3001
3002 /* Return maximum size (bytes) of a buffer enough for decoding
3003    SRC_BYTES of text encoded in CODING.  */
3004
3005 int
3006 decoding_buffer_size (coding, src_bytes)
3007      struct coding_system *coding;
3008      int src_bytes;
3009 {
3010   int magnification;
3011
3012   if (coding->type == coding_type_iso2022)
3013     magnification = 3;
3014   else if (coding->type == coding_type_ccl)
3015     magnification = coding->spec.ccl.decoder.buf_magnification;
3016   else
3017     magnification = 2;
3018
3019   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3020 }
3021
3022 /* Return maximum size (bytes) of a buffer enough for encoding
3023    SRC_BYTES of text to CODING.  */
3024
3025 int
3026 encoding_buffer_size (coding, src_bytes)
3027      struct coding_system *coding;
3028      int src_bytes;
3029 {
3030   int magnification;
3031
3032   if (coding->type == coding_type_ccl)
3033     magnification = coding->spec.ccl.encoder.buf_magnification;
3034   else
3035     magnification = 3;
3036
3037   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3038 }
3039
#ifndef MINIMUM_CONVERSION_BUFFER_SIZE
#define MINIMUM_CONVERSION_BUFFER_SIZE 1024
#endif

/* Shared work buffer used for encoding and decoding, and its current
   size in bytes.  */
char *conversion_buffer;
int conversion_buffer_size;

/* Return a pointer to a buffer of at least SIZE bytes to be used for
   encoding or decoding.  The shared buffer is reused when it is
   already big enough; otherwise a bigger one is allocated with
   xmalloc, the old one is released, and the old contents are NOT
   preserved.

   NOTE(review): the previous comment said NULL is returned when
   memory runs out; that depends on how xmalloc reports failure, which
   is not visible here -- confirm.  */

char *
get_conversion_buffer (size)
     int size;
{
  if (size > conversion_buffer_size)
    {
      char *buf;
      int real_size = conversion_buffer_size;

      /* Start growing from a sane minimum.  Doubling a size of zero
         (e.g. when the buffer has not been set up yet) would never
         reach SIZE and loop forever.  */
      if (real_size < MINIMUM_CONVERSION_BUFFER_SIZE)
        real_size = MINIMUM_CONVERSION_BUFFER_SIZE;

      /* Since SIZE exceeds the current size, an already initialized
         buffer is at least doubled here.  */
      while (real_size < size)
        real_size *= 2;

      buf = (char *) xmalloc (real_size);
      /* Whether xfree accepts a null pointer is not visible here, so
         do not hand it one.  */
      if (conversion_buffer)
        xfree (conversion_buffer);
      conversion_buffer = buf;
      conversion_buffer_size = real_size;
    }
  return conversion_buffer;
}
3068
3069 \f
3070 #ifdef emacs
3071 /*** 7. Emacs Lisp library functions ***/
3072
3073 DEFUN ("coding-system-spec", Fcoding_system_spec, Scoding_system_spec,
3074        1, 1, 0,
3075   "Return coding-spec of CODING-SYSTEM.\n\
3076 If CODING-SYSTEM is not a valid coding-system, return nil.")
3077   (obj)
3078      Lisp_Object obj;
3079 {
3080   while (SYMBOLP (obj) && !NILP (obj))
3081     obj = Fget (obj, Qcoding_system);
3082   return ((NILP (obj) || !VECTORP (obj) || XVECTOR (obj)->size != 5)
3083           ? Qnil : obj);
3084 }
3085
3086 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
3087   "Return t if OBJECT is nil or a coding-system.\n\
3088 See document of make-coding-system for coding-system object.")
3089   (obj)
3090      Lisp_Object obj;
3091 {
3092   return ((NILP (obj) || !NILP (Fcoding_system_spec (obj))) ? Qt : Qnil);
3093 }
3094
3095 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
3096        Sread_non_nil_coding_system, 1, 1, 0,
3097   "Read a coding system from the minibuffer, prompting with string PROMPT.")
3098   (prompt)
3099      Lisp_Object prompt;
3100 {
3101   Lisp_Object val;
3102   do
3103     {
3104       val = Fcompleting_read (prompt, Vobarray, Qcoding_system_spec,
3105                               Qt, Qnil, Qnil, Qnil, Qnil);
3106     }
3107   while (XSTRING (val)->size == 0);
3108   return (Fintern (val, Qnil));
3109 }
3110
3111 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 1, 0,
3112   "Read a coding system or nil from the minibuffer, prompting with string PROMPT.")
3113   (prompt)
3114      Lisp_Object prompt;
3115 {
3116   Lisp_Object val = Fcompleting_read (prompt, Vobarray, Qcoding_system_p,
3117                                       Qt, Qnil, Qnil, Qnil, Qnil);
3118   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
3119 }
3120
3121 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
3122        1, 1, 0,
3123   "Check validity of CODING-SYSTEM.\n\
3124 If valid, return CODING-SYSTEM, else `coding-system-error' is signaled.\n\
3125 CODING-SYSTEM is valid if it is a symbol and has \"coding-system\" property.\n\
3126 The value of property should be a vector of length 5.")
3127   (coding_system)
3128      Lisp_Object coding_system;
3129 {
3130   CHECK_SYMBOL (coding_system, 0);
3131   if (!NILP (Fcoding_system_p (coding_system)))
3132     return coding_system;
3133   while (1)
3134     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
3135 }
3136
3137 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
3138        2, 2, 0,
3139   "Detect coding system of the text in the region between START and END.\n\
3140 Return a list of possible coding systems ordered by priority.\n\
3141 If only ASCII characters are found, it returns `undecided'\n\
3142  or its subsidiary coding system according to a detected end-of-line format.")
3143   (b, e)
3144      Lisp_Object b, e;
3145 {
3146   int coding_mask, eol_type;
3147   Lisp_Object val;
3148   int beg, end;
3149
3150   validate_region (&b, &e);
3151   beg = XINT (b), end = XINT (e);
3152   if (beg < GPT && end >= GPT) move_gap (end);
3153
3154   coding_mask = detect_coding_mask (POS_ADDR (beg), end - beg);
3155   eol_type  = detect_eol_type (POS_ADDR (beg), end - beg);
3156
3157   if (coding_mask == CODING_CATEGORY_MASK_ANY)
3158     {
3159       val = Qundecided;
3160       if (eol_type != CODING_EOL_UNDECIDED
3161           && eol_type != CODING_EOL_INCONSISTENT)
3162         {
3163           Lisp_Object val2 = Fget (Qundecided, Qeol_type);
3164           if (VECTORP (val2))
3165             val = XVECTOR (val2)->contents[eol_type];
3166         }
3167     }
3168   else
3169     {
3170       Lisp_Object val2;
3171
3172       /* At first, gather possible coding-systems in VAL in a reverse
3173          order.  */
3174       val = Qnil;
3175       for (val2 = Vcoding_category_list;
3176            !NILP (val2);
3177            val2 = XCONS (val2)->cdr)
3178         {
3179           int idx
3180             = XFASTINT (Fget (XCONS (val2)->car, Qcoding_category_index));
3181           if (coding_mask & (1 << idx))
3182             {
3183 #if 0
3184               /* This code is suppressed until we find a better way to
3185                  distinguish raw text file and binary file.  */
3186
3187               if (idx == CODING_CATEGORY_IDX_RAW_TEXT
3188                   && eol_type == CODING_EOL_INCONSISTENT)
3189                 val = Fcons (Qno_conversion, val);
3190               else
3191 #endif /* 0 */
3192                 val = Fcons (Fsymbol_value (XCONS (val2)->car), val);
3193             }
3194         }
3195
3196       /* Then, change the order of the list, while getting subsidiary
3197          coding-systems.  */
3198       val2 = val;
3199       val = Qnil;
3200       if (eol_type == CODING_EOL_INCONSISTENT)
3201         eol_type == CODING_EOL_UNDECIDED;
3202       for (; !NILP (val2); val2 = XCONS (val2)->cdr)
3203         {
3204           if (eol_type == CODING_EOL_UNDECIDED)
3205             val = Fcons (XCONS (val2)->car, val);
3206           else
3207             {
3208               Lisp_Object val3 = Fget (XCONS (val2)->car, Qeol_type);
3209               if (VECTORP (val3))
3210                 val = Fcons (XVECTOR (val3)->contents[eol_type], val);
3211               else
3212                 val = Fcons (XCONS (val2)->car, val);
3213             }
3214         }
3215     }
3216
3217   return val;
3218 }
3219
3220 /* Scan text in the region between *BEGP and *ENDP, skip characters
3221    which we never have to encode to (iff ENCODEP is 1) or decode from
3222    coding system CODING at the head and tail, then set BEGP and ENDP
3223    to the addresses of start and end of the text we actually convert.  */
3224
3225 void
3226 shrink_conversion_area (begp, endp, coding, encodep)
3227      unsigned char **begp, **endp;
3228      struct coding_system *coding;
3229      int encodep;
3230 {
3231   register unsigned char *beg_addr = *begp, *end_addr = *endp;
3232
3233   if (coding->eol_type != CODING_EOL_LF
3234       && coding->eol_type != CODING_EOL_UNDECIDED)
3235     /* Since we anyway have to convert end-of-line format, it is not
3236        worth skipping at most 100 bytes or so.  */
3237     return;
3238
3239   if (encodep)                  /* for encoding */
3240     {
3241       switch (coding->type)
3242         {
3243         case coding_type_no_conversion:
3244         case coding_type_emacs_mule:
3245         case coding_type_undecided:
3246         case coding_type_raw_text:
3247           /* We need no conversion.  */
3248           *begp = *endp;
3249           return;
3250         case coding_type_ccl:
3251           /* We can't skip any data.  */
3252           return;
3253         case coding_type_iso2022:
3254           if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3255             {
3256               unsigned char *bol = beg_addr;
3257               while (beg_addr < end_addr && *beg_addr < 0x80)
3258                 {
3259                   beg_addr++;
3260                   if (*(beg_addr - 1) == '\n')
3261                     bol = beg_addr;
3262                 }
3263               beg_addr = bol;
3264               goto label_skip_tail;
3265             }
3266           /* fall down ... */
3267         default:
3268           /* We can skip all ASCII characters at the head and tail.  */
3269           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3270         label_skip_tail:
3271           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3272           break;
3273         }
3274     }
3275   else                          /* for decoding */
3276     {
3277       switch (coding->type)
3278         {
3279         case coding_type_no_conversion:
3280           /* We need no conversion.  */
3281           *begp = *endp;
3282           return;
3283         case coding_type_emacs_mule:
3284         case coding_type_raw_text:
3285           if (coding->eol_type == CODING_EOL_LF)
3286             {
3287               /* We need no conversion.  */
3288               *begp = *endp;
3289               return;
3290             }
3291           /* We can skip all but carriage-return.  */
3292           while (beg_addr < end_addr && *beg_addr != '\r') beg_addr++;
3293           while (beg_addr < end_addr && *(end_addr - 1) != '\r') end_addr--;
3294           break;
3295         case coding_type_sjis:
3296         case coding_type_big5:
3297           /* We can skip all ASCII characters at the head.  */
3298           while (beg_addr < end_addr && *beg_addr < 0x80) beg_addr++;
3299           /* We can skip all ASCII characters at the tail except for
3300              the second byte of SJIS or BIG5 code.  */
3301           while (beg_addr < end_addr && *(end_addr - 1) < 0x80) end_addr--;
3302           if (end_addr != *endp)
3303             end_addr++;
3304           break;
3305         case coding_type_ccl:
3306           /* We can't skip any data.  */
3307           return;
3308         default:                /* i.e. case coding_type_iso2022: */
3309           {
3310             unsigned char c;
3311
3312             /* We can skip all ASCII characters except for a few
3313                control codes at the head.  */
3314             while (beg_addr < end_addr && (c = *beg_addr) < 0x80
3315                    && c != ISO_CODE_CR && c != ISO_CODE_SO
3316                    && c != ISO_CODE_SI && c != ISO_CODE_ESC)
3317               beg_addr++;
3318           }
3319           break;
3320         }
3321     }
3322   *begp = beg_addr;
3323   *endp = end_addr;
3324   return;
3325 }
3326
3327 /* Encode to (iff ENCODEP is 1) or decode form coding system CODING a
3328    text between B and E.  B and E are buffer position.  */
3329
3330 Lisp_Object
3331 code_convert_region (b, e, coding, encodep)
3332      Lisp_Object b, e;
3333      struct coding_system *coding;
3334      int encodep;
3335 {
3336   int beg, end, len, consumed, produced;
3337   char *buf;
3338   unsigned char *begp, *endp;
3339   int pos = PT;
3340
3341   validate_region (&b, &e);
3342   beg = XINT (b), end = XINT (e);
3343   if (beg < GPT && end >= GPT)
3344     move_gap (end);
3345
3346   if (encodep && !NILP (coding->pre_write_conversion))
3347     {
3348       /* We must call a pre-conversion function which may put a new
3349          text to be converted in a new buffer.  */
3350       struct buffer *old = current_buffer, *new;
3351
3352       TEMP_SET_PT (beg);
3353       call2 (coding->pre_write_conversion, b, e);
3354       if (old != current_buffer)
3355         {
3356           /* Replace the original text by the text just generated.  */
3357           len = ZV - BEGV;
3358           new = current_buffer;
3359           set_buffer_internal (old);
3360           del_range (beg, end);
3361           insert_from_buffer (new, 1, len, 0);
3362           end = beg + len;
3363         }
3364     }
3365
3366   /* We may be able to shrink the conversion region.  */
3367   begp = POS_ADDR (beg); endp = begp + (end - beg);
3368   shrink_conversion_area (&begp, &endp, coding, encodep);
3369
3370   if (begp == endp)
3371     /* We need no conversion.  */
3372     len = end - beg;
3373   else
3374     {
3375       beg += begp - POS_ADDR (beg);
3376       end =  beg + (endp - begp);
3377
3378       if (encodep)
3379         len = encoding_buffer_size (coding, end - beg);
3380       else
3381         len = decoding_buffer_size (coding, end - beg);
3382       buf = get_conversion_buffer (len);
3383
3384       coding->last_block = 1;
3385       produced = (encodep
3386                   ? encode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3387                                    &consumed)
3388                   : decode_coding (coding, POS_ADDR (beg), buf, end - beg, len,
3389                                    &consumed));
3390
3391       len = produced + (beg - XINT (b)) + (XINT (e) - end);
3392
3393       TEMP_SET_PT (beg);
3394       insert (buf, produced);
3395       del_range (PT, PT + end - beg);
3396       if (pos >= end)
3397         pos = PT + (pos - end);
3398       else if (pos > beg)
3399         pos = beg;
3400       TEMP_SET_PT (pos);
3401   }
3402
3403   if (!encodep && !NILP (coding->post_read_conversion))
3404     {
3405       /* We must call a post-conversion function which may alter
3406          the text just converted.  */
3407       Lisp_Object insval;
3408
3409       beg = XINT (b);
3410       TEMP_SET_PT (beg);
3411       insval = call1 (coding->post_read_conversion, make_number (len));
3412       CHECK_NUMBER (insval, 0);
3413       len = XINT (insval);
3414     }
3415
3416   return make_number (len);
3417 }
3418
3419 Lisp_Object
3420 code_convert_string (str, coding, encodep, nocopy)
3421      Lisp_Object str, nocopy;
3422      struct coding_system *coding;
3423      int encodep;
3424 {
3425   int len, consumed, produced;
3426   char *buf;
3427   unsigned char *begp, *endp;
3428   int head_skip, tail_skip;
3429   struct gcpro gcpro1;
3430
3431   if (encodep && !NILP (coding->pre_write_conversion)
3432       || !encodep && !NILP (coding->post_read_conversion))
3433     {
3434       /* Since we have to call Lisp functions which assume target text
3435          is in a buffer, after setting a temporary buffer, call
3436          code_convert_region.  */
3437       int count = specpdl_ptr - specpdl;
3438       int len = XSTRING (str)->size;
3439       Lisp_Object result;
3440       struct buffer *old = current_buffer;
3441
3442       record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
3443       temp_output_buffer_setup (" *code-converting-work*");
3444       set_buffer_internal (XBUFFER (Vstandard_output));
3445       insert_from_string (str, 0, len, 0);
3446       code_convert_region (make_number (BEGV), make_number (ZV),
3447                            coding, encodep);
3448       result = make_buffer_string (BEGV, ZV, 0);
3449       set_buffer_internal (old);
3450       return unbind_to (count, result);
3451     }
3452
3453   /* We may be able to shrink the conversion region.  */
3454   begp = XSTRING (str)->data;
3455   endp = begp + XSTRING (str)->size;
3456   shrink_conversion_area (&begp, &endp, coding, encodep);
3457
3458   if (begp == endp)
3459     /* We need no conversion.  */
3460     return (NILP (nocopy) ? Fcopy_sequence (str) : str);
3461
3462   head_skip = begp - XSTRING (str)->data;
3463   tail_skip = XSTRING (str)->size - head_skip - (endp - begp);
3464
3465   GCPRO1 (str);
3466
3467   if (encodep)
3468     len = encoding_buffer_size (coding, endp - begp);
3469   else
3470     len = decoding_buffer_size (coding, endp - begp);
3471   buf = get_conversion_buffer (len + head_skip + tail_skip);
3472
3473   bcopy (XSTRING (str)->data, buf, head_skip);
3474   coding->last_block = 1;
3475   produced = (encodep
3476               ? encode_coding (coding, XSTRING (str)->data + head_skip,
3477                                buf + head_skip, endp - begp, len, &consumed)
3478               : decode_coding (coding, XSTRING (str)->data + head_skip,
3479                                buf + head_skip, endp - begp, len, &consumed));
3480   bcopy (XSTRING (str)->data + head_skip + (endp - begp),
3481          buf + head_skip + produced,
3482          tail_skip);
3483
3484   UNGCPRO;
3485
3486   return make_string (buf, head_skip + produced + tail_skip);
3487 }
3488
3489 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
3490        3, 3, "r\nzCoding system: ",
3491   "Decode current region by specified coding system.\n\
3492 When called from a program, takes three arguments:\n\
3493 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3494 Return length of decoded text.")
3495   (b, e, coding_system)
3496      Lisp_Object b, e, coding_system;
3497 {
3498   struct coding_system coding;
3499
3500   CHECK_NUMBER_COERCE_MARKER (b, 0);
3501   CHECK_NUMBER_COERCE_MARKER (e, 1);
3502   CHECK_SYMBOL (coding_system, 2);
3503
3504   if (NILP (coding_system))
3505     return make_number (XFASTINT (e) - XFASTINT (b));
3506   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3507     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3508
3509   return code_convert_region (b, e, &coding, 0);
3510 }
3511
3512 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
3513        3, 3, "r\nzCoding system: ",
3514   "Encode current region by specified coding system.\n\
3515 When called from a program, takes three arguments:\n\
3516 START, END, and CODING-SYSTEM.  START END are buffer positions.\n\
3517 Return length of encoded text.")
3518   (b, e, coding_system)
3519      Lisp_Object b, e, coding_system;
3520 {
3521   struct coding_system coding;
3522
3523   CHECK_NUMBER_COERCE_MARKER (b, 0);
3524   CHECK_NUMBER_COERCE_MARKER (e, 1);
3525   CHECK_SYMBOL (coding_system, 2);
3526
3527   if (NILP (coding_system))
3528     return make_number (XFASTINT (e) - XFASTINT (b));
3529   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3530     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3531
3532   return code_convert_region (b, e, &coding, 1);
3533 }
3534
3535 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
3536        2, 3, 0,
3537   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
3538 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3539 of decoding.")
3540   (string, coding_system, nocopy)
3541      Lisp_Object string, coding_system, nocopy;
3542 {
3543   struct coding_system coding;
3544
3545   CHECK_STRING (string, 0);
3546   CHECK_SYMBOL (coding_system, 1);
3547
3548   if (NILP (coding_system))
3549     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3550   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3551     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3552
3553   return code_convert_string (string, &coding, 0, nocopy);
3554 }
3555
3556 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
3557        2, 3, 0,
3558   "Encode STRING to CODING-SYSTEM, and return the result.\n\
3559 Optional arg NOCOPY non-nil means return STRING itself if there's no need\n\
3560 of encoding.")
3561   (string, coding_system, nocopy)
3562      Lisp_Object string, coding_system, nocopy;
3563 {
3564   struct coding_system coding;
3565
3566   CHECK_STRING (string, 0);
3567   CHECK_SYMBOL (coding_system, 1);
3568
3569   if (NILP (coding_system))
3570     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
3571   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
3572     error ("Invalid coding-system: %s", XSYMBOL (coding_system)->name->data);
3573
3574   return code_convert_string (string, &coding, 1, nocopy);
3575 }
3576
3577 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
3578   "Decode a JISX0208 character of shift-jis encoding.\n\
3579 CODE is the character code in SJIS.\n\
3580 Return the corresponding character.")
3581   (code)
3582      Lisp_Object code;
3583 {
3584   unsigned char c1, c2, s1, s2;
3585   Lisp_Object val;
3586
3587   CHECK_NUMBER (code, 0);
3588   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
3589   DECODE_SJIS (s1, s2, c1, c2);
3590   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset_jisx0208, c1, c2));
3591   return val;
3592 }
3593
3594 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
3595   "Encode a JISX0208 character CHAR to SJIS coding-system.\n\
3596 Return the corresponding character code in SJIS.")
3597   (ch)
3598      Lisp_Object ch;
3599 {
3600   int charset, c1, c2, s1, s2;
3601   Lisp_Object val;
3602
3603   CHECK_NUMBER (ch, 0);
3604   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3605   if (charset == charset_jisx0208)
3606     {
3607       ENCODE_SJIS (c1, c2, s1, s2);
3608       XSETFASTINT (val, (s1 << 8) | s2);
3609     }
3610   else
3611     XSETFASTINT (val, 0);
3612   return val;
3613 }
3614
3615 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
3616   "Decode a Big5 character CODE of BIG5 coding-system.\n\
3617 CODE is the character code in BIG5.\n\
3618 Return the corresponding character.")
3619   (code)
3620      Lisp_Object code;
3621 {
3622   int charset;
3623   unsigned char b1, b2, c1, c2;
3624   Lisp_Object val;
3625
3626   CHECK_NUMBER (code, 0);
3627   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
3628   DECODE_BIG5 (b1, b2, charset, c1, c2);
3629   XSETFASTINT (val, MAKE_NON_ASCII_CHAR (charset, c1, c2));
3630   return val;
3631 }
3632
3633 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
3634   "Encode the Big5 character CHAR to BIG5 coding-system.\n\
3635 Return the corresponding character code in Big5.")
3636   (ch)
3637      Lisp_Object ch;
3638 {
3639   int charset, c1, c2, b1, b2;
3640   Lisp_Object val;
3641
3642   CHECK_NUMBER (ch, 0);
3643   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
3644   if (charset == charset_big5_1 || charset == charset_big5_2)
3645     {
3646       ENCODE_BIG5 (charset, c1, c2, b1, b2);
3647       XSETFASTINT (val, (b1 << 8) | b2);
3648     }
3649   else
3650     XSETFASTINT (val, 0);
3651   return val;
3652 }
3653
3654 DEFUN ("set-terminal-coding-system-internal",
3655        Fset_terminal_coding_system_internal,
3656        Sset_terminal_coding_system_internal, 1, 1, 0, "")
3657   (coding_system)
3658      Lisp_Object coding_system;
3659 {
3660   CHECK_SYMBOL (coding_system, 0);
3661   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
3662   /* We had better not send unexpected characters to terminal.  */
3663   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
3664
3665   return Qnil;
3666 }
3667
3668 DEFUN ("set-safe-terminal-coding-system-internal",
3669        Fset_safe_terminal_coding_system_internal,
3670        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
3671   (coding_system)
3672      Lisp_Object coding_system;
3673 {
3674   CHECK_SYMBOL (coding_system, 0);
3675   setup_coding_system (Fcheck_coding_system (coding_system),
3676                        &safe_terminal_coding);
3677   return Qnil;
3678 }
3679
3680 DEFUN ("terminal-coding-system",
3681        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
3682   "Return coding-system of your terminal.")
3683   ()
3684 {
3685   return terminal_coding.symbol;
3686 }
3687
3688 DEFUN ("set-keyboard-coding-system-internal",
3689        Fset_keyboard_coding_system_internal,
3690        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
3691   (coding_system)
3692      Lisp_Object coding_system;
3693 {
3694   CHECK_SYMBOL (coding_system, 0);
3695   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
3696   return Qnil;
3697 }
3698
3699 DEFUN ("keyboard-coding-system",
3700        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
3701   "Return coding-system of what is sent from terminal keyboard.")
3702   ()
3703 {
3704   return keyboard_coding.symbol;
3705 }
3706
3707 \f
3708 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
3709        Sfind_operation_coding_system,  1, MANY, 0,
3710   "Choose a coding system for an operation based on the target name.\n\
3711 The value names a pair of coding systems: (DECODING-SYSTEM ENCODING-SYSTEM).\n\
3712 DECODING-SYSTEM is the coding system to use for decoding\n\
3713 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
3714 for encoding (in case OPERATION does encoding).\n\
3715 \n\
3716 The first argument OPERATION specifies an I/O primitive:\n\
3717   For file I/O, `insert-file-contents' or `write-region'.\n\
3718   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
3719   For network I/O, `open-network-stream'.\n\
3720 \n\
3721 The remaining arguments should be the same arguments that were passed\n\
3722 to the primitive.  Depending on which primitive, one of those arguments\n\
3723 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
3724 whichever argument specifies the file name is TARGET.\n\
3725 \n\
3726 TARGET has a meaning which depends on OPERATION:\n\
3727   For file I/O, TARGET is a file name.\n\
3728   For process I/O, TARGET is a process name.\n\
3729   For network I/O, TARGET is a service name or a port number\n\
3730 \n\
3731 This function looks up what specified for TARGET in,\n\
3732 `file-coding-system-alist', `process-coding-system-alist',\n\
3733 or `network-coding-system-alist' depending on OPERATION.\n\
3734 They may specify a coding system, a cons of coding systems,\n\
3735 or a function symbol to call.\n\
3736 In the last case, we call the function with one argument,\n\
3737 which is a list of all the arguments given to this function.")
3738   (nargs, args)
3739      int nargs;
3740      Lisp_Object *args;
3741 {
3742   Lisp_Object operation, target_idx, target, val;
3743   register Lisp_Object chain;
3744
3745   if (nargs < 2)
3746     error ("Too few arguments");
3747   operation = args[0];
3748   if (!SYMBOLP (operation)
3749       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
3750     error ("Invalid first arguement");
3751   if (nargs < 1 + XINT (target_idx))
3752     error ("Too few arguments for operation: %s",
3753            XSYMBOL (operation)->name->data);
3754   target = args[XINT (target_idx) + 1];
3755   if (!(STRINGP (target)
3756         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
3757     error ("Invalid %dth argument", XINT (target_idx) + 1);
3758
3759   chain = ((EQ (operation, Qinsert_file_contents)
3760             || EQ (operation, Qwrite_region))
3761            ? Vfile_coding_system_alist
3762            : (EQ (operation, Qopen_network_stream)
3763               ? Vnetwork_coding_system_alist
3764               : Vprocess_coding_system_alist));
3765   if (NILP (chain))
3766     return Qnil;
3767
3768   for (; CONSP (chain); chain = XCONS (chain)->cdr)
3769     {
3770       Lisp_Object elt = XCONS (chain)->car;
3771
3772       if (CONSP (elt)
3773           && ((STRINGP (target)
3774                && STRINGP (XCONS (elt)->car)
3775                && fast_string_match (XCONS (elt)->car, target) >= 0)
3776               || (INTEGERP (target) && EQ (target, XCONS (elt)->car))))
3777         {
3778           val = XCONS (elt)->cdr;
3779           if (CONSP (val))
3780             return val;
3781           if (! SYMBOLP (val))
3782             return Qnil;
3783           if (! NILP (Fcoding_system_p (val)))
3784             return Fcons (val, val);
3785           if (!NILP (Ffboundp (val)))
3786             return call1 (val, Flist (nargs, args));
3787           return Qnil;
3788         }
3789     }
3790   return Qnil;
3791 }
3792
3793 #endif /* emacs */
3794
3795 \f
3796 /*** 8. Post-amble ***/
3797
3798 init_coding_once ()
3799 {
3800   int i;
3801
3802   /* Emacs' internal format specific initialize routine.  */
3803   for (i = 0; i <= 0x20; i++)
3804     emacs_code_class[i] = EMACS_control_code;
3805   emacs_code_class[0x0A] = EMACS_linefeed_code;
3806   emacs_code_class[0x0D] = EMACS_carriage_return_code;
3807   for (i = 0x21 ; i < 0x7F; i++)
3808     emacs_code_class[i] = EMACS_ascii_code;
3809   emacs_code_class[0x7F] = EMACS_control_code;
3810   emacs_code_class[0x80] = EMACS_leading_code_composition;
3811   for (i = 0x81; i < 0xFF; i++)
3812     emacs_code_class[i] = EMACS_invalid_code;
3813   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
3814   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
3815   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
3816   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
3817
3818   /* ISO2022 specific initialize routine.  */
3819   for (i = 0; i < 0x20; i++)
3820     iso_code_class[i] = ISO_control_code;
3821   for (i = 0x21; i < 0x7F; i++)
3822     iso_code_class[i] = ISO_graphic_plane_0;
3823   for (i = 0x80; i < 0xA0; i++)
3824     iso_code_class[i] = ISO_control_code;
3825   for (i = 0xA1; i < 0xFF; i++)
3826     iso_code_class[i] = ISO_graphic_plane_1;
3827   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
3828   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
3829   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
3830   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
3831   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
3832   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
3833   iso_code_class[ISO_CODE_ESC] = ISO_escape;
3834   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
3835   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
3836   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
3837
3838   conversion_buffer_size = MINIMUM_CONVERSION_BUFFER_SIZE;
3839   conversion_buffer = (char *) xmalloc (MINIMUM_CONVERSION_BUFFER_SIZE);
3840
3841   setup_coding_system (Qnil, &keyboard_coding);
3842   setup_coding_system (Qnil, &terminal_coding);
3843   setup_coding_system (Qnil, &safe_terminal_coding);
3844
3845 #if defined (MSDOS) || defined (WINDOWSNT)
3846   system_eol_type = CODING_EOL_CRLF;
3847 #else
3848   system_eol_type = CODING_EOL_LF;
3849 #endif
3850 }
3851
3852 #ifdef emacs
3853
3854 syms_of_coding ()
3855 {
3856   Qtarget_idx = intern ("target-idx");
3857   staticpro (&Qtarget_idx);
3858
3859   /* Target FILENAME is the first argument.  */
3860   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
3861   /* Target FILENAME is the third argument.  */
3862   Fput (Qwrite_region, Qtarget_idx, make_number (2));
3863
3864   Qcall_process = intern ("call-process");
3865   staticpro (&Qcall_process);
3866   /* Target PROGRAM is the first argument.  */
3867   Fput (Qcall_process, Qtarget_idx, make_number (0));
3868
3869   Qcall_process_region = intern ("call-process-region");
3870   staticpro (&Qcall_process_region);
3871   /* Target PROGRAM is the third argument.  */
3872   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
3873
3874   Qstart_process = intern ("start-process");
3875   staticpro (&Qstart_process);
3876   /* Target PROGRAM is the third argument.  */
3877   Fput (Qstart_process, Qtarget_idx, make_number (2));
3878
3879   Qopen_network_stream = intern ("open-network-stream");
3880   staticpro (&Qopen_network_stream);
3881   /* Target SERVICE is the fourth argument.  */
3882   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
3883
3884   Qcoding_system = intern ("coding-system");
3885   staticpro (&Qcoding_system);
3886
3887   Qeol_type = intern ("eol-type");
3888   staticpro (&Qeol_type);
3889
3890   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
3891   staticpro (&Qbuffer_file_coding_system);
3892
3893   Qpost_read_conversion = intern ("post-read-conversion");
3894   staticpro (&Qpost_read_conversion);
3895
3896   Qpre_write_conversion = intern ("pre-write-conversion");
3897   staticpro (&Qpre_write_conversion);
3898
3899   Qno_conversion = intern ("no-conversion");
3900   staticpro (&Qno_conversion);
3901
3902   Qundecided = intern ("undecided");
3903   staticpro (&Qundecided);
3904
3905   Qcoding_system_spec = intern ("coding-system-spec");
3906   staticpro (&Qcoding_system_spec);
3907
3908   Qcoding_system_p = intern ("coding-system-p");
3909   staticpro (&Qcoding_system_p);
3910
3911   Qcoding_system_error = intern ("coding-system-error");
3912   staticpro (&Qcoding_system_error);
3913
3914   Fput (Qcoding_system_error, Qerror_conditions,
3915         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
3916   Fput (Qcoding_system_error, Qerror_message,
3917         build_string ("Invalid coding system"));
3918
3919   Qcoding_category_index = intern ("coding-category-index");
3920   staticpro (&Qcoding_category_index);
3921
3922   {
3923     int i;
3924     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3925       {
3926         coding_category_table[i] = intern (coding_category_name[i]);
3927         staticpro (&coding_category_table[i]);
3928         Fput (coding_category_table[i], Qcoding_category_index,
3929               make_number (i));
3930       }
3931   }
3932
3933   Qcharacter_unification_table = intern ("character-unification-table");
3934   staticpro (&Qcharacter_unification_table);
3935   Fput (Qcharacter_unification_table, Qchar_table_extra_slots,
3936         make_number (0));
3937
3938   Qcharacter_unification_table_for_decode
3939     = intern ("character-unification-table-for-decode");
3940   staticpro (&Qcharacter_unification_table_for_decode);
3941
3942   Qcharacter_unification_table_for_encode
3943     = intern ("character-unification-table-for-encode");
3944   staticpro (&Qcharacter_unification_table_for_encode);
3945
3946   Qemacs_mule = intern ("emacs-mule");
3947   staticpro (&Qemacs_mule);
3948
3949   defsubr (&Scoding_system_spec);
3950   defsubr (&Scoding_system_p);
3951   defsubr (&Sread_coding_system);
3952   defsubr (&Sread_non_nil_coding_system);
3953   defsubr (&Scheck_coding_system);
3954   defsubr (&Sdetect_coding_region);
3955   defsubr (&Sdecode_coding_region);
3956   defsubr (&Sencode_coding_region);
3957   defsubr (&Sdecode_coding_string);
3958   defsubr (&Sencode_coding_string);
3959   defsubr (&Sdecode_sjis_char);
3960   defsubr (&Sencode_sjis_char);
3961   defsubr (&Sdecode_big5_char);
3962   defsubr (&Sencode_big5_char);
3963   defsubr (&Sset_terminal_coding_system_internal);
3964   defsubr (&Sset_safe_terminal_coding_system_internal);
3965   defsubr (&Sterminal_coding_system);
3966   defsubr (&Sset_keyboard_coding_system_internal);
3967   defsubr (&Skeyboard_coding_system);
3968   defsubr (&Sfind_operation_coding_system);
3969
3970   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
3971     "List of coding-categories (symbols) ordered by priority.");
3972   {
3973     int i;
3974
3975     Vcoding_category_list = Qnil;
3976     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
3977       Vcoding_category_list
3978         = Fcons (coding_category_table[i], Vcoding_category_list);
3979   }
3980
3981   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
3982     "Specify the coding system for read operations.\n\
3983 It is useful to bind this variable with `let', but do not set it globally.\n\
3984 If the value is a coding system, it is used for decoding on read operation.\n\
3985 If not, an appropriate element is used from one of the coding system alists:\n\
3986 There are three such tables, `file-coding-system-alist',\n\
3987 `process-coding-system-alist', and `network-coding-system-alist'.");
3988   Vcoding_system_for_read = Qnil;
3989
3990   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
3991     "Specify the coding system for write operations.\n\
3992 It is useful to bind this variable with `let', but do not set it globally.\n\
3993 If the value is a coding system, it is used for encoding on write operation.\n\
3994 If not, an appropriate element is used from one of the coding system alists:\n\
3995 There are three such tables, `file-coding-system-alist',\n\
3996 `process-coding-system-alist', and `network-coding-system-alist'.");
3997   Vcoding_system_for_write = Qnil;
3998
3999   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
4000     "Coding system used in the latest file or process I/O.");
4001   Vlast_coding_system_used = Qnil;
4002
4003   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
4004     "*Non-nil inhibit code conversion of end-of-line format in any cases.");
4005   inhibit_eol_conversion = 0;
4006
4007   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
4008     "Alist to decide a coding system to use for a file I/O operation.\n\
4009 The format is ((PATTERN . VAL) ...),\n\
4010 where PATTERN is a regular expression matching a file name,\n\
4011 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4012 If VAL is a coding system, it is used for both decoding and encoding\n\
4013 the file contents.\n\
4014 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4015 and the cdr part is used for encoding.\n\
4016 If VAL is a function symbol, the function must return a coding system\n\
4017 or a cons of coding systems which are used as above.\n\
4018 \n\
4019 See also the function `find-operation-coding-system'.");
4020   Vfile_coding_system_alist = Qnil;
4021
4022   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
4023     "Alist to decide a coding system to use for a process I/O operation.\n\
4024 The format is ((PATTERN . VAL) ...),\n\
4025 where PATTERN is a regular expression matching a program name,\n\
4026 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4027 If VAL is a coding system, it is used for both decoding what received\n\
4028 from the program and encoding what sent to the program.\n\
4029 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4030 and the cdr part is used for encoding.\n\
4031 If VAL is a function symbol, the function must return a coding system\n\
4032 or a cons of coding systems which are used as above.\n\
4033 \n\
4034 See also the function `find-operation-coding-system'.");
4035   Vprocess_coding_system_alist = Qnil;
4036
4037   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
4038     "Alist to decide a coding system to use for a network I/O operation.\n\
4039 The format is ((PATTERN . VAL) ...),\n\
4040 where PATTERN is a regular expression matching a network service name\n\
4041 or is a port number to connect to,\n\
4042 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
4043 If VAL is a coding system, it is used for both decoding what received\n\
4044 from the network stream and encoding what sent to the network stream.\n\
4045 If VAL is a cons of coding systems, the car part is used for decoding,\n\
4046 and the cdr part is used for encoding.\n\
4047 If VAL is a function symbol, the function must return a coding system\n\
4048 or a cons of coding systems which are used as above.\n\
4049 \n\
4050 See also the function `find-operation-coding-system'.");
4051   Vnetwork_coding_system_alist = Qnil;
4052
4053   DEFVAR_INT ("eol-mnemonic-unix", &eol_mnemonic_unix,
4054     "Mnemonic character indicating UNIX-like end-of-line format (i.e. LF) .");
4055   eol_mnemonic_unix = ':';
4056
4057   DEFVAR_INT ("eol-mnemonic-dos", &eol_mnemonic_dos,
4058     "Mnemonic character indicating DOS-like end-of-line format (i.e. CRLF).");
4059   eol_mnemonic_dos = '\\';
4060
4061   DEFVAR_INT ("eol-mnemonic-mac", &eol_mnemonic_mac,
4062     "Mnemonic character indicating MAC-like end-of-line format (i.e. CR).");
4063   eol_mnemonic_mac = '/';
4064
4065   DEFVAR_INT ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
4066     "Mnemonic character indicating end-of-line format is not yet decided.");
4067   eol_mnemonic_undecided = ':';
4068
4069   DEFVAR_LISP ("enable-character-unification", &Venable_character_unification,
4070     "Non-nil means ISO 2022 encoder/decoder do character unification.");
4071   Venable_character_unification = Qt;
4072
4073   DEFVAR_LISP ("standard-character-unification-table-for-decode",
4074     &Vstandard_character_unification_table_for_decode,
4075     "Table for unifying characters when reading.");
4076   Vstandard_character_unification_table_for_decode = Qnil;
4077
4078   DEFVAR_LISP ("standard-character-unification-table-for-encode",
4079     &Vstandard_character_unification_table_for_encode,
4080     "Table for unifying characters when writing.");
4081   Vstandard_character_unification_table_for_encode = Qnil;
4082
4083   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
4084     "Alist of charsets vs revision numbers.\n\
4085 While encoding, if a charset (car part of an element) is found,\n\
4086 designate it with the escape sequence identifing revision (cdr part of the element).");
4087   Vcharset_revision_alist = Qnil;
4088
4089   DEFVAR_LISP ("default-process-coding-system",
4090                &Vdefault_process_coding_system,
4091     "Cons of coding systems used for process I/O by default.\n\
4092 The car part is used for decoding a process output,\n\
4093 the cdr part is used for encoding a text to be sent to a process.");
4094   Vdefault_process_coding_system = Qnil;
4095
4096   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
4097     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
4098 This is a vector of length 256.\n\
4099 If Nth element is non-nil, the existence of code N in a file\n\
4100 (or output of subprocess) doesn't prevent it to be detected as\n\
4101 a coding system of ISO 2022 variant which has a flag\n\
4102 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
4103 or reading output of a subprocess.\n\
4104 Only 128th through 159th elements has a meaning.");
4105   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
4106 }
4107
4108 #endif /* emacs */