src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs; see the file COPYING.  If not, write to
  19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.  */
  21
  22 /*** TABLE OF CONTENTS ***
  23
  24   0. General comments
  25   1. Preamble
  26   2. Emacs' internal format (emacs-mule) handlers
  27   3. ISO2022 handlers
  28   4. Shift-JIS and BIG5 handlers
  29   5. CCL handlers
  30   6. End-of-line handlers
  31   7. C library functions
  32   8. Emacs Lisp library functions
  33   9. Post-amble
  34
  35 */
  36
  37 /*** 0. General comments ***/
  38
  39
  40 /*** GENERAL NOTE on CODING SYSTEM ***
  41
  42   Coding system is an encoding mechanism of one or more character
  43   sets.  Here's a list of coding systems which Emacs can handle.  When
  44   we say "decode", it means converting some other coding system to
  45   Emacs' internal format (emacs-mule), and when we say "encode",
  46   it means converting the coding system emacs-mule to some other
  47   coding system.
  48
  49   0. Emacs' internal format (emacs-mule)
  50
  51   Emacs itself holds a multi-lingual character in a buffer and a string
  52   in a special format.  Details are described in section 2.
  53
  54   1. ISO2022
  55
  56   The most famous coding system for multiple character sets.  X's
  57   Compound Text, various EUCs (Extended Unix Code), and coding
  58   systems used in Internet communication such as ISO-2022-JP are
  59   all variants of ISO2022.  Details are described in section 3.
  60
  61   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  62
  63   A coding system to encode character sets: ASCII, JISX0201, and
  64   JISX0208.  Widely used for PC's in Japan.  Details are described in
  65   section 4.
  66
  67   3. BIG5
  68
  69   A coding system to encode character sets: ASCII and Big5.  Widely
  70   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  71   described in section 4.  In this file, when we write "BIG5"
  72   (all uppercase), we mean the coding system, and when we write
  73   "Big5" (capitalized), we mean the character set.
  74
  75   4. Raw text
  76
  77   A coding system for a text containing random 8-bit code.  Emacs does
  78   no code conversion on such a text except for end-of-line format.
  79
  80   5. Other
  81
  82   If a user wants to read/write a text encoded in a coding system not
  83   listed above, he can supply a decoder and an encoder for it in CCL
  84   (Code Conversion Language) programs.  Emacs executes the CCL program
  85   while reading/writing.
  86
  87   Emacs represents a coding system by a Lisp symbol that has a property
  88   `coding-system'.  But, before actually using the coding system, the
  89   information about it is set in a structure of type `struct
  90   coding_system' for rapid processing.  See section 6 for more details.
  91
  92 */
  93
  94 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  95
  96   How end-of-line of a text is encoded depends on a system.  For
  97   instance, Unix's format is just one byte of `line-feed' code,
  98   whereas DOS's format is two-byte sequence of `carriage-return' and
  99   `line-feed' codes.  MacOS's format is usually one byte of
 100   `carriage-return'.
 101
 102   Since text characters encoding and end-of-line encoding are
 103   independent, any coding system described above can take
 104   any format of end-of-line.  So, Emacs has information of format of
 105   end-of-line in each coding-system.  See section 6 for more details.
 106
 107 */
 108
 109 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 110
 111   These functions check if a text between SRC and SRC_END is encoded
 112   in the coding system category XXX.  Each returns an integer value in
 113   which appropriate flag bits for the category XXX are set.  The flag
 114   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 115   template of these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 116   of the range 0x80..0x9F are in multibyte form.  */
 117 #if 0
 118 int
 119 detect_coding_emacs_mule (src, src_end, multibytep)
 120      unsigned char *src, *src_end;
 121      int multibytep;
 122 {
 123   ...
 124 }
 125 #endif
 126
 127 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 128
 129   These functions decode SRC_BYTES length of unibyte text at SOURCE
 130   encoded in CODING to Emacs' internal format.  The resulting
 131   multibyte text goes to a place pointed to by DESTINATION, the length
 132   of which should not exceed DST_BYTES.
 133
 134   These functions set the information of original and decoded texts in
 135   the members produced, produced_char, consumed, and consumed_char of
 136   the structure *CODING.  They also set the member result to one of
 137   CODING_FINISH_XXX indicating how the decoding finished.
 138
 139   DST_BYTES zero means that source area and destination area are
 140   overlapped, which means that we can produce a decoded text until it
 141   reaches the head of not-yet-decoded source text.
 142
 143   Below is a template of these functions.  */
 144 #if 0
 145 static void
 146 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 147      struct coding_system *coding;
 148      unsigned char *source, *destination;
 149      int src_bytes, dst_bytes;
 150 {
 151   ...
 152 }
 153 #endif
 154
 155 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 156
 157   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 158   internal multibyte format to CODING.  The resulting unibyte text
 159   goes to a place pointed to by DESTINATION, the length of which
 160   should not exceed DST_BYTES.
 161
 162   These functions set the information of original and encoded texts in
 163   the members produced, produced_char, consumed, and consumed_char of
 164   the structure *CODING.  They also set the member result to one of
 165   CODING_FINISH_XXX indicating how the encoding finished.
 166
 167   DST_BYTES zero means that source area and destination area are
 168   overlapped, which means that we can produce an encoded text until it
 169   reaches the head of not-yet-encoded source text.
 170
 171   Below is a template of these functions.  */
 172 #if 0
 173 static void
 174 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 175      struct coding_system *coding;
 176      unsigned char *source, *destination;
 177      int src_bytes, dst_bytes;
 178 {
 179   ...
 180 }
 181 #endif
 182
 183 /*** COMMONLY USED MACROS ***/
 184
 185 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 186    get one and two bytes from the source text respectively.
 187    If there are not enough bytes in the source, they jump to
 188    `label_end_of_loop'.  The caller should set variables `coding',
 189    `src' and `src_end' to appropriate pointer in advance.  These
 190    macros are called from decoding routines `decode_coding_XXX', thus
 191    it is assumed that the source text is unibyte.  */
 192
 193 #define ONE_MORE_BYTE(c1)                                       \
 194   do {                                                          \
 195     if (src >= src_end)                                         \
 196       {                                                         \
 197         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 198         goto label_end_of_loop;                                 \
 199       }                                                         \
 200     c1 = *src++;                                                \
 201   } while (0)
 202
 203 #define TWO_MORE_BYTES(c1, c2)                                  \
 204   do {                                                          \
 205     if (src + 1 >= src_end)                                     \
 206       {                                                         \
 207         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 208         goto label_end_of_loop;                                 \
 209       }                                                         \
 210     c1 = *src++;                                                \
 211     c2 = *src++;                                                \
 212   } while (0)
 213
 214
 215 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 216    form if MULTIBYTEP is nonzero.  */
 217
 218 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 219   do {                                                          \
 220     if (src >= src_end)                                         \
 221       {                                                         \
 222         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 223         goto label_end_of_loop;                                 \
 224       }                                                         \
 225     c1 = *src++;                                                \
 226     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 227       c1 = *src++ - 0x20;                                       \
 228   } while (0)
 229
 230 /* Set C to the next character at the source text pointed by `src'.
 231    If there are not enough characters in the source, jump to
 232    `label_end_of_loop'.  The caller should set variables `coding'
 233    `src', `src_end', and `translation_table' to appropriate pointers
 234    in advance.  This macro is used in encoding routines
 235    `encode_coding_XXX', thus it assumes that the source text is in
 236    multibyte form except for 8-bit characters.  8-bit characters are
 237    in multibyte form if coding->src_multibyte is nonzero, else they
 238    are represented by a single byte.  */
 239
 240 #define ONE_MORE_CHAR(c)                                        \
 241   do {                                                          \
 242     int len = src_end - src;                                    \
 243     int bytes;                                                  \
 244     if (len <= 0)                                               \
 245       {                                                         \
 246         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 247         goto label_end_of_loop;                                 \
 248       }                                                         \
 249     if (coding->src_multibyte                                   \
 250         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 251       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 252     else                                                        \
 253       c = *src, bytes = 1;                                      \
 254     if (!NILP (translation_table))                              \
 255       c = translate_char (translation_table, c, -1, 0, 0);      \
 256     src += bytes;                                               \
 257   } while (0)
 258
 259
/* Produce a multibyte form of character C to `dst'.  Jump to
   `label_end_of_loop', with coding->result set to
   CODING_FINISH_INSUFFICIENT_DST, if there's not enough space at
   `dst'.  The space limit is `dst_end' when `dst_bytes' is nonzero,
   otherwise the current `src' (source and destination overlap).

   If we are now in the middle of composition sequence, the decoded
   character may be ALTCHAR (for the current composition).  In that
   case, the character goes to coding->cmp_data->data instead of
   `dst'.

   More precisely, C is written to `dst' (and coding->produced_char
   is incremented) only when no composition is in progress or the
   composition is COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE.
   C is handed to CODING_ADD_COMPOSITION_COMPONENT whenever a
   composition other than COMPOSITION_RELATIVE is in progress, and
   coding->composition_rule_follows is then set to nonzero unless the
   composition is COMPOSITION_WITH_ALTCHARS.

   The caller must have `coding', `src', `dst', `dst_end' and
   `dst_bytes' in scope.  C is expanded more than once, so it should
   not have side effects.

   This macro is used in decoding routines.  */

#define EMIT_CHAR(c)                                                    \
  do {                                                                  \
    if (! COMPOSING_P (coding)                                          \
        || coding->composing == COMPOSITION_RELATIVE                    \
        || coding->composing == COMPOSITION_WITH_RULE)                  \
      {                                                                 \
        int bytes = CHAR_BYTES (c);                                     \
        if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
          {                                                             \
            coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
            goto label_end_of_loop;                                     \
          }                                                             \
        dst += CHAR_STRING (c, dst);                                    \
        coding->produced_char++;                                        \
      }                                                                 \
                                                                        \
    if (COMPOSING_P (coding)                                            \
        && coding->composing != COMPOSITION_RELATIVE)                   \
      {                                                                 \
        CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
        coding->composition_rule_follows                                \
          = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
      }                                                                 \
  } while (0)
 294
 295
 296 #define EMIT_ONE_BYTE(c)                                        \
 297   do {                                                          \
 298     if (dst >= (dst_bytes ? dst_end : src))                     \
 299       {                                                         \
 300         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 301         goto label_end_of_loop;                                 \
 302       }                                                         \
 303     *dst++ = c;                                                 \
 304   } while (0)
 305
 306 #define EMIT_TWO_BYTES(c1, c2)                                  \
 307   do {                                                          \
 308     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 309       {                                                         \
 310         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 311         goto label_end_of_loop;                                 \
 312       }                                                         \
 313     *dst++ = c1, *dst++ = c2;                                   \
 314   } while (0)
 315
 316 #define EMIT_BYTES(from, to)                                    \
 317   do {                                                          \
 318     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 319       {                                                         \
 320         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 321         goto label_end_of_loop;                                 \
 322       }                                                         \
 323     while (from < to)                                           \
 324       *dst++ = *from++;                                         \
 325   } while (0)
 326
 327 \f
 328 /*** 1. Preamble ***/
 329
 330 #ifdef emacs
 331 #include <config.h>
 332 #endif
 333
 334 #include <stdio.h>
 335
 336 #ifdef emacs
 337
 338 #include "lisp.h"
 339 #include "buffer.h"
 340 #include "charset.h"
 341 #include "composite.h"
 342 #include "ccl.h"
 343 #include "coding.h"
 344 #include "window.h"
 345
 346 #else  /* not emacs */
 347
 348 #include "mulelib.h"
 349
 350 #endif /* not emacs */
 351
 352 Lisp_Object Qcoding_system, Qeol_type;
 353 Lisp_Object Qbuffer_file_coding_system;
 354 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 355 Lisp_Object Qno_conversion, Qundecided;
 356 Lisp_Object Qcoding_system_history;
 357 Lisp_Object Qsafe_chars;
 358 Lisp_Object Qvalid_codes;
 359
 360 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 361 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 362 Lisp_Object Qstart_process, Qopen_network_stream;
 363 Lisp_Object Qtarget_idx;
 364
 365 Lisp_Object Vselect_safe_coding_system_function;
 366
 367 /* Mnemonic string for each format of end-of-line.  */
 368 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 369 /* Mnemonic string to indicate format of end-of-line is not yet
 370    decided.  */
 371 Lisp_Object eol_mnemonic_undecided;
 372
 373 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 374    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 375 int system_eol_type;
 376
 377 #ifdef emacs
 378
 379 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 380
 381 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 382
 383 /* Coding system emacs-mule and raw-text are for converting only
 384    end-of-line format.  */
 385 Lisp_Object Qemacs_mule, Qraw_text;
 386
 387 /* Coding-systems are handed between Emacs Lisp programs and C internal
 388    routines by the following three variables.  */
 389 /* Coding-system for reading files and receiving data from process.  */
 390 Lisp_Object Vcoding_system_for_read;
 391 /* Coding-system for writing files and sending data to process.  */
 392 Lisp_Object Vcoding_system_for_write;
 393 /* Coding-system actually used in the latest I/O.  */
 394 Lisp_Object Vlast_coding_system_used;
 395
 396 /* A vector of length 256 which contains information about special
 397    Latin codes (especially for dealing with Microsoft codes).  */
 398 Lisp_Object Vlatin_extra_code_table;
 399
 400 /* Flag to inhibit code conversion of end-of-line format.  */
 401 int inhibit_eol_conversion;
 402
 403 /* Flag to inhibit ISO2022 escape sequence detection.  */
 404 int inhibit_iso_escape_detection;
 405
 406 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 407 int inherit_process_coding_system;
 408
 409 /* Coding system to be used to encode text for terminal display.  */
 410 struct coding_system terminal_coding;
 411
 412 /* Coding system to be used to encode text for terminal display when
 413    terminal coding system is nil.  */
 414 struct coding_system safe_terminal_coding;
 415
 416 /* Coding system of what is sent from terminal keyboard.  */
 417 struct coding_system keyboard_coding;
 418
 419 /* Default coding system to be used to write a file.  */
 420 struct coding_system default_buffer_file_coding;
 421
 422 Lisp_Object Vfile_coding_system_alist;
 423 Lisp_Object Vprocess_coding_system_alist;
 424 Lisp_Object Vnetwork_coding_system_alist;
 425
 426 Lisp_Object Vlocale_coding_system;
 427
 428 #endif /* emacs */
 429
 430 Lisp_Object Qcoding_category, Qcoding_category_index;
 431
 432 /* List of symbols `coding-category-xxx' ordered by priority.  */
 433 Lisp_Object Vcoding_category_list;
 434
 435 /* Table of coding categories (Lisp symbols).  */
 436 Lisp_Object Vcoding_category_table;
 437
/* Table of names of symbol for each coding-category.  It has
   CODING_CATEGORY_IDX_MAX slots.
   NOTE(review): the entries presumably have to stay in the same order
   as the CODING_CATEGORY_IDX_XXX indices declared elsewhere -- confirm
   before reordering or inserting an entry.  */
char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  "coding-category-emacs-mule",
  "coding-category-sjis",
  "coding-category-iso-7",
  "coding-category-iso-7-tight",
  "coding-category-iso-8-1",
  "coding-category-iso-8-2",
  "coding-category-iso-7-else",
  "coding-category-iso-8-else",
  "coding-category-ccl",
  "coding-category-big5",
  "coding-category-utf-8",
  "coding-category-utf-16-be",
  "coding-category-utf-16-le",
  "coding-category-raw-text",
  "coding-category-binary"
};
 456
 457 /* Table of pointers to coding systems corresponding to each coding
 458    categories.  */
 459 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 460
 461 /* Table of coding category masks.  Nth element is a mask for a coding
 462    category whose priority is Nth.  */
 463 static
 464 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 465
 466 /* Flag to tell if we look up translation table on character code
 467    conversion.  */
 468 Lisp_Object Venable_character_translation;
 469 /* Standard translation table to look up on decoding (reading).  */
 470 Lisp_Object Vstandard_translation_table_for_decode;
 471 /* Standard translation table to look up on encoding (writing).  */
 472 Lisp_Object Vstandard_translation_table_for_encode;
 473
 474 Lisp_Object Qtranslation_table;
 475 Lisp_Object Qtranslation_table_id;
 476 Lisp_Object Qtranslation_table_for_decode;
 477 Lisp_Object Qtranslation_table_for_encode;
 478
 479 /* Alist of charsets vs revision number.  */
 480 Lisp_Object Vcharset_revision_alist;
 481
 482 /* Default coding systems used for process I/O.  */
 483 Lisp_Object Vdefault_process_coding_system;
 484
 485 /* Global flag to tell that we can't call post-read-conversion and
 486    pre-write-conversion functions.  Usually the value is zero, but it
 487    is set to 1 temporarily while such functions are running.  This is
 488    to avoid infinite recursive call.  */
 489 static int inhibit_pre_post_conversion;
 490
 491 /* Char-table containing safe coding systems of each character.  */
 492 Lisp_Object Vchar_coding_system_table;
 493 Lisp_Object Qchar_coding_system;
 494
 495 /* Return `safe-chars' property of coding system CODING.  Don't check
 496    validity of CODING.  */
 497
 498 Lisp_Object
 499 coding_safe_chars (coding)
 500      struct coding_system *coding;
 501 {
 502   Lisp_Object coding_spec, plist, safe_chars;
 503
 504   coding_spec = Fget (coding->symbol, Qcoding_system);
 505   plist = XVECTOR (coding_spec)->contents[3];
 506   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 507   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 508 }
 509
/* Nonzero if SAFE_CHARS is Qt, or if the element for character C in
   SAFE_CHARS (looked up with CHAR_TABLE_REF) is non-nil.  SAFE_CHARS
   is expanded twice, so it should not have side effects.  */
#define CODING_SAFE_CHAR_P(safe_chars, c) \
  (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 512
 513 \f
 514 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 515
 516 /* Emacs' internal format for encoding multiple character sets is a
 517    kind of multi-byte encoding, i.e. characters are encoded by
 518    variable-length sequences of one-byte codes.
 519
 520    ASCII characters and control characters (e.g. `tab', `newline') are
 521    represented by one-byte sequences which are their ASCII codes, in
 522    the range 0x00 through 0x7F.
 523
 524    8-bit characters of the range 0x80..0x9F are represented by
 525    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 526    code + 0x20).
 527
 528    8-bit characters of the range 0xA0..0xFF are represented by
 529    one-byte sequences which are their 8-bit code.
 530
 531    The other characters are represented by a sequence of `base
 532    leading-code', optional `extended leading-code', and one or two
 533    `position-code's.  The length of the sequence is determined by the
 534    base leading-code.  Leading-code takes the range 0x80 through 0x9F,
 535    whereas extended leading-code and position-code take the range 0xA0
 536    through 0xFF.  See `charset.h' for more details about leading-code
 537    and position-code.
 538
 539    --- CODE RANGE of Emacs' internal format ---
 540    character set        range
 541    -------------        -----
 542    ascii                0x00..0x7F
 543    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 544    eight-bit-graphic    0xA0..0xFF
 545    ELSE                 0x81..0x9F + [0xA0..0xFF]+
 546    ---------------------------------------------
 547
 548   */
 549
 550 enum emacs_code_class_type emacs_code_class[256];
 551
 552 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 553    Check if a text is encoded in Emacs' internal format.  If it is,
 554    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 555
 556 static int
 557 detect_coding_emacs_mule (src, src_end, multibytep)
 558       unsigned char *src, *src_end;
 559       int multibytep;
 560 {
 561   unsigned char c;
 562   int composing = 0;
 563   /* Dummy for ONE_MORE_BYTE.  */
 564   struct coding_system dummy_coding;
 565   struct coding_system *coding = &dummy_coding;
 566
 567   while (1)
 568     {
 569       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 570
 571       if (composing)
 572         {
 573           if (c < 0xA0)
 574             composing = 0;
 575           else if (c == 0xA0)
 576             {
 577               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 578               c &= 0x7F;
 579             }
 580           else
 581             c -= 0x20;
 582         }
 583
 584       if (c < 0x20)
 585         {
 586           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 587             return 0;
 588         }
 589       else if (c >= 0x80 && c < 0xA0)
 590         {
 591           if (c == 0x80)
 592             /* Old leading code for a composite character.  */
 593             composing = 1;
 594           else
 595             {
 596               unsigned char *src_base = src - 1;
 597               int bytes;
 598
 599               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 600                                                bytes))
 601                 return 0;
 602               src = src_base + bytes;
 603             }
 604         }
 605     }
 606  label_end_of_loop:
 607   return CODING_CATEGORY_MASK_EMACS_MULE;
 608 }
 609
 610
 611 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 612
 613 static void
 614 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 615      struct coding_system *coding;
 616      unsigned char *source, *destination;
 617      int src_bytes, dst_bytes;
 618 {
 619   unsigned char *src = source;
 620   unsigned char *src_end = source + src_bytes;
 621   unsigned char *dst = destination;
 622   unsigned char *dst_end = destination + dst_bytes;
 623   /* SRC_BASE remembers the start position in source in each loop.
 624      The loop will be exited when there's not enough source code, or
 625      when there's not enough destination area to produce a
 626      character.  */
 627   unsigned char *src_base;
 628
 629   coding->produced_char = 0;
 630   while ((src_base = src) < src_end)
 631     {
 632       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 633       int bytes;
 634
 635       if (*src == '\r')
 636         {
 637           int c = *src++;
 638
 639           if (coding->eol_type == CODING_EOL_CR)
 640             c = '\n';
 641           else if (coding->eol_type == CODING_EOL_CRLF)
 642             {
 643               ONE_MORE_BYTE (c);
 644               if (c != '\n')
 645                 {
 646                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 647                     {
 648                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
 649                       goto label_end_of_loop;
 650                     }
 651                   src--;
 652                   c = '\r';
 653                 }
 654             }
 655           *dst++ = c;
 656           coding->produced_char++;
 657           continue;
 658         }
 659       else if (*src == '\n')
 660         {
 661           if ((coding->eol_type == CODING_EOL_CR
 662                || coding->eol_type == CODING_EOL_CRLF)
 663               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 664             {
 665               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 666               goto label_end_of_loop;
 667             }
 668           *dst++ = *src++;
 669           coding->produced_char++;
 670           continue;
 671         }
 672       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes))
 673         {
 674           p = src;
 675           src += bytes;
 676         }
 677       else
 678         {
 679           bytes = CHAR_STRING (*src, tmp);
 680           p = tmp;
 681           src++;
 682         }
 683       if (dst + bytes >= (dst_bytes ? dst_end : src))
 684         {
 685           coding->result = CODING_FINISH_INSUFFICIENT_DST;
 686           break;
 687         }
 688       while (bytes--) *dst++ = *p++;
 689       coding->produced_char++;
 690     }
 691  label_end_of_loop:
 692   coding->consumed = coding->consumed_char = src_base - source;
 693   coding->produced = dst - destination;
 694 }
 695
/* Encoding to emacs-mule is delegated unchanged to encode_eol, with
   the same arguments in the same order.  */
#define encode_coding_emacs_mule(coding, source, destination, src_bytes, dst_bytes) \
  encode_eol (coding, source, destination, src_bytes, dst_bytes)
 698
 699
 700 \f
 701 /*** 3. ISO2022 handlers ***/
 702
 703 /* The following note describes the coding system ISO2022 briefly.
 704    Since the intention of this note is to help understand the
 705    functions in this file, some parts are NOT ACCURATE or OVERLY
 706    SIMPLIFIED.  For thorough understanding, please refer to the
 707    original document of ISO2022.
 708
 709    ISO2022 provides many mechanisms to encode several character sets
 710    in 7-bit and 8-bit environments.  For 7-bit environments, all text
 711    is encoded using bytes less than 128.  This may make the encoded
 712    text a little bit longer, but the text passes more easily through
 713    several gateways, some of which strip off MSB (Most Significant Bit).
 714
 715    There are two kinds of character sets: control character set and
 716    graphic character set.  The former contains control characters such
 717    as `newline' and `escape' to provide control functions (control
 718    functions are also provided by escape sequences).  The latter
 719    contains graphic characters such as 'A' and '-'.  Emacs recognizes
 720    two control character sets and many graphic character sets.
 721
 722    Graphic character sets are classified into one of the following
 723    four classes, according to the number of bytes (DIMENSION) and
 724    number of characters in one dimension (CHARS) of the set:
 725    - DIMENSION1_CHARS94
 726    - DIMENSION1_CHARS96
 727    - DIMENSION2_CHARS94
 728    - DIMENSION2_CHARS96
 729
 730    In addition, each character set is assigned an identification tag,
 731    unique for each set, called "final character" (denoted as <F>
 732    hereafter).  The <F> of each character set is decided by ECMA(*)
 733    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
 734    (0x30..0x3F are for private use only).
 735
 736    Note (*): ECMA = European Computer Manufacturers Association
 737
 738    Here are examples of graphic character set [NAME(<F>)]:
 739         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
 740         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
 741         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
 742         o DIMENSION2_CHARS96 -- none for the moment
 743
 744    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
 745         C0 [0x00..0x1F] -- control character plane 0
 746         GL [0x20..0x7F] -- graphic character plane 0
 747         C1 [0x80..0x9F] -- control character plane 1
 748         GR [0xA0..0xFF] -- graphic character plane 1
 749
 750    A control character set is directly designated and invoked to C0 or
 751    C1 by an escape sequence.  The most common case is that:
 752    - ISO646's  control character set is designated/invoked to C0, and
 753    - ISO6429's control character set is designated/invoked to C1,
 754    and usually these designations/invocations are omitted in encoded
 755    text.  In a 7-bit environment, only C0 can be used, and a control
 756    character for C1 is encoded by an appropriate escape sequence to
 757    fit into the environment.  All control characters for C1 are
 758    defined to have corresponding escape sequences.
 759
 760    A graphic character set is at first designated to one of four
 761    graphic registers (G0 through G3), then these graphic registers are
 762    invoked to GL or GR.  These designations and invocations can be
 763    done independently.  The most common case is that G0 is invoked to
 764    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
 765    these invocations and designations are omitted in encoded text.
 766    In a 7-bit environment, only GL can be used.
 767
 768    When a graphic character set of CHARS94 is invoked to GL, codes
 769    0x20 and 0x7F of the GL area work as control characters SPACE and
 770    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
 771    be used.
 772
 773    There are two ways of invocation: locking-shift and single-shift.
 774    With locking-shift, the invocation lasts until the next different
 775    invocation, whereas with single-shift, the invocation affects the
 776    following character only and doesn't affect the locking-shift
 777    state.  Invocations are done by the following control characters or
 778    escape sequences:
 779
 780    ----------------------------------------------------------------------
 781    abbrev  function                  cntrl escape seq   description
 782    ----------------------------------------------------------------------
 783    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
 784    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
 785    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
 786    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
 787    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
 788    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
 789    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
 790    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
 791    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
 792    ----------------------------------------------------------------------
 793    (*) These are not used by any known coding system.
 794
 795    Control characters for these functions are defined by macros
 796    ISO_CODE_XXX in `coding.h'.
 797
 798    Designations are done by the following escape sequences:
 799    ----------------------------------------------------------------------
 800    escape sequence      description
 801    ----------------------------------------------------------------------
 802    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
 803    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
 804    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
 805    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
 806    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
 807    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
 808    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
 809    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
 810    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
 811    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
 812    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
 813    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
 814    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
 815    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
 816    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
 817    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
 818    ----------------------------------------------------------------------
 819
 820    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
 821    of dimension 1, chars 94, and final character <F>, etc...
 822
 823    Note (*): Although these designations are not allowed in ISO2022,
 824    Emacs accepts them on decoding, and produces them on encoding
 825    CHARS96 character sets in a coding system which is characterized as
 826    7-bit environment, non-locking-shift, and non-single-shift.
 827
 828    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
 829    '(' can be omitted.  We refer to this as "short-form" hereafter.
 830
 831    Now you may notice that there are a lot of ways for encoding the
 832    same multilingual text in ISO2022.  Actually, there exist many
 833    coding systems such as Compound Text (used in X11's inter-client
 834    communication), ISO-2022-JP (used in Japanese internet), ISO-2022-KR
 835    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
 836    localized platforms), and all of these are variants of ISO2022.
 837
 838    In addition to the above, Emacs handles two more kinds of escape
 839    sequences: ISO6429's direction specification and Emacs' private
 840    sequence for specifying character composition.
 841
 842    ISO6429's direction specification takes the following form:
 843         o CSI ']'      -- end of the current direction
 844         o CSI '0' ']'  -- end of the current direction
 845         o CSI '1' ']'  -- start of left-to-right text
 846         o CSI '2' ']'  -- start of right-to-left text
 847    The control character CSI (0x9B: control sequence introducer) is
 848    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
 849
 850    Character composition specification takes the following form:
 851         o ESC '0' -- start relative composition
 852         o ESC '1' -- end composition
 853         o ESC '2' -- start rule-base composition (*)
 854         o ESC '3' -- start relative composition with alternate chars  (**)
 855         o ESC '4' -- start rule-base composition with alternate chars  (**)
 856   Since these are not standard escape sequences of any ISO standard,
 857   the use of them with these meanings is restricted to Emacs only.
 858
 859   (*) This form is used only in Emacs 20.5 and the older versions,
 860   but the newer versions can safely decode it.
 861   (**) This form is used only in Emacs 21.1 and the newer versions,
 862   and the older versions can't decode it.
 863
 864   Here's a list of example usages of these composition escape
 865   sequences (categorized by `enum composition_method').
 866
 867   COMPOSITION_RELATIVE:
 868         ESC 0 CHAR [ CHAR ] ESC 1
 869   COMPOSITION_WITH_RULE:
 870         ESC 2 CHAR [ RULE CHAR ] ESC 1
 871   COMPOSITION_WITH_ALTCHARS:
 872         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
 873   COMPOSITION_WITH_RULE_ALTCHARS:
 874         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
 875
/* Table giving an `enum iso_code_class_type' value for each of the
   256 possible byte values.  decode_coding_iso2022 dispatches on
   iso_code_class[c1] for the byte read at the top of its loop.  */
enum iso_code_class_type iso_code_class[256];
 877
 878 #define CHARSET_OK(idx, charset, c)                                     \
 879   (coding_system_table[idx]                                             \
 880    && (charset == CHARSET_ASCII                                         \
 881        || (safe_chars = coding_safe_chars (coding_system_table[idx]),   \
 882            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
 883    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
 884                                               charset)                  \
 885        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
 886
 887 #define SHIFT_OUT_OK(idx) \
 888   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
 889
 890 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 891    Check if a text is encoded in ISO2022.  If it is, returns an
 892    integer in which appropriate flag bits any of:
 893         CODING_CATEGORY_MASK_ISO_7
 894         CODING_CATEGORY_MASK_ISO_7_TIGHT
 895         CODING_CATEGORY_MASK_ISO_8_1
 896         CODING_CATEGORY_MASK_ISO_8_2
 897         CODING_CATEGORY_MASK_ISO_7_ELSE
 898         CODING_CATEGORY_MASK_ISO_8_ELSE
 899    are set.  If a code which should never appear in ISO2022 is found,
 900    returns 0.  */
 901
 902 static int
 903 detect_coding_iso2022 (src, src_end, multibytep)
 904      unsigned char *src, *src_end;
 905      int multibytep;
 906 {
 907   int mask = CODING_CATEGORY_MASK_ISO;
 908   int mask_found = 0;
 909   int reg[4], shift_out = 0, single_shifting = 0;
 910   int c, c1, i, charset;
 911   /* Dummy for ONE_MORE_BYTE.  */
 912   struct coding_system dummy_coding;
 913   struct coding_system *coding = &dummy_coding;
 914   Lisp_Object safe_chars;
 915
 916   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
 917   while (mask && src < src_end)
 918     {
 919       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 920       switch (c)
 921         {
 922         case ISO_CODE_ESC:
 923           if (inhibit_iso_escape_detection)
 924             break;
 925           single_shifting = 0;
 926           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 927           if (c >= '(' && c <= '/')
 928             {
 929               /* Designation sequence for a charset of dimension 1.  */
 930               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
 931               if (c1 < ' ' || c1 >= 0x80
 932                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
 933                 /* Invalid designation sequence.  Just ignore.  */
 934                 break;
 935               reg[(c - '(') % 4] = charset;
 936             }
 937           else if (c == '$')
 938             {
 939               /* Designation sequence for a charset of dimension 2.  */
 940               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 941               if (c >= '@' && c <= 'B')
 942                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
 943                 reg[0] = charset = iso_charset_table[1][0][c];
 944               else if (c >= '(' && c <= '/')
 945                 {
 946                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
 947                   if (c1 < ' ' || c1 >= 0x80
 948                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
 949                     /* Invalid designation sequence.  Just ignore.  */
 950                     break;
 951                   reg[(c - '(') % 4] = charset;
 952                 }
 953               else
 954                 /* Invalid designation sequence.  Just ignore.  */
 955                 break;
 956             }
 957           else if (c == 'N' || c == 'O')
 958             {
 959               /* ESC <Fe> for SS2 or SS3.  */
 960               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
 961               break;
 962             }
 963           else if (c >= '0' && c <= '4')
 964             {
 965               /* ESC <Fp> for start/end composition.  */
 966               mask_found |= CODING_CATEGORY_MASK_ISO;
 967               break;
 968             }
 969           else
 970             /* Invalid escape sequence.  Just ignore.  */
 971             break;
 972
 973           /* We found a valid designation sequence for CHARSET.  */
 974           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
 975           c = MAKE_CHAR (charset, 0, 0);
 976           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
 977             mask_found |= CODING_CATEGORY_MASK_ISO_7;
 978           else
 979             mask &= ~CODING_CATEGORY_MASK_ISO_7;
 980           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
 981             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
 982           else
 983             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
 984           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
 985             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
 986           else
 987             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
 988           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
 989             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
 990           else
 991             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
 992           break;
 993
 994         case ISO_CODE_SO:
 995           if (inhibit_iso_escape_detection)
 996             break;
 997           single_shifting = 0;
 998           if (shift_out == 0
 999               && (reg[1] >= 0
1000                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1001                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1002             {
1003               /* Locking shift out.  */
1004               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1005               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1006             }
1007           break;
1008
1009         case ISO_CODE_SI:
1010           if (inhibit_iso_escape_detection)
1011             break;
1012           single_shifting = 0;
1013           if (shift_out == 1)
1014             {
1015               /* Locking shift in.  */
1016               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1017               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1018             }
1019           break;
1020
1021         case ISO_CODE_CSI:
1022           single_shifting = 0;
1023         case ISO_CODE_SS2:
1024         case ISO_CODE_SS3:
1025           {
1026             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1027
1028             if (inhibit_iso_escape_detection)
1029               break;
1030             if (c != ISO_CODE_CSI)
1031               {
1032                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1033                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1034                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1035                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1036                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1037                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1038                 single_shifting = 1;
1039               }
1040             if (VECTORP (Vlatin_extra_code_table)
1041                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1042               {
1043                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1044                     & CODING_FLAG_ISO_LATIN_EXTRA)
1045                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1046                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1047                     & CODING_FLAG_ISO_LATIN_EXTRA)
1048                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1049               }
1050             mask &= newmask;
1051             mask_found |= newmask;
1052           }
1053           break;
1054
1055         default:
1056           if (c < 0x80)
1057             {
1058               single_shifting = 0;
1059               break;
1060             }
1061           else if (c < 0xA0)
1062             {
1063               single_shifting = 0;
1064               if (VECTORP (Vlatin_extra_code_table)
1065                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1066                 {
1067                   int newmask = 0;
1068
1069                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1070                       & CODING_FLAG_ISO_LATIN_EXTRA)
1071                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1072                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1073                       & CODING_FLAG_ISO_LATIN_EXTRA)
1074                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1075                   mask &= newmask;
1076                   mask_found |= newmask;
1077                 }
1078               else
1079                 return 0;
1080             }
1081           else
1082             {
1083               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1084                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1085               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1086               /* Check the length of succeeding codes of the range
1087                  0xA0..0FF.  If the byte length is odd, we exclude
1088                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1089                  when we are not single shifting.  */
1090               if (!single_shifting
1091                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1092                 {
1093                   int i = 1;
1094                   while (src < src_end)
1095                     {
1096                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1097                       if (c < 0xA0)
1098                         break;
1099                       i++;
1100                     }
1101
1102                   if (i & 1 && src < src_end)
1103                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1104                   else
1105                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1106                 }
1107             }
1108           break;
1109         }
1110     }
1111  label_end_of_loop:
1112   return (mask & mask_found);
1113 }
1114
/* Yield the character code for the character of charset CHARSET
   whose 1st position code is C1 and whose 2nd position code is C2.
   When the variable `translation_table' of the expansion site is
   non-nil, the code comes from translate_char with that table;
   otherwise it comes from MAKE_CHAR.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                     \
  (!NILP (translation_table)                                      \
   ? translate_char (translation_table, -1, charset, c1, c2)      \
   : MAKE_CHAR (charset, c1, c2))
1124
/* Set designation state into CODING: record that the charset selected
   by DIMENSION, CHARS and FINAL_CHAR is designated to graphic
   register REG.

   The macro relies on names from the expansion site: the variables
   `coding' and `safe_chars', and the label `label_invalid_code'.  It
   jumps to that label when
     - FINAL_CHAR is below '0' or not below 128;
     - ISO_CHARSET_TABLE yields a negative charset, or the charset is
       neither requested for REG by CODING nor accepted by
       CODING_SAFE_CHAR_P (REG is then stored in
       last_invalid_designation_register);
     - ASCII is designated to register 0 while
       last_invalid_designation_register is 0 (see the comment in the
       body).
   REG and FINAL_CHAR are evaluated more than once, so they are
   parenthesized and should be free of side effects.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)                \
  do {                                                                       \
    int charset, c;                                                          \
                                                                             \
    if ((final_char) < '0' || (final_char) >= 128)                           \
      goto label_invalid_code;                                               \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                    \
                                 make_number (chars),                        \
                                 make_number (final_char));                  \
    c = MAKE_CHAR (charset, 0, 0);                                           \
    if (charset >= 0                                                         \
        && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == (reg) \
            || CODING_SAFE_CHAR_P (safe_chars, c)))                          \
      {                                                                      \
        if (coding->spec.iso2022.last_invalid_designation_register == 0      \
            && (reg) == 0                                                    \
            && charset == CHARSET_ASCII)                                     \
          {                                                                  \
            /* We should insert this designation sequence as is so           \
               that it is surely written back to a file.  */                 \
            coding->spec.iso2022.last_invalid_designation_register = -1;     \
            goto label_invalid_code;                                         \
          }                                                                  \
        coding->spec.iso2022.last_invalid_designation_register = -1;         \
        /* With CODING_MODE_DIRECTION set, designate the reverse             \
           charset instead when one exists.  */                              \
        if ((coding->mode & CODING_MODE_DIRECTION)                           \
            && CHARSET_REVERSE_CHARSET (charset) >= 0)                       \
          charset = CHARSET_REVERSE_CHARSET (charset);                       \
        CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;               \
      }                                                                      \
    else                                                                     \
      {                                                                      \
        coding->spec.iso2022.last_invalid_designation_register = (reg);      \
        goto label_invalid_code;                                             \
      }                                                                      \
  } while (0)
1161
1162 /* Allocate a memory block for storing information about compositions.
1163    The block is chained to the already allocated blocks.  */
1164
1165 void
1166 coding_allocate_composition_data (coding, char_offset)
1167      struct coding_system *coding;
1168      int char_offset;
1169 {
1170   struct composition_data *cmp_data
1171     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1172
1173   cmp_data->char_offset = char_offset;
1174   cmp_data->used = 0;
1175   cmp_data->prev = coding->cmp_data;
1176   cmp_data->next = NULL;
1177   if (coding->cmp_data)
1178     coding->cmp_data->next = cmp_data;
1179   coding->cmp_data = cmp_data;
1180   coding->cmp_data_start = 0;
1181 }
1182
/* Record the starting position START and METHOD of one composition.

   Four ints are reserved at the current end of CODING's composition
   data, and their index is remembered in CODING's cmp_data_start:
     data[0] = -1 (CODING_ADD_COMPOSITION_END later stores a length
               there),
     data[1] = the block's char_offset + START,
     data[2] is left untouched here (filled in by
               CODING_ADD_COMPOSITION_END),
     data[3] = METHOD converted to int.
   CODING is evaluated more than once.  */

#define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  do {                                                          \
    struct composition_data *cmp_data = (coding)->cmp_data;     \
    int *data = cmp_data->data + cmp_data->used;                \
    (coding)->cmp_data_start = cmp_data->used;                  \
    data[0] = -1;                                               \
    data[1] = cmp_data->char_offset + (start);                  \
    data[3] = (int) (method);                                   \
    cmp_data->used += 4;                                        \
  } while (0)
1195
/* Record the ending position END of the current composition.

   The entry that starts at CODING's cmp_data_start is completed:
     data[0] = number of ints used since cmp_data_start,
     data[2] = the block's char_offset + END.
   CODING is evaluated more than once.  */

#define CODING_ADD_COMPOSITION_END(coding, end)                 \
  do {                                                          \
    struct composition_data *cmp_data = (coding)->cmp_data;     \
    int *data = cmp_data->data + (coding)->cmp_data_start;      \
    data[0] = cmp_data->used - (coding)->cmp_data_start;        \
    data[2] = cmp_data->char_offset + (end);                    \
  } while (0)
1205
/* Record one COMPONENT (alternate character or composition rule):
   store it in the next free slot of CODING's current composition
   data and advance the `used' count by one.  The value of the macro
   is the stored component.  CODING is evaluated twice.  */

#define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  ((coding)->cmp_data->data[(coding)->cmp_data->used++] = (component))
1210
1211 /* Handle compositoin start sequence ESC 0, ESC 2, ESC 3, or ESC 4.  */
1212
1213 #define DECODE_COMPOSITION_START(c1)                                       \
1214   do {                                                                     \
1215     if (coding->composing == COMPOSITION_DISABLED)                         \
1216       {                                                                    \
1217         *dst++ = ISO_CODE_ESC;                                             \
1218         *dst++ = c1 & 0x7f;                                                \
1219         coding->produced_char += 2;                                        \
1220       }                                                                    \
1221     else if (!COMPOSING_P (coding))                                        \
1222       {                                                                    \
1223         /* This is surely the start of a composition.  We must be sure     \
1224            that coding->cmp_data has enough space to store the             \
1225            information about the composition.  If not, terminate the       \
1226            current decoding loop, allocate one more memory block for       \
1227            coding->cmp_data in the calller, then start the decoding        \
1228            loop again.  We can't allocate memory here directly because     \
1229            it may cause buffer/string relocation.  */                      \
1230         if (!coding->cmp_data                                              \
1231             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1232                 >= COMPOSITION_DATA_SIZE))                                 \
1233           {                                                                \
1234             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1235             goto label_end_of_loop;                                        \
1236           }                                                                \
1237         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1238                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1239                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1240                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1241         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1242                                       coding->composing);                  \
1243         coding->composition_rule_follows = 0;                              \
1244       }                                                                    \
1245     else                                                                   \
1246       {                                                                    \
1247         /* We are already handling a composition.  If the method is        \
1248            the following two, the codes following the current escape       \
1249            sequence are actual characters stored in a buffer.  */          \
1250         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1251             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1252           {                                                                \
1253             coding->composing = COMPOSITION_RELATIVE;                      \
1254             coding->composition_rule_follows = 0;                          \
1255           }                                                                \
1256       }                                                                    \
1257   } while (0)
1258
/* Handle composition end sequence ESC 1.  C1 is the byte that
   followed ESC.  Unless composition is disabled, the end position of
   the current composition is recorded and `coding' leaves the
   composing state; when it is disabled, ESC and C1 are copied to
   `dst' as they are and counted as two produced characters.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (coding->composing != COMPOSITION_DISABLED)                      \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = c1;                                                    \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1275
/* Decode a composition rule from the byte C1 (and maybe one more byte
   read from SRC into the variable `c2' of the expansion site) and
   store one encoded composition rule in coding->cmp_data.

   C1 must be a modifiable variable: 32 is subtracted from it in
   place.  A resulting value of 93 or more stores the rule 0.  In all
   cases coding->composition_rule_follows is cleared.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int encoded_rule = 0;                                               \
                                                                        \
    (c1) -= 32;                                                         \
    if ((c1) >= 81)                                                     \
      {                                                                 \
        /* New format (after ver.21): a second byte is read.  */        \
        if ((c1) < 93)                                                  \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            encoded_rule                                                \
              = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);           \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Old format (before ver.21): one byte holds both values,      \
           and the value 4 stands for 10.  */                           \
        int global_ref = (c1) / 9;                                      \
        int new_ref = (c1) % 9;                                         \
                                                                        \
        if (global_ref == 4)                                            \
          global_ref = 10;                                              \
        if (new_ref == 4)                                               \
          new_ref = 10;                                                 \
        encoded_rule = COMPOSITION_ENCODE_RULE (global_ref, new_ref);   \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, encoded_rule);            \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1300
1301
1302 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1303
1304 static void
1305 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1306      struct coding_system *coding;
1307      unsigned char *source, *destination;
1308      int src_bytes, dst_bytes;
1309 {
1310   unsigned char *src = source;
1311   unsigned char *src_end = source + src_bytes;
1312   unsigned char *dst = destination;
1313   unsigned char *dst_end = destination + dst_bytes;
1314   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1315   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1316   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1317   /* SRC_BASE remembers the start position in source in each loop.
1318      The loop will be exited when there's not enough source code
1319      (within macro ONE_MORE_BYTE), or when there's not enough
1320      destination area to produce a character (within macro
1321      EMIT_CHAR).  */
1322   unsigned char *src_base;
1323   int c, charset;
1324   Lisp_Object translation_table;
1325   Lisp_Object safe_chars;
1326
1327   safe_chars = coding_safe_chars (coding);
1328
1329   if (NILP (Venable_character_translation))
1330     translation_table = Qnil;
1331   else
1332     {
1333       translation_table = coding->translation_table_for_decode;
1334       if (NILP (translation_table))
1335         translation_table = Vstandard_translation_table_for_decode;
1336     }
1337
1338   coding->result = CODING_FINISH_NORMAL;
1339
1340   while (1)
1341     {
1342       int c1, c2;
1343
1344       src_base = src;
1345       ONE_MORE_BYTE (c1);
1346
1347       /* We produce no character or one character.  */
1348       switch (iso_code_class [c1])
1349         {
1350         case ISO_0x20_or_0x7F:
1351           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1352             {
1353               DECODE_COMPOSITION_RULE (c1);
1354               continue;
1355             }
1356           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1357             {
1358               /* This is SPACE or DEL.  */
1359               charset = CHARSET_ASCII;
1360               break;
1361             }
1362           /* This is a graphic character, we fall down ...  */
1363
1364         case ISO_graphic_plane_0:
1365           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1366             {
1367               DECODE_COMPOSITION_RULE (c1);
1368               continue;
1369             }
1370           charset = charset0;
1371           break;
1372
1373         case ISO_0xA0_or_0xFF:
1374           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1375               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1376             goto label_invalid_code;
1377           /* This is a graphic character, we fall down ... */
1378
1379         case ISO_graphic_plane_1:
1380           if (charset1 < 0)
1381             goto label_invalid_code;
1382           charset = charset1;
1383           break;
1384
1385         case ISO_control_0:
1386           if (COMPOSING_P (coding))
1387             DECODE_COMPOSITION_END ('1');
1388
1389           /* All ISO2022 control characters in this class have the
1390              same representation in Emacs internal format.  */
1391           if (c1 == '\n'
1392               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1393               && (coding->eol_type == CODING_EOL_CR
1394                   || coding->eol_type == CODING_EOL_CRLF))
1395             {
1396               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1397               goto label_end_of_loop;
1398             }
1399           charset = CHARSET_ASCII;
1400           break;
1401
1402         case ISO_control_1:
1403           if (COMPOSING_P (coding))
1404             DECODE_COMPOSITION_END ('1');
1405           goto label_invalid_code;
1406
1407         case ISO_carriage_return:
1408           if (COMPOSING_P (coding))
1409             DECODE_COMPOSITION_END ('1');
1410
1411           if (coding->eol_type == CODING_EOL_CR)
1412             c1 = '\n';
1413           else if (coding->eol_type == CODING_EOL_CRLF)
1414             {
1415               ONE_MORE_BYTE (c1);
1416               if (c1 != ISO_CODE_LF)
1417                 {
1418                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1419                     {
1420                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
1421                       goto label_end_of_loop;
1422                     }
1423                   src--;
1424                   c1 = '\r';
1425                 }
1426             }
1427           charset = CHARSET_ASCII;
1428           break;
1429
1430         case ISO_shift_out:
1431           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1432               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1433             goto label_invalid_code;
1434           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1435           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1436           continue;
1437
1438         case ISO_shift_in:
1439           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1440             goto label_invalid_code;
1441           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1442           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1443           continue;
1444
1445         case ISO_single_shift_2_7:
1446         case ISO_single_shift_2:
1447           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1448             goto label_invalid_code;
1449           /* SS2 is handled as an escape sequence of ESC 'N' */
1450           c1 = 'N';
1451           goto label_escape_sequence;
1452
1453         case ISO_single_shift_3:
1454           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1455             goto label_invalid_code;
1456           /* SS3 is handled as an escape sequence of ESC 'O' */
1457           c1 = 'O';
1458           goto label_escape_sequence;
1459
1460         case ISO_control_sequence_introducer:
1461           /* CSI is handled as an escape sequence of ESC '[' ...  */
1462           c1 = '[';
1463           goto label_escape_sequence;
1464
1465         case ISO_escape:
1466           ONE_MORE_BYTE (c1);
1467         label_escape_sequence:
1468           /* Escape sequences handled by Emacs are invocation,
1469              designation, direction specification, and character
1470              composition specification.  */
1471           switch (c1)
1472             {
1473             case '&':           /* revision of following character set */
1474               ONE_MORE_BYTE (c1);
1475               if (!(c1 >= '@' && c1 <= '~'))
1476                 goto label_invalid_code;
1477               ONE_MORE_BYTE (c1);
1478               if (c1 != ISO_CODE_ESC)
1479                 goto label_invalid_code;
1480               ONE_MORE_BYTE (c1);
1481               goto label_escape_sequence;
1482
1483             case '$':           /* designation of 2-byte character set */
1484               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1485                 goto label_invalid_code;
1486               ONE_MORE_BYTE (c1);
1487               if (c1 >= '@' && c1 <= 'B')
1488                 {       /* designation of JISX0208.1978, GB2312.1980,
1489                            or JISX0208.1980 */
1490                   DECODE_DESIGNATION (0, 2, 94, c1);
1491                 }
1492               else if (c1 >= 0x28 && c1 <= 0x2B)
1493                 {       /* designation of DIMENSION2_CHARS94 character set */
1494                   ONE_MORE_BYTE (c2);
1495                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1496                 }
1497               else if (c1 >= 0x2C && c1 <= 0x2F)
1498                 {       /* designation of DIMENSION2_CHARS96 character set */
1499                   ONE_MORE_BYTE (c2);
1500                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1501                 }
1502               else
1503                 goto label_invalid_code;
1504               /* We must update these variables now.  */
1505               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1506               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1507               continue;
1508
1509             case 'n':           /* invocation of locking-shift-2 */
1510               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1511                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1512                 goto label_invalid_code;
1513               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1514               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1515               continue;
1516
1517             case 'o':           /* invocation of locking-shift-3 */
1518               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1519                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1520                 goto label_invalid_code;
1521               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
1522               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1523               continue;
1524
1525             case 'N':           /* invocation of single-shift-2 */
1526               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1527                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1528                 goto label_invalid_code;
1529               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
1530               ONE_MORE_BYTE (c1);
1531               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1532                 goto label_invalid_code;
1533               break;
1534
1535             case 'O':           /* invocation of single-shift-3 */
1536               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1537                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1538                 goto label_invalid_code;
1539               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
1540               ONE_MORE_BYTE (c1);
1541               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
1542                 goto label_invalid_code;
1543               break;
1544
1545             case '0': case '2': case '3': case '4': /* start composition */
1546               DECODE_COMPOSITION_START (c1);
1547               continue;
1548
1549             case '1':           /* end composition */
1550               DECODE_COMPOSITION_END (c1);
1551               continue;
1552
1553             case '[':           /* specification of direction */
1554               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
1555                 goto label_invalid_code;
1556               /* For the moment, nested direction is not supported.
1557                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
1558                  left-to-right, and nonzero means right-to-left.  */
1559               ONE_MORE_BYTE (c1);
1560               switch (c1)
1561                 {
1562                 case ']':       /* end of the current direction */
1563                   coding->mode &= ~CODING_MODE_DIRECTION;
1564
1565                 case '0':       /* end of the current direction */
1566                 case '1':       /* start of left-to-right direction */
1567                   ONE_MORE_BYTE (c1);
1568                   if (c1 == ']')
1569                     coding->mode &= ~CODING_MODE_DIRECTION;
1570                   else
1571                     goto label_invalid_code;
1572                   break;
1573
1574                 case '2':       /* start of right-to-left direction */
1575                   ONE_MORE_BYTE (c1);
1576                   if (c1 == ']')
1577                     coding->mode |= CODING_MODE_DIRECTION;
1578                   else
1579                     goto label_invalid_code;
1580                   break;
1581
1582                 default:
1583                   goto label_invalid_code;
1584                 }
1585               continue;
1586
1587             default:
1588               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1589                 goto label_invalid_code;
1590               if (c1 >= 0x28 && c1 <= 0x2B)
1591                 {       /* designation of DIMENSION1_CHARS94 character set */
1592                   ONE_MORE_BYTE (c2);
1593                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
1594                 }
1595               else if (c1 >= 0x2C && c1 <= 0x2F)
1596                 {       /* designation of DIMENSION1_CHARS96 character set */
1597                   ONE_MORE_BYTE (c2);
1598                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
1599                 }
1600               else
1601                 goto label_invalid_code;
1602               /* We must update these variables now.  */
1603               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1604               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1605               continue;
1606             }
1607         }
1608
1609       /* Now we know CHARSET and 1st position code C1 of a character.
1610          Produce a multibyte sequence for that character while getting
1611          2nd position code C2 if necessary.  */
1612       if (CHARSET_DIMENSION (charset) == 2)
1613         {
1614           ONE_MORE_BYTE (c2);
1615           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
1616             /* C2 is not in a valid range.  */
1617             goto label_invalid_code;
1618         }
1619       c = DECODE_ISO_CHARACTER (charset, c1, c2);
1620       EMIT_CHAR (c);
1621       continue;
1622
1623     label_invalid_code:
1624       coding->errors++;
1625       if (COMPOSING_P (coding))
1626         DECODE_COMPOSITION_END ('1');
1627       src = src_base;
1628       c = *src++;
1629       EMIT_CHAR (c);
1630     }
1631
1632  label_end_of_loop:
1633   coding->consumed = coding->consumed_char = src_base - source;
1634   coding->produced = dst - destination;
1635   return;
1636 }
1637
1638
1639 /* ISO2022 encoding stuff.  */
1640
1641 /*
1642    It is not enough to say just "ISO2022" on encoding, we have to
1643    specify more details.  In Emacs, each coding system of ISO2022
1644    variant has the following specifications:
1645         1. Initial designation to G0 thru G3.
1646         2. Allows short-form designation?
1647         3. ASCII should be designated to G0 before control characters?
1648         4. ASCII should be designated to G0 at end of line?
1649         5. 7-bit environment or 8-bit environment?
1650         6. Use locking-shift?
1651         7. Use Single-shift?
1652    And the following two are only for Japanese:
1653         8. Use ASCII in place of JISX0201-1976-Roman?
1654         9. Use JISX0208-1983 in place of JISX0208-1978?
1655    These specifications are encoded in `coding->flags' as flag bits
1656    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
1657    details.
1658 */
1659
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advance DST past it, and record the new designation
   in CODING.

   When the revision number recorded in CODING for CHARSET is below
   255, the designation is preceded by ESC '&' <'@' + revision>.  A
   2-byte 94-character set whose <final-char> is '@', 'A', or 'B' is
   designated to register 0 in short form (without an intermediate
   character) when CODING has CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *g94_intermediates = "()*+";                             \
    const char *g96_intermediates = ",-./";                             \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    int rev = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);        \
    int two_byte = CHARSET_DIMENSION (charset) != 1;                    \
    int is_94 = CHARSET_CHARS (charset) == 94;                          \
                                                                        \
    /* Optional announcement of the revision of CHARSET.  */            \
    if (rev < 255)                                                      \
      {                                                                 \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + rev;                                             \
      }                                                                 \
                                                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (two_byte)                                                       \
      *dst++ = '$';                                                     \
                                                                        \
    /* The intermediate character selects the register; it is left     \
       out only for the short form of a 2-byte 94-character set.  */    \
    if (! is_94)                                                        \
      *dst++ = (unsigned char) (g96_intermediates[reg]);                \
    else if (! two_byte                                                 \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_byte < '@' || final_byte > 'B')                   \
      *dst++ = (unsigned char) (g94_intermediates[reg]);                \
                                                                        \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
1702
/* ISO2022 single-shift functions.  Each of the two macros below
   writes the shift code at DST (ESC 'N' / ESC 'O' when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, the control character SS2 / SS3
   otherwise) and then marks CODING as being in single-shifting
   state.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
1724
/* ISO2022 locking-shift functions.  Each of the four macros below
   writes its shift code at DST (SI, SO, ESC 'n', or ESC 'o') and
   records in CODING which graphic register (0, 1, 2, or 3
   respectively) is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
1752
/* Write at DST the code of a DIMENSION1 character of character set
   CHARSET whose position-code is C1.  Any designation and invocation
   sequences needed to make CHARSET usable are written first.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    /* As long as no single-shift is pending and CHARSET is invoked    \
       to neither graphic plane, it must be invoked or, at first,      \
       designated to some graphic register.  */                         \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* This character consumes the pending single-shift.  */        \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? (c1 & 0x7F) : (c1 | 0x80));                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      *dst++ = c1 & 0x7F;                                               \
    else                                                                \
      *dst++ = c1 | 0x80;                                               \
  } while (0)
1785
/* Write at DST the two codes of a DIMENSION2 character of character
   set CHARSET whose position-codes are C1 and C2.  Any designation
   and invocation sequences needed to make CHARSET usable are written
   first.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    /* As long as no single-shift is pending and CHARSET is invoked    \
       to neither graphic plane, it must be invoked or, at first,      \
       designated to some graphic register.  */                         \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        /* This character consumes the pending single-shift.  */        \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
  } while (0)
1818
/* Write at DST the ISO2022 codes of character C.

   C is split into its character set and position codes.  If that
   character set is not defined, the position codes are copied to DST
   as they are (the second one only when it is not negative).
   Otherwise the character is encoded by the DIMENSION1 or DIMENSION2
   macro after these substitutions: ASCII is replaced by
   charset_latin_jisx0201 when CODING_FLAG_ISO_USE_ROMAN is set, and
   charset_jisx0208 by charset_jisx0208_1978 when
   CODING_FLAG_ISO_USE_OLDJIS is set.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) != 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
  } while (0)
1848
1849
/* Do not encode character C itself; encode the substitution character
   CODING_INHIBIT_CHARACTER_SUBSTITUTION in its place, twice when the
   width of C's character set is greater than 1 and once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int repeat = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;          \
                                                                        \
    while (repeat-- > 0)                                                \
      ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION);     \
  } while (0)
1858
1859
1860 /* Produce designation and invocation codes at a place pointed by DST
1861    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
1862    Return new DST.  */
1863
1864 unsigned char *
1865 encode_invocation_designation (charset, coding, dst)
1866      int charset;
1867      struct coding_system *coding;
1868      unsigned char *dst;
1869 {
1870   int reg;                      /* graphic register number */
1871
1872   /* At first, check designations.  */
1873   for (reg = 0; reg < 4; reg++)
1874     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
1875       break;
1876
1877   if (reg >= 4)
1878     {
1879       /* CHARSET is not yet designated to any graphic registers.  */
1880       /* At first check the requested designation.  */
1881       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
1882       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
1883         /* Since CHARSET requests no special designation, designate it
1884            to graphic register 0.  */
1885         reg = 0;
1886
1887       ENCODE_DESIGNATION (charset, reg, coding);
1888     }
1889
1890   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
1891       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
1892     {
1893       /* Since the graphic register REG is not invoked to any graphic
1894          planes, invoke it to graphic plane 0.  */
1895       switch (reg)
1896         {
1897         case 0:                 /* graphic register 0 */
1898           ENCODE_SHIFT_IN;
1899           break;
1900
1901         case 1:                 /* graphic register 1 */
1902           ENCODE_SHIFT_OUT;
1903           break;
1904
1905         case 2:                 /* graphic register 2 */
1906           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1907             ENCODE_SINGLE_SHIFT_2;
1908           else
1909             ENCODE_LOCKING_SHIFT_2;
1910           break;
1911
1912         case 3:                 /* graphic register 3 */
1913           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
1914             ENCODE_SINGLE_SHIFT_3;
1915           else
1916             ENCODE_LOCKING_SHIFT_3;
1917           break;
1918         }
1919     }
1920
1921   return dst;
1922 }
1923
/* Write at DST the 2-byte code of the encoded composition rule RULE.
   RULE is split by COMPOSITION_DECODE_RULE into two values; the first
   is written as 32 + 81 + value, the second as 32 + value.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref;                                           \
    int nref;                                           \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    *dst++ = gref + (32 + 81);                          \
    *dst++ = nref + 32;                                 \
  } while (0)
1933
/* Write at DST the code that starts a composition sequence: ESC 0 for
   a relative composition, ESC 3 for a composition with alternate
   chars, ESC 4 for any other method.  DATA points to an array of
   integers describing the composition (see the comment in coding.h
   for its format); DATA[3] is stored in CODING as the composition
   method.  For ESC 3 and ESC 4, CODING is also set up so that the
   component data is read from 4 elements past the start of the
   current composition data, with no composition rule expected
   next.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
        coding->composition_rule_follows = 0;                           \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
1953
/* Write at DST the code that ends the current composition (ESC 1),
   advance the start of the composition data in CODING by DATA[0], and
   mark CODING as not composing.  When that consumes the current
   composition data block entirely and another block follows, switch
   to the next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    *dst++ = ISO_CODE_ESC;                                              \
    *dst++ = '1';                                                       \
    coding->cmp_data_start += data[0];                                  \
    coding->composing = COMPOSITION_NO;                                 \
    if (coding->cmp_data_start == coding->cmp_data->used)               \
      {                                                                 \
        if (coding->cmp_data->next)                                     \
          {                                                             \
            coding->cmp_data = coding->cmp_data->next;                  \
            coding->cmp_data_start = 0;                                 \
          }                                                             \
      }                                                                 \
  } while (0)
1969
/* Write the composition start sequence ESC 0 at DST and put CODING
   into relative-composition state.  This does not start a new
   composition: it marks that the components (alternate chars and
   composition rules) of the composition have just been produced and
   that the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = '0';                                       \
  } while (0)
1981
/* The following three macros produce codes for indicating direction
   of text.  All of them write at DST and advance it.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER writes the introducer: ESC '['
   when CODING_FLAG_ISO_SEVEN_BITS is set in coding->flags, the
   control character CSI otherwise.  The flag is tested with `&', as
   everywhere else in this file; comparing the whole of coding->flags
   for equality would select CSI as soon as any other flag is set.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R write the introducer
   followed by "2]" and "0]" respectively.  Each macro is one
   statement (do ... while (0)), to be followed by a semicolon.  */

#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '2';                                       \
    *dst++ = ']';                                       \
  } while (0)

#define ENCODE_DIRECTION_L2R                            \
  do {                                                  \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;                 \
    *dst++ = '0';                                       \
    *dst++ = ']';                                       \
  } while (0)
1997
/* Write at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: a
   shift-in if graphic plane 0 does not currently invoke register 0,
   then a designation for every register that has an initial
   designation (a non-negative charset) different from its current
   one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
                                                                            \
    for (reg = 0; reg < 4; reg++)                                           \
      {                                                                     \
        int initial_charset                                                 \
          = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg);              \
                                                                            \
        if (initial_charset < 0                                             \
            || CODING_SPEC_ISO_DESIGNATION (coding, reg) == initial_charset) \
          continue;                                                         \
        ENCODE_DESIGNATION (initial_charset, reg, coding);                  \
      }                                                                     \
  } while (0)
2012
2013 /* Produce designation sequences of charsets in the line started from
2014    SRC to a place pointed by DST, and return updated DST.
2015
2016    If the current block ends before any end-of-line, we may fail to
2017    find all the necessary designations.  */
2018
2019 static unsigned char *
2020 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2021      struct coding_system *coding;
2022      Lisp_Object translation_table;
2023      unsigned char *src, *src_end, *dst;
2024 {
2025   int charset, c, found = 0, reg;
2026   /* Table of charsets to be designated to each graphic register.  */
2027   int r[4];
2028
2029   for (reg = 0; reg < 4; reg++)
2030     r[reg] = -1;
2031
2032   while (found < 4)
2033     {
2034       ONE_MORE_CHAR (c);
2035       if (c == '\n')
2036         break;
2037
2038       charset = CHAR_CHARSET (c);
2039       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2040       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2041         {
2042           found++;
2043           r[reg] = charset;
2044         }
2045     }
2046
2047  label_end_of_loop:
2048   if (found)
2049     {
2050       for (reg = 0; reg < 4; reg++)
2051         if (r[reg] >= 0
2052             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2053           ENCODE_DESIGNATION (r[reg], reg, coding);
2054     }
2055
2056   return dst;
2057 }
2058
2059 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2060
2061 static void
2062 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2063      struct coding_system *coding;
2064      unsigned char *source, *destination;
2065      int src_bytes, dst_bytes;
2066 {
2067   unsigned char *src = source;
2068   unsigned char *src_end = source + src_bytes;
2069   unsigned char *dst = destination;
2070   unsigned char *dst_end = destination + dst_bytes;
2071   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2072      from DST_END to assure overflow checking is necessary only at the
2073      head of loop.  */
2074   unsigned char *adjusted_dst_end = dst_end - 19;
2075   /* SRC_BASE remembers the start position in source in each loop.
2076      The loop will be exited when there's not enough source text to
2077      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2078      there's not enough destination area to produce encoded codes
2079      (within macro EMIT_BYTES).  */
2080   unsigned char *src_base;
2081   int c;
2082   Lisp_Object translation_table;
2083   Lisp_Object safe_chars;
2084
2085   safe_chars = coding_safe_chars (coding);
2086
2087   if (NILP (Venable_character_translation))
2088     translation_table = Qnil;
2089   else
2090     {
2091       translation_table = coding->translation_table_for_encode;
2092       if (NILP (translation_table))
2093         translation_table = Vstandard_translation_table_for_encode;
2094     }
2095
2096   coding->consumed_char = 0;
2097   coding->errors = 0;
2098   while (1)
2099     {
2100       src_base = src;
2101
2102       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2103         {
2104           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2105           break;
2106         }
2107
2108       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2109           && CODING_SPEC_ISO_BOL (coding))
2110         {
2111           /* We have to produce designation sequences if any now.  */
2112           dst = encode_designation_at_bol (coding, translation_table,
2113                                            src, src_end, dst);
2114           CODING_SPEC_ISO_BOL (coding) = 0;
2115         }
2116
2117       /* Check composition start and end.  */
2118       if (coding->composing != COMPOSITION_DISABLED
2119           && coding->cmp_data_start < coding->cmp_data->used)
2120         {
2121           struct composition_data *cmp_data = coding->cmp_data;
2122           int *data = cmp_data->data + coding->cmp_data_start;
2123           int this_pos = cmp_data->char_offset + coding->consumed_char;
2124
2125           if (coding->composing == COMPOSITION_RELATIVE)
2126             {
2127               if (this_pos == data[2])
2128                 {
2129                   ENCODE_COMPOSITION_END (coding, data);
2130                   cmp_data = coding->cmp_data;
2131                   data = cmp_data->data + coding->cmp_data_start;
2132                 }
2133             }
2134           else if (COMPOSING_P (coding))
2135             {
2136               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2137               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2138                 /* We have consumed components of the composition.
2139                    What follows in SRC is the compositions's base
2140                    text.  */
2141                 ENCODE_COMPOSITION_FAKE_START (coding);
2142               else
2143                 {
2144                   int c = cmp_data->data[coding->cmp_data_index++];
2145                   if (coding->composition_rule_follows)
2146                     {
2147                       ENCODE_COMPOSITION_RULE (c);
2148                       coding->composition_rule_follows = 0;
2149                     }
2150                   else
2151                     {
2152                       if (coding->flags & CODING_FLAG_ISO_SAFE
2153                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2154                         ENCODE_UNSAFE_CHARACTER (c);
2155                       else
2156                         ENCODE_ISO_CHARACTER (c);
2157                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2158                         coding->composition_rule_follows = 1;
2159                     }
2160                   continue;
2161                 }
2162             }
2163           if (!COMPOSING_P (coding))
2164             {
2165               if (this_pos == data[1])
2166                 {
2167                   ENCODE_COMPOSITION_START (coding, data);
2168                   continue;
2169                 }
2170             }
2171         }
2172
2173       ONE_MORE_CHAR (c);
2174
2175       /* Now encode the character C.  */
2176       if (c < 0x20 || c == 0x7F)
2177         {
2178           if (c == '\r')
2179             {
2180               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2181                 {
2182                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2183                     ENCODE_RESET_PLANE_AND_REGISTER;
2184                   *dst++ = c;
2185                   continue;
2186                 }
2187               /* fall down to treat '\r' as '\n' ...  */
2188               c = '\n';
2189             }
2190           if (c == '\n')
2191             {
2192               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2193                 ENCODE_RESET_PLANE_AND_REGISTER;
2194               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2195                 bcopy (coding->spec.iso2022.initial_designation,
2196                        coding->spec.iso2022.current_designation,
2197                        sizeof coding->spec.iso2022.initial_designation);
2198               if (coding->eol_type == CODING_EOL_LF
2199                   || coding->eol_type == CODING_EOL_UNDECIDED)
2200                 *dst++ = ISO_CODE_LF;
2201               else if (coding->eol_type == CODING_EOL_CRLF)
2202                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2203               else
2204                 *dst++ = ISO_CODE_CR;
2205               CODING_SPEC_ISO_BOL (coding) = 1;
2206             }
2207           else
2208             {
2209               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2210                 ENCODE_RESET_PLANE_AND_REGISTER;
2211               *dst++ = c;
2212             }
2213         }
2214       else if (ASCII_BYTE_P (c))
2215         ENCODE_ISO_CHARACTER (c);
2216       else if (SINGLE_BYTE_CHAR_P (c))
2217         {
2218           *dst++ = c;
2219           coding->errors++;
2220         }
2221       else if (coding->flags & CODING_FLAG_ISO_SAFE
2222                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2223         ENCODE_UNSAFE_CHARACTER (c);
2224       else
2225         ENCODE_ISO_CHARACTER (c);
2226
2227       coding->consumed_char++;
2228     }
2229
2230  label_end_of_loop:
2231   coding->consumed = src_base - source;
2232   coding->produced = coding->produced_char = dst - destination;
2233 }
2234
2235 \f
2236 /*** 4. SJIS and BIG5 handlers ***/
2237
2238 /* Although SJIS and BIG5 are not ISO's coding system, they are used
2239    quite widely.  So, for the moment, Emacs supports them in the bare
2240    C code.  But, in the future, they may be supported only by CCL.  */
2241
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
2248
2249    --- CODE RANGE of SJIS ---
2250    (character set)      (range)
2251    ASCII                0x00 .. 0x7F
2252    KATAKANA-JISX0201    0xA0 .. 0xDF
2253    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2254             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2255    -------------------------------
2256
2257 */
2258
2259 /* BIG5 is a coding system encoding two character sets: ASCII and
2260    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2261    character set and is encoded in two-byte.
2262
2263    --- CODE RANGE of BIG5 ---
2264    (character set)      (range)
2265    ASCII                0x00 .. 0x7F
2266    Big5 (1st byte)      0xA1 .. 0xFE
2267         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2268    --------------------------
2269
2270    Since the number of characters in Big5 is larger than maximum
2271    characters in Emacs' charset (96x96), it can't be handled as one
2272    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2273    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2274    contains frequently used characters and the latter contains less
2275    frequently used characters.  */
2276
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Both macros expand to a single statement.  Input arguments are
   evaluated more than once, so they must be free of side effects.
   The output arguments must be lvalues; they may be the same
   variables as the inputs, because every output is stored only after
   the inputs it depends on have been read.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Set CHARSET, C1 and C2 from the BIG5 bytes B1 and B2.  A first byte
   below 0xC9 selects charset_big5_1, any other charset_big5_2.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Set the BIG5 bytes B1 and B2 from CHARSET, C1 and C2; the inverse of
   DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2309
2310 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2311    Check if a text is encoded in SJIS.  If it is, return
2312    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2313
2314 static int
2315 detect_coding_sjis (src, src_end, multibytep)
2316      unsigned char *src, *src_end;
2317      int multibytep;
2318 {
2319   int c;
2320   /* Dummy for ONE_MORE_BYTE.  */
2321   struct coding_system dummy_coding;
2322   struct coding_system *coding = &dummy_coding;
2323
2324   while (1)
2325     {
2326       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2327       if (c >= 0x81)
2328         {
2329           if (c <= 0x9F || (c >= 0xE0 && c <= 0xEF))
2330             {
2331               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2332               if (c < 0x40 || c == 0x7F || c > 0xFC)
2333                 return 0;
2334             }
2335           else if (c > 0xDF)
2336             return 0;
2337         }
2338     }
2339  label_end_of_loop:
2340   return CODING_CATEGORY_MASK_SJIS;
2341 }
2342
2343 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2344    Check if a text is encoded in BIG5.  If it is, return
2345    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2346
2347 static int
2348 detect_coding_big5 (src, src_end, multibytep)
2349      unsigned char *src, *src_end;
2350      int multibytep;
2351 {
2352   int c;
2353   /* Dummy for ONE_MORE_BYTE.  */
2354   struct coding_system dummy_coding;
2355   struct coding_system *coding = &dummy_coding;
2356
2357   while (1)
2358     {
2359       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2360       if (c >= 0xA1)
2361         {
2362           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2363           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
2364             return 0;
2365         }
2366     }
2367  label_end_of_loop:
2368   return CODING_CATEGORY_MASK_BIG5;
2369 }
2370
2371 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2372    Check if a text is encoded in UTF-8.  If it is, return
2373    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2374
2375 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2376 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2377 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2378 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2379 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2380 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2381 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2382
2383 static int
2384 detect_coding_utf_8 (src, src_end, multibytep)
2385      unsigned char *src, *src_end;
2386      int multibytep;
2387 {
2388   unsigned char c;
2389   int seq_maybe_bytes;
2390   /* Dummy for ONE_MORE_BYTE.  */
2391   struct coding_system dummy_coding;
2392   struct coding_system *coding = &dummy_coding;
2393
2394   while (1)
2395     {
2396       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2397       if (UTF_8_1_OCTET_P (c))
2398         continue;
2399       else if (UTF_8_2_OCTET_LEADING_P (c))
2400         seq_maybe_bytes = 1;
2401       else if (UTF_8_3_OCTET_LEADING_P (c))
2402         seq_maybe_bytes = 2;
2403       else if (UTF_8_4_OCTET_LEADING_P (c))
2404         seq_maybe_bytes = 3;
2405       else if (UTF_8_5_OCTET_LEADING_P (c))
2406         seq_maybe_bytes = 4;
2407       else if (UTF_8_6_OCTET_LEADING_P (c))
2408         seq_maybe_bytes = 5;
2409       else
2410         return 0;
2411
2412       do
2413         {
2414           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2415           if (!UTF_8_EXTRA_OCTET_P (c))
2416             return 0;
2417           seq_maybe_bytes--;
2418         }
2419       while (seq_maybe_bytes > 0);
2420     }
2421
2422  label_end_of_loop:
2423   return CODING_CATEGORY_MASK_UTF_8;
2424 }
2425
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking at its first two
   bytes.  If they are FE FF, return CODING_CATEGORY_MASK_UTF_16_BE
   (Big Endian); if they are FF FE, return
   CODING_CATEGORY_MASK_UTF_16_LE (Little Endian); else return 0.  */
2431
/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  The
   top six bits identify the range, hence the mask 0xFC00.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
2441
2442 static int
2443 detect_coding_utf_16 (src, src_end, multibytep)
2444      unsigned char *src, *src_end;
2445      int multibytep;
2446 {
2447   unsigned char c1, c2;
2448   /* Dummy for TWO_MORE_BYTES.  */
2449   struct coding_system dummy_coding;
2450   struct coding_system *coding = &dummy_coding;
2451
2452   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
2453   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
2454
2455   if ((c1 == 0xFF) && (c2 == 0xFE))
2456     return CODING_CATEGORY_MASK_UTF_16_LE;
2457   else if ((c1 == 0xFE) && (c2 == 0xFF))
2458     return CODING_CATEGORY_MASK_UTF_16_BE;
2459
2460  label_end_of_loop:
2461   return 0;
2462 }
2463
2464 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
2465    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
2466
2467 static void
2468 decode_coding_sjis_big5 (coding, source, destination,
2469                          src_bytes, dst_bytes, sjis_p)
2470      struct coding_system *coding;
2471      unsigned char *source, *destination;
2472      int src_bytes, dst_bytes;
2473      int sjis_p;
2474 {
2475   unsigned char *src = source;
2476   unsigned char *src_end = source + src_bytes;
2477   unsigned char *dst = destination;
2478   unsigned char *dst_end = destination + dst_bytes;
2479   /* SRC_BASE remembers the start position in source in each loop.
2480      The loop will be exited when there's not enough source code
2481      (within macro ONE_MORE_BYTE), or when there's not enough
2482      destination area to produce a character (within macro
2483      EMIT_CHAR).  */
2484   unsigned char *src_base;
2485   Lisp_Object translation_table;
2486
2487   if (NILP (Venable_character_translation))
2488     translation_table = Qnil;
2489   else
2490     {
2491       translation_table = coding->translation_table_for_decode;
2492       if (NILP (translation_table))
2493         translation_table = Vstandard_translation_table_for_decode;
2494     }
2495
2496   coding->produced_char = 0;
2497   while (1)
2498     {
2499       int c, charset, c1, c2;
2500
2501       src_base = src;
2502       ONE_MORE_BYTE (c1);
2503
2504       if (c1 < 0x80)
2505         {
2506           charset = CHARSET_ASCII;
2507           if (c1 < 0x20)
2508             {
2509               if (c1 == '\r')
2510                 {
2511                   if (coding->eol_type == CODING_EOL_CRLF)
2512                     {
2513                       ONE_MORE_BYTE (c2);
2514                       if (c2 == '\n')
2515                         c1 = c2;
2516                       else if (coding->mode
2517                                & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2518                         {
2519                           coding->result = CODING_FINISH_INCONSISTENT_EOL;
2520                           goto label_end_of_loop;
2521                         }
2522                       else
2523                         /* To process C2 again, SRC is subtracted by 1.  */
2524                         src--;
2525                     }
2526                   else if (coding->eol_type == CODING_EOL_CR)
2527                     c1 = '\n';
2528                 }
2529               else if (c1 == '\n'
2530                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2531                        && (coding->eol_type == CODING_EOL_CR
2532                            || coding->eol_type == CODING_EOL_CRLF))
2533                 {
2534                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2535                   goto label_end_of_loop;
2536                 }
2537             }
2538         }
2539       else
2540         {
2541           if (sjis_p)
2542             {
2543               if (c1 >= 0xF0)
2544                 goto label_invalid_code;
2545               if (c1 < 0xA0 || c1 >= 0xE0)
2546                 {
2547                   /* SJIS -> JISX0208 */
2548                   ONE_MORE_BYTE (c2);
2549                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
2550                     goto label_invalid_code;
2551                   DECODE_SJIS (c1, c2, c1, c2);
2552                   charset = charset_jisx0208;
2553                 }
2554               else
2555                 /* SJIS -> JISX0201-Kana */
2556                 charset = charset_katakana_jisx0201;
2557             }
2558           else
2559             {
2560               /* BIG5 -> Big5 */
2561               if (c1 < 0xA1 || c1 > 0xFE)
2562                 goto label_invalid_code;
2563               ONE_MORE_BYTE (c2);
2564               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
2565                 goto label_invalid_code;
2566               DECODE_BIG5 (c1, c2, charset, c1, c2);
2567             }
2568         }
2569
2570       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2571       EMIT_CHAR (c);
2572       continue;
2573
2574     label_invalid_code:
2575       coding->errors++;
2576       src = src_base;
2577       c = *src++;
2578       EMIT_CHAR (c);
2579     }
2580
2581  label_end_of_loop:
2582   coding->consumed = coding->consumed_char = src_base - source;
2583   coding->produced = dst - destination;
2584   return;
2585 }
2586
2587 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
2588    This function can encode charsets `ascii', `katakana-jisx0201',
2589    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
2590    are sure that all these charsets are registered as official charset
2591    (i.e. do not have extended leading-codes).  Characters of other
2592    charsets are produced without any encoding.  If SJIS_P is 1, encode
2593    SJIS text, else encode BIG5 text.  */
2594
2595 static void
2596 encode_coding_sjis_big5 (coding, source, destination,
2597                          src_bytes, dst_bytes, sjis_p)
2598      struct coding_system *coding;
2599      unsigned char *source, *destination;
2600      int src_bytes, dst_bytes;
2601      int sjis_p;
2602 {
2603   unsigned char *src = source;
2604   unsigned char *src_end = source + src_bytes;
2605   unsigned char *dst = destination;
2606   unsigned char *dst_end = destination + dst_bytes;
2607   /* SRC_BASE remembers the start position in source in each loop.
2608      The loop will be exited when there's not enough source text to
2609      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2610      there's not enough destination area to produce encoded codes
2611      (within macro EMIT_BYTES).  */
2612   unsigned char *src_base;
2613   Lisp_Object translation_table;
2614
2615   if (NILP (Venable_character_translation))
2616     translation_table = Qnil;
2617   else
2618     {
2619       translation_table = coding->translation_table_for_encode;
2620       if (NILP (translation_table))
2621         translation_table = Vstandard_translation_table_for_encode;
2622     }
2623
2624   while (1)
2625     {
2626       int c, charset, c1, c2;
2627
2628       src_base = src;
2629       ONE_MORE_CHAR (c);
2630
2631       /* Now encode the character C.  */
2632       if (SINGLE_BYTE_CHAR_P (c))
2633         {
2634           switch (c)
2635             {
2636             case '\r':
2637               if (!coding->mode & CODING_MODE_SELECTIVE_DISPLAY)
2638                 {
2639                   EMIT_ONE_BYTE (c);
2640                   break;
2641                 }
2642               c = '\n';
2643             case '\n':
2644               if (coding->eol_type == CODING_EOL_CRLF)
2645                 {
2646                   EMIT_TWO_BYTES ('\r', c);
2647                   break;
2648                 }
2649               else if (coding->eol_type == CODING_EOL_CR)
2650                 c = '\r';
2651             default:
2652               EMIT_ONE_BYTE (c);
2653             }
2654         }
2655       else
2656         {
2657           SPLIT_CHAR (c, charset, c1, c2);
2658           if (sjis_p)
2659             {
2660               if (charset == charset_jisx0208
2661                   || charset == charset_jisx0208_1978)
2662                 {
2663                   ENCODE_SJIS (c1, c2, c1, c2);
2664                   EMIT_TWO_BYTES (c1, c2);
2665                 }
2666               else if (charset == charset_katakana_jisx0201)
2667                 EMIT_ONE_BYTE (c1 | 0x80);
2668               else if (charset == charset_latin_jisx0201)
2669                 EMIT_ONE_BYTE (c1);
2670               else
2671                 /* There's no way other than producing the internal
2672                    codes as is.  */
2673                 EMIT_BYTES (src_base, src);
2674             }
2675           else
2676             {
2677               if (charset == charset_big5_1 || charset == charset_big5_2)
2678                 {
2679                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
2680                   EMIT_TWO_BYTES (c1, c2);
2681                 }
2682               else
2683                 /* There's no way other than producing the internal
2684                    codes as is.  */
2685                 EMIT_BYTES (src_base, src);
2686             }
2687         }
2688       coding->consumed_char++;
2689     }
2690
2691  label_end_of_loop:
2692   coding->consumed = src_base - source;
2693   coding->produced = coding->produced_char = dst - destination;
2694 }
2695
2696 \f
2697 /*** 5. CCL handlers ***/
2698
2699 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2700    Check if a text is encoded in a coding system of which
2701    encoder/decoder are written in CCL program.  If it is, return
2702    CODING_CATEGORY_MASK_CCL, else return 0.  */
2703
2704 static int
2705 detect_coding_ccl (src, src_end, multibytep)
2706      unsigned char *src, *src_end;
2707      int multibytep;
2708 {
2709   unsigned char *valid;
2710   int c;
2711   /* Dummy for ONE_MORE_BYTE.  */
2712   struct coding_system dummy_coding;
2713   struct coding_system *coding = &dummy_coding;
2714
2715   /* No coding system is assigned to coding-category-ccl.  */
2716   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
2717     return 0;
2718
2719   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
2720   while (1)
2721     {
2722       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2723       if (! valid[c])
2724         return 0;
2725     }
2726  label_end_of_loop:
2727   return CODING_CATEGORY_MASK_CCL;
2728 }
2729
2730 \f
2731 /*** 6. End-of-line handlers ***/
2732
2733 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2734
2735 static void
2736 decode_eol (coding, source, destination, src_bytes, dst_bytes)
2737      struct coding_system *coding;
2738      unsigned char *source, *destination;
2739      int src_bytes, dst_bytes;
2740 {
2741   unsigned char *src = source;
2742   unsigned char *dst = destination;
2743   unsigned char *src_end = src + src_bytes;
2744   unsigned char *dst_end = dst + dst_bytes;
2745   Lisp_Object translation_table;
2746   /* SRC_BASE remembers the start position in source in each loop.
2747      The loop will be exited when there's not enough source code
2748      (within macro ONE_MORE_BYTE), or when there's not enough
2749      destination area to produce a character (within macro
2750      EMIT_CHAR).  */
2751   unsigned char *src_base;
2752   int c;
2753
2754   translation_table = Qnil;
2755   switch (coding->eol_type)
2756     {
2757     case CODING_EOL_CRLF:
2758       while (1)
2759         {
2760           src_base = src;
2761           ONE_MORE_BYTE (c);
2762           if (c == '\r')
2763             {
2764               ONE_MORE_BYTE (c);
2765               if (c != '\n')
2766                 {
2767                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2768                     {
2769                       coding->result = CODING_FINISH_INCONSISTENT_EOL;
2770                       goto label_end_of_loop;
2771                     }
2772                   src--;
2773                   c = '\r';
2774                 }
2775             }
2776           else if (c == '\n'
2777                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
2778             {
2779               coding->result = CODING_FINISH_INCONSISTENT_EOL;
2780               goto label_end_of_loop;
2781             }
2782           EMIT_CHAR (c);
2783         }
2784       break;
2785
2786     case CODING_EOL_CR:
2787       while (1)
2788         {
2789           src_base = src;
2790           ONE_MORE_BYTE (c);
2791           if (c == '\n')
2792             {
2793               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
2794                 {
2795                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
2796                   goto label_end_of_loop;
2797                 }
2798             }
2799           else if (c == '\r')
2800             c = '\n';
2801           EMIT_CHAR (c);
2802         }
2803       break;
2804
2805     default:                    /* no need for EOL handling */
2806       while (1)
2807         {
2808           src_base = src;
2809           ONE_MORE_BYTE (c);
2810           EMIT_CHAR (c);
2811         }
2812     }
2813
2814  label_end_of_loop:
2815   coding->consumed = coding->consumed_char = src_base - source;
2816   coding->produced = dst - destination;
2817   return;
2818 }
2819
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
   format of end-of-line according to `coding->eol_type'.  It also
   converts multibyte form 8-bit characters to unibyte if
   CODING->src_multibyte is nonzero.  If `coding->mode &
   CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
   also means end-of-line.  */
2826
2827 static void
2828 encode_eol (coding, source, destination, src_bytes, dst_bytes)
2829      struct coding_system *coding;
2830      unsigned char *source, *destination;
2831      int src_bytes, dst_bytes;
2832 {
2833   unsigned char *src = source;
2834   unsigned char *dst = destination;
2835   unsigned char *src_end = src + src_bytes;
2836   unsigned char *dst_end = dst + dst_bytes;
2837   Lisp_Object translation_table;
2838   /* SRC_BASE remembers the start position in source in each loop.
2839      The loop will be exited when there's not enough source text to
2840      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2841      there's not enough destination area to produce encoded codes
2842      (within macro EMIT_BYTES).  */
2843   unsigned char *src_base;
2844   int c;
2845   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
2846
2847   translation_table = Qnil;
2848   if (coding->src_multibyte
2849       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
2850     {
2851       src_end--;
2852       src_bytes--;
2853       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
2854     }
2855
2856   if (coding->eol_type == CODING_EOL_CRLF)
2857     {
2858       while (src < src_end)
2859         {
2860           src_base = src;
2861           c = *src++;
2862           if (c >= 0x20)
2863             EMIT_ONE_BYTE (c);
2864           else if (c == '\n' || (c == '\r' && selective_display))
2865             EMIT_TWO_BYTES ('\r', '\n');
2866           else
2867             EMIT_ONE_BYTE (c);
2868         }
2869       src_base = src;
2870     label_end_of_loop:
2871       ;
2872     }
2873   else
2874     {
2875       if (!dst_bytes || src_bytes <= dst_bytes)
2876         {
2877           safe_bcopy (src, dst, src_bytes);
2878           src_base = src_end;
2879           dst += src_bytes;
2880         }
2881       else
2882         {
2883           if (coding->src_multibyte
2884               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
2885             dst_bytes--;
2886           safe_bcopy (src, dst, dst_bytes);
2887           src_base = src + dst_bytes;
2888           dst = destination + dst_bytes;
2889           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2890         }
2891       if (coding->eol_type == CODING_EOL_CR)
2892         {
2893           for (src = destination; src < dst; src++)
2894             if (*src == '\n') *src = '\r';
2895         }
2896       else if (selective_display)
2897         {
2898           for (src = destination; src < dst; src++)
2899             if (*src == '\r') *src = '\n';
2900         }
2901     }
2902   if (coding->src_multibyte)
2903     dst = destination + str_as_unibyte (destination, dst - destination);
2904
2905   coding->consumed = src_base - source;
2906   coding->produced = dst - destination;
2907   coding->produced_char = coding->produced;
2908 }
2909
2910 \f
2911 /*** 7. C library functions ***/
2912
2913 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
2914    has a property `coding-system'.  The value of this property is a
2915    vector of length 5 (called as coding-vector).  Among elements of
2916    this vector, the first (element[0]) and the fifth (element[4])
2917    carry important information for decoding/encoding.  Before
2918    decoding/encoding, this information should be set in fields of a
2919    structure of type `coding_system'.
2920
2921    A value of property `coding-system' can be a symbol of another
2922    subsidiary coding-system.  In that case, Emacs gets coding-vector
2923    from that symbol.
2924
2925    `element[0]' contains information to be set in `coding->type'.  The
2926    value and its meaning is as follows:
2927
2928    0 -- coding_type_emacs_mule
2929    1 -- coding_type_sjis
2930    2 -- coding_type_iso2022
2931    3 -- coding_type_big5
2932    4 -- coding_type_ccl encoder/decoder written in CCL
        5 -- coding_type_raw_text
2933    nil -- coding_type_no_conversion
2934    t -- coding_type_undecided (automatic conversion on decoding,
2935                                no-conversion on encoding)
2936
2937    `element[4]' contains information to be set in `coding->flags' and
2938    `coding->spec'.  The meaning varies by `coding->type'.
2939
2940    If `coding->type' is `coding_type_iso2022', element[4] is a vector
2941    of length 32 (of which the first 17 sub-elements are used now).
2942    Meanings of these sub-elements are:
2943
2944    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
2945         If the value is an integer of valid charset, the charset is
2946         assumed to be designated to graphic register N initially.
2947
2948         If the value is t, no charset is designated to graphic
2949         register N initially, but the register may be used by any
2950         charset.  If the value is a list, its first element is
2951         designated initially and the rest are designated on request.
2952
2953         If the value is nil, graphic register N is never used on
2954         encoding.
2955
2956    sub-element[N] where N is 4 through 16: to be set in `coding->flags'
2957         Each value takes t or nil.  See the section ISO2022 of
2958         `coding.h' for more information.
2959
2960    If `coding->type' is `coding_type_big5', element[4] is t to denote
2961    BIG5-ETen or nil to denote BIG5-HKU.
2962
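        If `coding->type' is `coding_type_ccl', element[4] is a cons whose
        car and cdr specify the CCL decoder and encoder respectively.

2963    If `coding->type' takes any other value, element[4] is ignored.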
2964
2965    Emacs Lisp's coding system also carries information about format of
2966    end-of-line in a value of property `eol-type'.  If the value is
2967    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
2968    means CODING_EOL_CR.  If it is not integer, it should be a vector
2969    of subsidiary coding systems of which property `eol-type' has one
2970    of above values.
2971
2972 */
2973
2974 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
2975    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
2976    is setup so that no conversion is necessary and return -1, else
2977    return 0.

        CODING is cleared first, so any previous contents are lost.  A nil
        CODING_SYSTEM is treated as invalid.  In the invalid case CODING
        ends up with type coding_type_no_conversion, category
        CODING_CATEGORY_IDX_BINARY, LF end-of-line, no common flags and
        no pre-write/post-read conversion functions.  */
2978
2979 int
2980 setup_coding_system (coding_system, coding)
2981      Lisp_Object coding_system;
2982      struct coding_system *coding;
2983 {
2984   Lisp_Object coding_spec, coding_type, eol_type, plist;
2985   Lisp_Object val;
2986   int i;
2987
2988   /* At first, zero clear all members.  */
2989   bzero (coding, sizeof (struct coding_system));
2990
2991   /* Initialize some fields required for all kinds of coding systems.  */
2992   coding->symbol = coding_system;
2993   coding->heading_ascii = -1;
2994   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
2995   coding->composing = COMPOSITION_DISABLED;
2996   coding->cmp_data = NULL;
2997
2998   if (NILP (coding_system))
2999     goto label_invalid_coding_system;
3000
3001   coding_spec = Fget (coding_system, Qcoding_system);
3002
       /* The coding-vector must have exactly 5 elements, the fourth of
          which (the property list) must be a cons.  */
3003   if (!VECTORP (coding_spec)
3004       || XVECTOR (coding_spec)->size != 5
3005       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3006     goto label_invalid_coding_system;
3007
       /* A vector `eol-type' means the end-of-line format is still to be
          detected; 1 and 2 select CRLF and CR; anything else, including
          the nil used when end-of-line conversion is inhibited, means LF.
          INTEGERP keeps XFASTINT from being applied to a non-integer.  */
3008   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3009   if (VECTORP (eol_type))
3010     {
3011       coding->eol_type = CODING_EOL_UNDECIDED;
3012       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3013     }
3014   else if (INTEGERP (eol_type) && XFASTINT (eol_type) == 1)
3015     {
3016       coding->eol_type = CODING_EOL_CRLF;
3017       coding->common_flags
3018         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3019     }
3020   else if (INTEGERP (eol_type) && XFASTINT (eol_type) == 2)
3021     {
3022       coding->eol_type = CODING_EOL_CR;
3023       coding->common_flags
3024         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3025     }
3026   else
3027     coding->eol_type = CODING_EOL_LF;
3028
3029   coding_type = XVECTOR (coding_spec)->contents[0];
3030   /* Try short cut.  */
       /* A symbol type is either t (undecided) or means no conversion.
          This path returns before the property list is examined.  */
3031   if (SYMBOLP (coding_type))
3032     {
3033       if (EQ (coding_type, Qt))
3034         {
3035           coding->type = coding_type_undecided;
3036           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3037         }
3038       else
3039         coding->type = coding_type_no_conversion;
3040       /* Initialize this member.  Any thing other than
3041          CODING_CATEGORY_IDX_UTF_16_BE and
3042          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3043          special treatment in detect_eol.  */
3044       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3045
3046       return 0;
3047     }
3048
3049   /* Get values of coding system properties:
3050      `post-read-conversion', `pre-write-conversion',
3051      `translation-table-for-decode', `translation-table-for-encode'.  */
3052   plist = XVECTOR (coding_spec)->contents[3];
3053   /* Pre & post conversion functions should be disabled if
3054      inhibit_pre_post_conversion is nonzero.  This is the case that a
3055      code conversion function is called while those functions are running.  */
3056   if (! inhibit_pre_post_conversion)
3057     {
3058       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3059       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3060     }
       /* A translation table may be given directly or as a symbol whose
          property of the same name holds it; anything that is not a
          char-table is ignored.  */
3061   val = Fplist_get (plist, Qtranslation_table_for_decode);
3062   if (SYMBOLP (val))
3063     val = Fget (val, Qtranslation_table_for_decode);
3064   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3065   val = Fplist_get (plist, Qtranslation_table_for_encode);
3066   if (SYMBOLP (val))
3067     val = Fget (val, Qtranslation_table_for_encode);
3068   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
       /* The `coding-category' property is mandatory and must name a
          category that has an integer `coding-category-index'.  */
3069   val = Fplist_get (plist, Qcoding_category);
3070   if (!NILP (val))
3071     {
3072       val = Fget (val, Qcoding_category_index);
3073       if (INTEGERP (val))
3074         coding->category_idx = XINT (val);
3075       else
3076         goto label_invalid_coding_system;
3077     }
3078   else
3079     goto label_invalid_coding_system;
3080
3081   /* If the coding system has non-nil `composition' property, enable
3082      composition handling.  */
3083   val = Fplist_get (plist, Qcomposition);
3084   if (!NILP (val))
3085     coding->composing = COMPOSITION_NO;
3086
       /* A type that is neither a symbol nor an integer is invalid; send
          it to the default case instead of decoding it with XFASTINT.  */
3087   switch (INTEGERP (coding_type) ? XFASTINT (coding_type) : -1)
3088     {
3089     case 0:
3090       coding->type = coding_type_emacs_mule;
3091       if (!NILP (coding->post_read_conversion))
3092         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3093       if (!NILP (coding->pre_write_conversion))
3094         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3095       break;
3096
3097     case 1:
3098       coding->type = coding_type_sjis;
3099       coding->common_flags
3100         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3101       break;
3102
3103     case 2:
3104       coding->type = coding_type_iso2022;
3105       coding->common_flags
3106         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3107       {
             /* NOTE: VAL and I declared here shadow the function-level
                variables of the same names.  */
3108         Lisp_Object val, temp;
3109         Lisp_Object *flags;
3110         int i, charset, reg_bits = 0;
3111
3112         val = XVECTOR (coding_spec)->contents[4];
3113
3114         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3115           goto label_invalid_coding_system;
3116
             /* Sub-elements 4 through 16 are booleans, each mapped to one
                bit of `coding->flags'.  */
3117         flags = XVECTOR (val)->contents;
3118         coding->flags
3119           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3120              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3121              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3122              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3123              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3124              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3125              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3126              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3127              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3128              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3129              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3130              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3131              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3132              );
3133
3134         /* Invoke graphic register 0 to plane 0.  */
3135         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3136         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3137         CODING_SPEC_ISO_INVOCATION (coding, 1)
3138           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3139         /* Not single shifting at first.  */
3140         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3141         /* Beginning of buffer should also be regarded as bol. */
3142         CODING_SPEC_ISO_BOL (coding) = 1;
3143
             /* Default every revision number to 255, then take the values
                from `Vcharset_revision_alist' whose entries name a known
                charset and a revision I with 0 <= I and I + '@' < 128.  */
3144         for (charset = 0; charset <= MAX_CHARSET; charset++)
3145           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3146         val = Vcharset_revision_alist;
3147         while (CONSP (val))
3148           {
3149             charset = get_charset_id (Fcar_safe (XCAR (val)));
3150             if (charset >= 0
3151                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3152                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3153               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3154             val = XCDR (val);
3155           }
3156
3157         /* Checks FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
3158            FLAGS[REG] can be one of below:
3159                 integer CHARSET: CHARSET occupies register REG,
3160                 t: designate nothing to REG initially, but can be used
3161                   by any charsets,
3162                 list of integer, nil, or t: designate the first
3163                   element (if integer) to REG initially, the remaining
3164                   elements (if integer) is designated to REG on request,
3165                   if an element is t, REG can be used by any charsets,
3166                 nil: REG is never used.  */
3167         for (charset = 0; charset <= MAX_CHARSET; charset++)
3168           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3169             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3170         for (i = 0; i < 4; i++)
3171           {
                 /* The condition groups as (A && B) || C: try the value
                    as a charset number first, then through
                    get_charset_id.  */
3172             if (INTEGERP (flags[i])
3173                 && (charset = XINT (flags[i]), CHARSET_VALID_P (charset))
3174                 || (charset = get_charset_id (flags[i])) >= 0)
3175               {
3176                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3177                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3178               }
3179             else if (EQ (flags[i], Qt))
3180               {
3181                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3182                 reg_bits |= 1 << i;
3183                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3184               }
3185             else if (CONSP (flags[i]))
3186               {
3187                 Lisp_Object tail;
3188                 tail = flags[i];
3189
3190                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3191                 if (INTEGERP (XCAR (tail))
3192                     && (charset = XINT (XCAR (tail)),
3193                         CHARSET_VALID_P (charset))
3194                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3195                   {
3196                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3197                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3198                   }
3199                 else
3200                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3201                 tail = XCDR (tail);
3202                 while (CONSP (tail))
3203                   {
3204                     if (INTEGERP (XCAR (tail))
3205                         && (charset = XINT (XCAR (tail)),
3206                             CHARSET_VALID_P (charset))
3207                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3208                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3209                         = i;
3210                     else if (EQ (XCAR (tail), Qt))
3211                       reg_bits |= 1 << i;
3212                     tail = XCDR (tail);
3213                   }
3214               }
3215             else
3216               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3217
3218             CODING_SPEC_ISO_DESIGNATION (coding, i)
3219               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3220           }
3221
             /* REG_BITS has bit N set when register N may be used by any
                charset.  Drop the registers that can't be reached with
                the shifting functions this coding system allows.  */
3222         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3223           {
3224             /* REG 1 can be used only by locking shift in 7-bit env.  */
3225             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3226               reg_bits &= ~2;
3227             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3228               /* Without any shifting, only REG 0 and 1 can be used.  */
3229               reg_bits &= 3;
3230           }
3231
3232         if (reg_bits)
3233           for (charset = 0; charset <= MAX_CHARSET; charset++)
3234             {
3235               if (CHARSET_VALID_P (charset)
3236                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3237                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3238                 {
3239                   /* There exist some default graphic registers to be
3240                      used by CHARSET.  */
3241
3242                   /* We had better avoid designating a charset of
3243                      CHARS96 to REG 0 as far as possible.  */
3244                   if (CHARSET_CHARS (charset) == 96)
3245                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3246                       = (reg_bits & 2
3247                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3248                   else
3249                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3250                       = (reg_bits & 1
3251                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3252                 }
3253             }
3254       }
3255       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3256       coding->spec.iso2022.last_invalid_designation_register = -1;
3257       break;
3258
3259     case 3:
3260       coding->type = coding_type_big5;
3261       coding->common_flags
3262         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3263       coding->flags
3264         = (NILP (XVECTOR (coding_spec)->contents[4])
3265            ? CODING_FLAG_BIG5_HKU
3266            : CODING_FLAG_BIG5_ETEN);
3267       break;
3268
3269     case 4:
3270       coding->type = coding_type_ccl;
3271       coding->common_flags
3272         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3273       {
             /* element[4] must be a cons of the CCL decoder (car) and
                encoder (cdr); both must be accepted by
                setup_ccl_program.  */
3274         val = XVECTOR (coding_spec)->contents[4];
3275         if (! CONSP (val)
3276             || setup_ccl_program (&(coding->spec.ccl.decoder),
3277                                   XCAR (val)) < 0
3278             || setup_ccl_program (&(coding->spec.ccl.encoder),
3279                                   XCDR (val)) < 0)
3280           goto label_invalid_coding_system;
3281
             /* Mark the byte values listed in the `valid-codes' property.
                Each element is either a code in 0..255 or a cons
                (START . END) giving an inclusive range within 0..255;
                anything else is silently ignored.  */
3282         bzero (coding->spec.ccl.valid_codes, 256);
3283         val = Fplist_get (plist, Qvalid_codes);
3284         if (CONSP (val))
3285           {
3286             Lisp_Object this;
3287
3288             for (; CONSP (val); val = XCDR (val))
3289               {
3290                 this = XCAR (val);
3291                 if (INTEGERP (this)
3292                     && XINT (this) >= 0 && XINT (this) < 256)
3293                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3294                 else if (CONSP (this)
3295                          && INTEGERP (XCAR (this))
3296                          && INTEGERP (XCDR (this)))
3297                   {
3298                     int start = XINT (XCAR (this));
3299                     int end = XINT (XCDR (this));
3300
3301                     if (start >= 0 && start <= end && end < 256)
3302                       while (start <= end)
3303                         coding->spec.ccl.valid_codes[start++] = 1;
3304                   }
3305               }
3306           }
3307       }
3308       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3309       coding->spec.ccl.cr_carryover = 0;
3310       break;
3311
3312     case 5:
3313       coding->type = coding_type_raw_text;
3314       break;
3315
3316     default:
3317       goto label_invalid_coding_system;
3318     }
3319   return 0;
3320
3321  label_invalid_coding_system:
       /* Fall back to a setup that requires no conversion at all.
          `coding->symbol' keeps the value that was passed in.  */
3322   coding->type = coding_type_no_conversion;
3323   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3324   coding->common_flags = 0;
3325   coding->eol_type = CODING_EOL_LF;
3326   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3327   return -1;
3328 }
3329
3330 /* Release every block of composition data attached to CODING and
3331    reset CODING->cmp_data to NULL.  Does nothing if there is none.  */
3332
3333 void
3334 coding_free_composition_data (coding)
3335      struct coding_system *coding;
3336 {
3337   struct composition_data *block = coding->cmp_data, *following;
3338
3339   if (block == NULL)
3340     return;
3341   /* The blocks are linked through `prev' and `next'; back up to the head.  */
3342   for (; block->prev; block = block->prev)
3343     ;
3344   /* Walk forward, fetching each successor before its block is freed.  */
3345   for (; block; block = following)
3346     {
3347       following = block->next;
3348       xfree (block);
3349     }
3350   coding->cmp_data = NULL;
3351 }
3352
3353 /* Store POS in the `char_offset' member of the block CODING->cmp_data
3354    points to and of every block reachable from it through `next'.  */
3355
3356 void
3357 coding_adjust_composition_offset (coding, pos)
3358      struct coding_system *coding;
3359      int pos;
3360 {
3361   struct composition_data *block = coding->cmp_data;
3362
3363   for (; block != NULL; block = block->next)
3364     block->char_offset = pos;
3365 }
3366
3367 /* Switch CODING, which must already be set up for some coding system,
3368    over to raw-text.  When the end-of-line type of CODING is already
3369    decided, the matching subsidiary of raw-text is used instead of
3370    raw-text itself.  Nothing is done if CODING is raw-text already.  */
3371
3372 void
3373 setup_raw_text_coding_system (coding)
3374      struct coding_system *coding;
3375 {
3376   Lisp_Object eol_variants;
3377
3378   if (coding->type == coding_type_raw_text)
3379     return;
3380
3381   coding->symbol = Qraw_text;
3382   coding->type = coding_type_raw_text;
3383   if (coding->eol_type != CODING_EOL_UNDECIDED)
3384     {
3385       /* NOTE(review): the index below assumes eol_type is 0, 1 or 2
3386          here, i.e. a valid index into the 3-element vector -- confirm.  */
3387       eol_variants = Fget (Qraw_text, Qeol_type);
3388       if (VECTORP (eol_variants) && XVECTOR (eol_variants)->size == 3)
3389         coding->symbol = XVECTOR (eol_variants)->contents[coding->eol_type];
3390     }
3391   /* Redo the whole setup from the symbol chosen above.  */
3392   setup_coding_system (coding->symbol, coding);
3393 }
3394
3395 /* Emacs has a mechanism to automatically detect a coding system if it
3396    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3397    it's impossible to distinguish some coding systems accurately
3398    because they use the same range of codes.  So, at first, coding
3399    systems are categorized into the following 14 categories:
3400
3401    o coding-category-emacs-mule
3402
3403         The category for a coding system which has the same code range
3404         as Emacs' internal format.  Assigned the coding-system (Lisp
3405         symbol) `emacs-mule' by default.
3406
3407    o coding-category-sjis
3408
3409         The category for a coding system which has the same code range
3410         as SJIS.  Assigned the coding-system (Lisp
3411         symbol) `japanese-shift-jis' by default.
3412
3413    o coding-category-iso-7
3414
3415         The category for a coding system which has the same code range
3416         as ISO2022 of 7-bit environment.  This doesn't use any locking
3417         shift and single shift functions.  This can encode/decode all
3418         charsets.  Assigned the coding-system (Lisp symbol)
3419         `iso-2022-7bit' by default.
3420
3421    o coding-category-iso-7-tight
3422
3423         Same as coding-category-iso-7 except that this can
3424         encode/decode only the specified charsets.
3425
3426    o coding-category-iso-8-1
3427
3428         The category for a coding system which has the same code range
3429         as ISO2022 of 8-bit environment and graphic plane 1 used only
3430         for DIMENSION1 charset.  This doesn't use any locking shift
3431         and single shift functions.  Assigned the coding-system (Lisp
3432         symbol) `iso-latin-1' by default.
3433
3434    o coding-category-iso-8-2
3435
3436         The category for a coding system which has the same code range
3437         as ISO2022 of 8-bit environment and graphic plane 1 used only
3438         for DIMENSION2 charset.  This doesn't use any locking shift
3439         and single shift functions.  Assigned the coding-system (Lisp
3440         symbol) `japanese-iso-8bit' by default.
3441
3442    o coding-category-iso-7-else
3443
3444         The category for a coding system which has the same code range
3445         as ISO2022 of 7-bit environment but uses locking shift or
3446         single shift functions.  Assigned the coding-system (Lisp
3447         symbol) `iso-2022-7bit-lock' by default.
3448
3449    o coding-category-iso-8-else
3450
3451         The category for a coding system which has the same code range
3452         as ISO2022 of 8-bit environment but uses locking shift or
3453         single shift functions.  Assigned the coding-system (Lisp
3454         symbol) `iso-2022-8bit-ss2' by default.
3455
3456    o coding-category-big5
3457
3458         The category for a coding system which has the same code range
3459         as BIG5.  Assigned the coding-system (Lisp symbol)
3460         `cn-big5' by default.
3461
3462    o coding-category-utf-8
3463
3464         The category for a coding system which has the same code range
3465         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
3466         symbol) `utf-8' by default.
3467
3468    o coding-category-utf-16-be
3469
3470         The category for a coding system in which a text has an
3471         Unicode signature (cf. Unicode Standard) in the order of BIG
3472         endian at the head.  Assigned the coding-system (Lisp symbol)
3473         `utf-16-be' by default.
3474
3475    o coding-category-utf-16-le
3476
3477         The category for a coding system in which a text has an
3478         Unicode signature (cf. Unicode Standard) in the order of
3479         LITTLE endian at the head.  Assigned the coding-system (Lisp
3480         symbol) `utf-16-le' by default.
3481
3482    o coding-category-ccl
3483
3484         The category for a coding system of which encoder/decoder is
3485         written in CCL programs.  The default value is nil, i.e., no
3486         coding system is assigned.
3487
3488    o coding-category-binary
3489
3490         The category for a coding system not categorized in any of the
3491         above.  Assigned the coding-system (Lisp symbol)
3492         `no-conversion' by default.
3493
3494    Each of them is a Lisp symbol whose value is the actual
3495    `coding-system' (also a Lisp symbol) assigned to it by a user.
3496    What Emacs actually does is detect the category of a text, and
3497    then use the `coding-system' assigned to that category.  If Emacs
3498    can't narrow the choice down to a single category, it selects the
3499    category of the highest priority.  Priorities of categories are
3500    also specified by a user in the Lisp variable `coding-category-list'.
3501
3502 */
3503
     /* Table indexed by byte value.  detect_coding_mask skips over leading
        bytes whose entry is nonzero, and changes the entries for
        ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC while it scans.  */
3504 static
3505 int ascii_skip_code[256];
3506
3507 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
3508    If it detects possible coding systems, return an integer in which
3509    appropriate flag bits are set.  Flag bits are defined by macros
3510    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
3511    it should point the table `coding_priorities'.  In that case, only
3512    the flag bit for a coding system of the highest priority is set in
3513    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
3514    range 0x80..0x9F are in multibyte form.
3515
3516    How many ASCII characters are at the head is returned as *SKIP.  */
3517
3518 static int
3519 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
3520      unsigned char *source;
3521      int src_bytes, *priorities, *skip;
3522      int multibytep;
3523 {
3524   register unsigned char c;
3525   unsigned char *src = source, *src_end = source + src_bytes;
3526   unsigned int mask, utf16_examined_p, iso2022_examined_p;
3527   int i, idx;
3528
3529   /* At first, skip all ASCII characters and control characters except
3530      for three ISO2022 specific control characters.  */
3531   ascii_skip_code[ISO_CODE_SO] = 0;
3532   ascii_skip_code[ISO_CODE_SI] = 0;
3533   ascii_skip_code[ISO_CODE_ESC] = 0;
3534
3535  label_loop_detect_coding:
3536   while (src < src_end && ascii_skip_code[*src]) src++;
3537   *skip = src - source;
3538
3539   if (src >= src_end)
3540     /* We found nothing other than ASCII.  There's nothing to do.  */
3541     return 0;
3542
3543   c = *src;
3544   /* The text seems to be encoded in some multilingual coding system.
3545      Now, try to find in which coding system the text is encoded.  */
3546   if (c < 0x80)
3547     {
3548       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
3549       /* C is an ISO2022 specific control code of C0.  */
3550       mask = detect_coding_iso2022 (src, src_end, multibytep);
3551       if (mask == 0)
3552         {
3553           /* No valid ISO2022 code follows C.  Try again.  */
3554           src++;
3555           if (c == ISO_CODE_ESC)
3556             ascii_skip_code[ISO_CODE_ESC] = 1;
3557           else
3558             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
3559           goto label_loop_detect_coding;
3560         }
3561       if (priorities)
3562         {
3563           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3564             {
3565               if (mask & priorities[i])
3566                 return priorities[i];
3567             }
3568           return CODING_CATEGORY_MASK_RAW_TEXT;
3569         }
3570     }
3571   else
3572     {
3573       int try;
3574
3575       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
3576         c = *src++ - 0x20;
3577
3578       if (c < 0xA0)
3579         {
3580           /* C is the first byte of SJIS character code,
3581              or a leading-code of Emacs' internal format (emacs-mule),
3582              or the first byte of UTF-16.  */
3583           try = (CODING_CATEGORY_MASK_SJIS
3584                   | CODING_CATEGORY_MASK_EMACS_MULE
3585                   | CODING_CATEGORY_MASK_UTF_16_BE
3586                   | CODING_CATEGORY_MASK_UTF_16_LE);
3587
3588           /* Or, if C is a special latin extra code,
3589              or is an ISO2022 specific control code of C1 (SS2 or SS3),
3590              or is an ISO2022 control-sequence-introducer (CSI),
3591              we should also consider the possibility of ISO2022 codings.  */
3592           if ((VECTORP (Vlatin_extra_code_table)
3593                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3594               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
3595               || (c == ISO_CODE_CSI
3596                   && (src < src_end
3597                       && (*src == ']'
3598                           || ((*src == '0' || *src == '1' || *src == '2')
3599                               && src + 1 < src_end
3600                               && src[1] == ']')))))
3601             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
3602                      | CODING_CATEGORY_MASK_ISO_8BIT);
3603         }
3604       else
3605         /* C is a character of ISO2022 in graphic plane right,
3606            or a SJIS's 1-byte character code (i.e. JISX0201),
3607            or the first byte of BIG5's 2-byte code,
3608            or the first byte of UTF-8/16.  */
3609         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
3610                 | CODING_CATEGORY_MASK_ISO_8BIT
3611                 | CODING_CATEGORY_MASK_SJIS
3612                 | CODING_CATEGORY_MASK_BIG5
3613                 | CODING_CATEGORY_MASK_UTF_8
3614                 | CODING_CATEGORY_MASK_UTF_16_BE
3615                 | CODING_CATEGORY_MASK_UTF_16_LE);
3616
3617       /* Or, we may have to consider the possibility of CCL.  */
3618       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
3619           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
3620               ->spec.ccl.valid_codes)[c])
3621         try |= CODING_CATEGORY_MASK_CCL;
3622
3623       mask = 0;
3624       utf16_examined_p = iso2022_examined_p = 0;
3625       if (priorities)
3626         {
3627           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
3628             {
3629               if (!iso2022_examined_p
3630                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
3631                 {
3632                   mask |= detect_coding_iso2022 (src, src_end);
3633                   iso2022_examined_p = 1;
3634                 }
3635               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
3636                 mask |= detect_coding_sjis (src, src_end, multibytep);
3637               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
3638                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
3639               else if (!utf16_examined_p
3640                        && (priorities[i] & try &
3641                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
3642                 {
3643                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
3644                   utf16_examined_p = 1;
3645                 }
3646               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
3647                 mask |= detect_coding_big5 (src, src_end, multibytep);
3648               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
3649                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
3650               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
3651                 mask |= detect_coding_ccl (src, src_end, multibytep);
3652               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
3653                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
3654               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
3655                 mask |= CODING_CATEGORY_MASK_BINARY;
3656               if (mask & priorities[i])
3657                 return priorities[i];
3658             }
3659           return CODING_CATEGORY_MASK_RAW_TEXT;
3660         }
3661       if (try & CODING_CATEGORY_MASK_ISO)
3662         mask |= detect_coding_iso2022 (src, src_end, multibytep);
3663       if (try & CODING_CATEGORY_MASK_SJIS)
3664         mask |= detect_coding_sjis (src, src_end, multibytep);
3665       if (try & CODING_CATEGORY_MASK_BIG5)
3666         mask |= detect_coding_big5 (src, src_end, multibytep);
3667       if (try & CODING_CATEGORY_MASK_UTF_8)
3668         mask |= detect_coding_utf_8 (src, src_end, multibytep);
3669       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
3670         mask |= detect_coding_utf_16 (src, src_end, multibytep);
3671       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
3672         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
3673       if (try & CODING_CATEGORY_MASK_CCL)
3674         mask |= detect_coding_ccl (src, src_end, multibytep);
3675     }
3676   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
3677 }
3678
3679 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
3680    The information of the detected coding system is set in CODING.  */
3681
3682 void
3683 detect_coding (coding, src, src_bytes)
3684      struct coding_system *coding;
3685      unsigned char *src;
3686      int src_bytes;
3687 {
3688   unsigned int idx;
3689   int skip, mask, i;
3690   Lisp_Object val;
3691
3692   val = Vcoding_category_list;
3693   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
3694                              coding->src_multibyte);
3695   coding->heading_ascii = skip;
3696
3697   if (!mask) return;
3698
3699   /* We found a single coding system of the highest priority in MASK.  */
3700   idx = 0;
3701   while (mask && ! (mask & 1)) mask >>= 1, idx++;
3702   if (! mask)
3703     idx = CODING_CATEGORY_IDX_RAW_TEXT;
3704
3705   val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[idx])->value;
3706
3707   if (coding->eol_type != CODING_EOL_UNDECIDED)
3708     {
3709       Lisp_Object tmp;
3710
3711       tmp = Fget (val, Qeol_type);
3712       if (VECTORP (tmp))
3713         val = XVECTOR (tmp)->contents[coding->eol_type];
3714     }
3715
3716   /* Setup this new coding system while preserving some slots.  */
3717   {
3718     int src_multibyte = coding->src_multibyte;
3719     int dst_multibyte = coding->dst_multibyte;
3720
3721     setup_coding_system (val, coding);
3722     coding->src_multibyte = src_multibyte;
3723     coding->dst_multibyte = dst_multibyte;
3724     coding->heading_ascii = skip;
3725   }
3726 }
3727
3728 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
3729    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
3730    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
3731
3732    How many non-eol characters are at the head is returned as *SKIP.  */
3733
3734 #define MAX_EOL_CHECK_COUNT 3
3735
3736 static int
3737 detect_eol_type (source, src_bytes, skip)
3738      unsigned char *source;
3739      int src_bytes, *skip;
3740 {
3741   unsigned char *src = source, *src_end = src + src_bytes;
3742   unsigned char c;
3743   int total = 0;                /* How many end-of-lines are found so far.  */
3744   int eol_type = CODING_EOL_UNDECIDED;
3745   int this_eol_type;
3746
3747   *skip = 0;
3748
3749   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
3750     {
3751       c = *src++;
3752       if (c == '\n' || c == '\r')
3753         {
3754           if (*skip == 0)
3755             *skip = src - 1 - source;
3756           total++;
3757           if (c == '\n')
3758             this_eol_type = CODING_EOL_LF;
3759           else if (src >= src_end || *src != '\n')
3760             this_eol_type = CODING_EOL_CR;
3761           else
3762             this_eol_type = CODING_EOL_CRLF, src++;
3763
3764           if (eol_type == CODING_EOL_UNDECIDED)
3765             /* This is the first end-of-line.  */
3766             eol_type = this_eol_type;
3767           else if (eol_type != this_eol_type)
3768             {
3769               /* The found type is different from what found before.  */
3770               eol_type = CODING_EOL_INCONSISTENT;
3771               break;
3772             }
3773         }
3774     }
3775
3776   if (*skip == 0)
3777     *skip = src_end - source;
3778   return eol_type;
3779 }
3780
3781 /* Like detect_eol_type, but detect EOL type in 2-octet
3782    big-endian/little-endian format for coding systems utf-16-be and
3783    utf-16-le.  */
3784
3785 static int
3786 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
3787      unsigned char *source;
3788      int src_bytes, *skip;
3789 {
3790   unsigned char *src = source, *src_end = src + src_bytes;
3791   unsigned int c1, c2;
3792   int total = 0;                /* How many end-of-lines are found so far.  */
3793   int eol_type = CODING_EOL_UNDECIDED;
3794   int this_eol_type;
3795   int msb, lsb;
3796
3797   if (big_endian_p)
3798     msb = 0, lsb = 1;
3799   else
3800     msb = 1, lsb = 0;
3801
3802   *skip = 0;
3803
3804   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
3805     {
3806       c1 = (src[msb] << 8) | (src[lsb]);
3807       src += 2;
3808
3809       if (c1 == '\n' || c1 == '\r')
3810         {
3811           if (*skip == 0)
3812             *skip = src - 2 - source;
3813           total++;
3814           if (c1 == '\n')
3815             {
3816               this_eol_type = CODING_EOL_LF;
3817             }
3818           else
3819             {
3820               if ((src + 1) >= src_end)
3821                 {
3822                   this_eol_type = CODING_EOL_CR;
3823                 }
3824               else
3825                 {
3826                   c2 = (src[msb] << 8) | (src[lsb]);
3827                   if (c2 == '\n')
3828                     this_eol_type = CODING_EOL_CRLF, src += 2;
3829                   else
3830                     this_eol_type = CODING_EOL_CR;
3831                 }
3832             }
3833
3834           if (eol_type == CODING_EOL_UNDECIDED)
3835             /* This is the first end-of-line.  */
3836             eol_type = this_eol_type;
3837           else if (eol_type != this_eol_type)
3838             {
3839               /* The found type is different from what found before.  */
3840               eol_type = CODING_EOL_INCONSISTENT;
3841               break;
3842             }
3843         }
3844     }
3845
3846   if (*skip == 0)
3847     *skip = src_end - source;
3848   return eol_type;
3849 }
3850
3851 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
3852    is encoded.  If it detects an appropriate format of end-of-line, it
3853    sets the information in *CODING.  */
3854
3855 void
3856 detect_eol (coding, src, src_bytes)
3857      struct coding_system *coding;
3858      unsigned char *src;
3859      int src_bytes;
3860 {
3861   Lisp_Object val;
3862   int skip;
3863   int eol_type;
3864
3865   switch (coding->category_idx)
3866     {
3867     case CODING_CATEGORY_IDX_UTF_16_BE:
3868       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
3869       break;
3870     case CODING_CATEGORY_IDX_UTF_16_LE:
3871       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
3872       break;
3873     default:
3874       eol_type = detect_eol_type (src, src_bytes, &skip);
3875       break;
3876     }
3877
3878   if (coding->heading_ascii > skip)
3879     coding->heading_ascii = skip;
3880   else
3881     skip = coding->heading_ascii;
3882
3883   if (eol_type == CODING_EOL_UNDECIDED)
3884     return;
3885   if (eol_type == CODING_EOL_INCONSISTENT)
3886     {
3887 #if 0
3888       /* This code is suppressed until we find a better way to
3889          distinguish raw text file and binary file.  */
3890
3891       /* If we have already detected that the coding is raw-text, the
3892          coding should actually be no-conversion.  */
3893       if (coding->type == coding_type_raw_text)
3894         {
3895           setup_coding_system (Qno_conversion, coding);
3896           return;
3897         }
3898       /* Else, let's decode only text code anyway.  */
3899 #endif /* 0 */
3900       eol_type = CODING_EOL_LF;
3901     }
3902
3903   val = Fget (coding->symbol, Qeol_type);
3904   if (VECTORP (val) && XVECTOR (val)->size == 3)
3905     {
3906       int src_multibyte = coding->src_multibyte;
3907       int dst_multibyte = coding->dst_multibyte;
3908
3909       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
3910       coding->src_multibyte = src_multibyte;
3911       coding->dst_multibyte = dst_multibyte;
3912       coding->heading_ascii = skip;
3913     }
3914 }
3915
/* Extra bytes added to every estimated conversion buffer size.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* How many times larger than the source the decoded text may get
   for CODING (a pointer to struct coding_system): 3 for ISO2022, the
   CCL decoder's own magnification for CCL, 2 for everything else.
   CODING is evaluated more than once, so it must have no side
   effects.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
3924
3925 /* Return maximum size (bytes) of a buffer enough for decoding
3926    SRC_BYTES of text encoded in CODING.  */
3927
3928 int
3929 decoding_buffer_size (coding, src_bytes)
3930      struct coding_system *coding;
3931      int src_bytes;
3932 {
3933   return (src_bytes * DECODING_BUFFER_MAG (coding)
3934           + CONVERSION_BUFFER_EXTRA_ROOM);
3935 }
3936
3937 /* Return maximum size (bytes) of a buffer enough for encoding
3938    SRC_BYTES of text to CODING.  */
3939
3940 int
3941 encoding_buffer_size (coding, src_bytes)
3942      struct coding_system *coding;
3943      int src_bytes;
3944 {
3945   int magnification;
3946
3947   if (coding->type == coding_type_ccl)
3948     magnification = coding->spec.ccl.encoder.buf_magnification;
3949   else if (CODING_REQUIRE_ENCODING (coding))
3950     magnification = 3;
3951   else
3952     magnification = 1;
3953
3954   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
3955 }
3956
/* Scratch buffer used while converting text.  It is filled in by
   allocate_conversion_buffer, grown by extend_conversion_buffer and
   released by free_conversion_buffer.  */
struct conversion_buffer
{
  /* Number of bytes DATA points to.  */
  int size;
  /* 1 if DATA was obtained from alloca, 0 if it is heap memory.  */
  int on_stack;
  /* The storage itself.  */
  unsigned char *data;
};
3964
/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  The value is parenthesized so that it
   can be used safely inside any expression.  */
#define MAX_ALLOCA (16*1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer,
   passed as an lvalue, not a pointer).  Requests smaller than
   MAX_ALLOCA are served by alloca, in which case the memory belongs
   to the calling function's frame; larger ones come from xmalloc.
   BUF.on_stack records which was used.  LEN is evaluated more than
   once, so it must have no side effects.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
3984
3985 /* Double the allocated memory for *BUF.  */
3986 static void
3987 extend_conversion_buffer (buf)
3988      struct conversion_buffer *buf;
3989 {
3990   if (buf->on_stack)
3991     {
3992       unsigned char *save = buf->data;
3993       buf->data = (unsigned char *) xmalloc (buf->size * 2);
3994       bcopy (save, buf->data, buf->size);
3995       buf->on_stack = 0;
3996     }
3997   else
3998     {
3999       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4000     }
4001   buf->size *= 2;
4002 }
4003
4004 /* Free the allocated memory for BUF if it is not on stack.  */
4005 static void
4006 free_conversion_buffer (buf)
4007      struct conversion_buffer *buf;
4008 {
4009   if (!buf->on_stack)
4010     xfree (buf->data);
4011 }
4012
4013 int
4014 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4015      struct coding_system *coding;
4016      unsigned char *source, *destination;
4017      int src_bytes, dst_bytes, encodep;
4018 {
4019   struct ccl_program *ccl
4020     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4021   int result;
4022
4023   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4024   if (encodep)
4025     {
4026       /* On encoding, EOL format is converted within ccl_driver.  For
4027          that, setup proper information in the structure CCL.  */
4028       ccl->eol_type = coding->eol_type;
4029       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4030         ccl->eol_type = CODING_EOL_LF;
4031       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4032     }
4033   ccl->multibyte = coding->src_multibyte;
4034   coding->produced = ccl_driver (ccl, source, destination,
4035                                  src_bytes, dst_bytes, &(coding->consumed));
4036   if (encodep)
4037     {
4038       coding->produced_char = coding->produced;
4039       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4040     }
4041   else
4042     {
4043       int bytes
4044         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4045       coding->produced = str_as_multibyte (destination, bytes,
4046                                            coding->produced,
4047                                            &(coding->produced_char));
4048     }
4049
4050   switch (ccl->status)
4051     {
4052     case CCL_STAT_SUSPEND_BY_SRC:
4053       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4054       break;
4055     case CCL_STAT_SUSPEND_BY_DST:
4056       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4057       break;
4058     case CCL_STAT_QUIT:
4059     case CCL_STAT_INVALID_CMD:
4060       coding->result = CODING_FINISH_INTERRUPT;
4061       break;
4062     default:
4063       coding->result = CODING_FINISH_NORMAL;
4064       break;
4065     }
4066   return coding->result;
4067 }
4068
4069 /* Decode EOL format of the text at PTR of BYTES length destructively
4070    according to CODING->eol_type.  This is called after the CCL
4071    program produced a decoded text at PTR.  If we do CRLF->LF
4072    conversion, update CODING->produced and CODING->produced_char.  */
4073
4074 static void
4075 decode_eol_post_ccl (coding, ptr, bytes)
4076      struct coding_system *coding;
4077      unsigned char *ptr;
4078      int bytes;
4079 {
4080   Lisp_Object val, saved_coding_symbol;
4081   unsigned char *pend = ptr + bytes;
4082   int dummy;
4083
4084   /* Remember the current coding system symbol.  We set it back when
4085      an inconsistent EOL is found so that `last-coding-system-used' is
4086      set to the coding system that doesn't specify EOL conversion.  */
4087   saved_coding_symbol = coding->symbol;
4088
4089   coding->spec.ccl.cr_carryover = 0;
4090   if (coding->eol_type == CODING_EOL_UNDECIDED)
4091     {
4092       /* Here, to avoid the call of setup_coding_system, we directly
4093          call detect_eol_type.  */
4094       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4095       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4096         coding->eol_type = CODING_EOL_LF;
4097       if (coding->eol_type != CODING_EOL_UNDECIDED)
4098         {
4099           val = Fget (coding->symbol, Qeol_type);
4100           if (VECTORP (val) && XVECTOR (val)->size == 3)
4101             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4102         }
4103       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4104     }
4105
4106   if (coding->eol_type == CODING_EOL_LF
4107       || coding->eol_type == CODING_EOL_UNDECIDED)
4108     {
4109       /* We have nothing to do.  */
4110       ptr = pend;
4111     }
4112   else if (coding->eol_type == CODING_EOL_CRLF)
4113     {
4114       unsigned char *pstart = ptr, *p = ptr;
4115
4116       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4117           && *(pend - 1) == '\r')
4118         {
4119           /* If the last character is CR, we can't handle it here
4120              because LF will be in the not-yet-decoded source text.
4121              Recorded that the CR is not yet processed.  */
4122           coding->spec.ccl.cr_carryover = 1;
4123           coding->produced--;
4124           coding->produced_char--;
4125           pend--;
4126         }
4127       while (ptr < pend)
4128         {
4129           if (*ptr == '\r')
4130             {
4131               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4132                 {
4133                   *p++ = '\n';
4134                   ptr += 2;
4135                 }
4136               else
4137                 {
4138                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4139                     goto undo_eol_conversion;
4140                   *p++ = *ptr++;
4141                 }
4142             }
4143           else if (*ptr == '\n'
4144                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4145             goto undo_eol_conversion;
4146           else
4147             *p++ = *ptr++;
4148           continue;
4149
4150         undo_eol_conversion:
4151           /* We have faced with inconsistent EOL format at PTR.
4152              Convert all LFs before PTR back to CRLFs.  */
4153           for (p--, ptr--; p >= pstart; p--)
4154             {
4155               if (*p == '\n')
4156                 *ptr-- = '\n', *ptr-- = '\r';
4157               else
4158                 *ptr-- = *p;
4159             }
4160           /*  If carryover is recorded, cancel it because we don't
4161               convert CRLF anymore.  */
4162           if (coding->spec.ccl.cr_carryover)
4163             {
4164               coding->spec.ccl.cr_carryover = 0;
4165               coding->produced++;
4166               coding->produced_char++;
4167               pend++;
4168             }
4169           p = ptr = pend;
4170           coding->eol_type = CODING_EOL_LF;
4171           coding->symbol = saved_coding_symbol;
4172         }
4173       if (p < pend)
4174         {
4175           /* As each two-byte sequence CRLF was converted to LF, (PEND
4176              - P) is the number of deleted characters.  */
4177           coding->produced -= pend - p;
4178           coding->produced_char -= pend - p;
4179         }
4180     }
4181   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4182     {
4183       unsigned char *p = ptr;
4184
4185       for (; ptr < pend; ptr++)
4186         {
4187           if (*ptr == '\r')
4188             *ptr = '\n';
4189           else if (*ptr == '\n'
4190                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4191             {
4192               for (; p < ptr; p++)
4193                 {
4194                   if (*p == '\n')
4195                     *p = '\r';
4196                 }
4197               ptr = pend;
4198               coding->eol_type = CODING_EOL_LF;
4199               coding->symbol = saved_coding_symbol;
4200             }
4201         }
4202     }
4203 }
4204
4205 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4206    decoding, it may detect coding system and format of end-of-line if
4207    those are not yet decided.  The source should be unibyte, the
4208    result is multibyte if CODING->dst_multibyte is nonzero, else
4209    unibyte.  */
4210
4211 int
4212 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4213      struct coding_system *coding;
4214      unsigned char *source, *destination;
4215      int src_bytes, dst_bytes;
4216 {
4217   if (coding->type == coding_type_undecided)
4218     detect_coding (coding, source, src_bytes);
4219
4220   if (coding->eol_type == CODING_EOL_UNDECIDED
4221       && coding->type != coding_type_ccl)
4222     detect_eol (coding, source, src_bytes);
4223
4224   coding->produced = coding->produced_char = 0;
4225   coding->consumed = coding->consumed_char = 0;
4226   coding->errors = 0;
4227   coding->result = CODING_FINISH_NORMAL;
4228
4229   switch (coding->type)
4230     {
4231     case coding_type_sjis:
4232       decode_coding_sjis_big5 (coding, source, destination,
4233                                src_bytes, dst_bytes, 1);
4234       break;
4235
4236     case coding_type_iso2022:
4237       decode_coding_iso2022 (coding, source, destination,
4238                              src_bytes, dst_bytes);
4239       break;
4240
4241     case coding_type_big5:
4242       decode_coding_sjis_big5 (coding, source, destination,
4243                                src_bytes, dst_bytes, 0);
4244       break;
4245
4246     case coding_type_emacs_mule:
4247       decode_coding_emacs_mule (coding, source, destination,
4248                                 src_bytes, dst_bytes);
4249       break;
4250
4251     case coding_type_ccl:
4252       if (coding->spec.ccl.cr_carryover)
4253         {
4254           /* Set the CR which is not processed by the previous call of
4255              decode_eol_post_ccl in DESTINATION.  */
4256           *destination = '\r';
4257           coding->produced++;
4258           coding->produced_char++;
4259           dst_bytes--;
4260         }
4261       ccl_coding_driver (coding, source,
4262                          destination + coding->spec.ccl.cr_carryover,
4263                          src_bytes, dst_bytes, 0);
4264       if (coding->eol_type != CODING_EOL_LF)
4265         decode_eol_post_ccl (coding, destination, coding->produced);
4266       break;
4267
4268     default:
4269       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4270     }
4271
4272   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4273       && coding->mode & CODING_MODE_LAST_BLOCK
4274       && coding->consumed == src_bytes)
4275     coding->result = CODING_FINISH_NORMAL;
4276
4277   if (coding->mode & CODING_MODE_LAST_BLOCK
4278       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4279     {
4280       unsigned char *src = source + coding->consumed;
4281       unsigned char *dst = destination + coding->produced;
4282
4283       src_bytes -= coding->consumed;
4284       coding->errors++;
4285       if (COMPOSING_P (coding))
4286         DECODE_COMPOSITION_END ('1');
4287       while (src_bytes--)
4288         {
4289           int c = *src++;
4290           dst += CHAR_STRING (c, dst);
4291           coding->produced_char++;
4292         }
4293       coding->consumed = coding->consumed_char = src - source;
4294       coding->produced = dst - destination;
4295       coding->result = CODING_FINISH_NORMAL;
4296     }
4297
4298   if (!coding->dst_multibyte)
4299     {
4300       coding->produced = str_as_unibyte (destination, coding->produced);
4301       coding->produced_char = coding->produced;
4302     }
4303
4304   return coding->result;
4305 }
4306
4307 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4308    multibyteness of the source is CODING->src_multibyte, the
4309    multibyteness of the result is always unibyte.  */
4310
4311 int
4312 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4313      struct coding_system *coding;
4314      unsigned char *source, *destination;
4315      int src_bytes, dst_bytes;
4316 {
4317   coding->produced = coding->produced_char = 0;
4318   coding->consumed = coding->consumed_char = 0;
4319   coding->errors = 0;
4320   coding->result = CODING_FINISH_NORMAL;
4321
4322   switch (coding->type)
4323     {
4324     case coding_type_sjis:
4325       encode_coding_sjis_big5 (coding, source, destination,
4326                                src_bytes, dst_bytes, 1);
4327       break;
4328
4329     case coding_type_iso2022:
4330       encode_coding_iso2022 (coding, source, destination,
4331                              src_bytes, dst_bytes);
4332       break;
4333
4334     case coding_type_big5:
4335       encode_coding_sjis_big5 (coding, source, destination,
4336                                src_bytes, dst_bytes, 0);
4337       break;
4338
4339     case coding_type_emacs_mule:
4340       encode_coding_emacs_mule (coding, source, destination,
4341                                 src_bytes, dst_bytes);
4342       break;
4343
4344     case coding_type_ccl:
4345       ccl_coding_driver (coding, source, destination,
4346                          src_bytes, dst_bytes, 1);
4347       break;
4348
4349     default:
4350       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4351     }
4352
4353   if (coding->mode & CODING_MODE_LAST_BLOCK
4354       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4355     {
4356       unsigned char *src = source + coding->consumed;
4357       unsigned char *src_end = src + src_bytes;
4358       unsigned char *dst = destination + coding->produced;
4359
4360       if (coding->type == coding_type_iso2022)
4361         ENCODE_RESET_PLANE_AND_REGISTER;
4362       if (COMPOSING_P (coding))
4363         *dst++ = ISO_CODE_ESC, *dst++ = '1';
4364       if (coding->consumed < src_bytes)
4365         {
4366           int len = src_bytes - coding->consumed;
4367
4368           BCOPY_SHORT (source + coding->consumed, dst, len);
4369           if (coding->src_multibyte)
4370             len = str_as_unibyte (dst, len);
4371           dst += len;
4372           coding->consumed = src_bytes;
4373         }
4374       coding->produced = coding->produced_char = dst - destination;
4375       coding->result = CODING_FINISH_NORMAL;
4376     }
4377
4378   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4379       && coding->consumed == src_bytes)
4380     coding->result = CODING_FINISH_NORMAL;
4381
4382   return coding->result;
4383 }
4384
4385 /* Scan text in the region between *BEG and *END (byte positions),
4386    skip characters which we don't have to decode by coding system
4387    CODING at the head and tail, then set *BEG and *END to the region
4388    of the text we actually have to convert.  The caller should move
4389    the gap out of the region in advance if the region is from a
4390    buffer.
4391
4392    If STR is not NULL, *BEG and *END are indices into STR.  */
4393
4394 static void
4395 shrink_decoding_region (beg, end, coding, str)
4396      int *beg, *end;
4397      struct coding_system *coding;
4398      unsigned char *str;
4399 {
4400   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
4401   int eol_conversion;
4402   Lisp_Object translation_table;
4403
4404   if (coding->type == coding_type_ccl
4405       || coding->type == coding_type_undecided
4406       || coding->eol_type != CODING_EOL_LF
4407       || !NILP (coding->post_read_conversion)
4408       || coding->composing != COMPOSITION_DISABLED)
4409     {
4410       /* We can't skip any data.  */
4411       return;
4412     }
4413   if (coding->type == coding_type_no_conversion
4414       || coding->type == coding_type_raw_text
4415       || coding->type == coding_type_emacs_mule)
4416     {
4417       /* We need no conversion, but don't have to skip any data here.
4418          Decoding routine handles them effectively anyway.  */
4419       return;
4420     }
4421
4422   translation_table = coding->translation_table_for_decode;
4423   if (NILP (translation_table) && !NILP (Venable_character_translation))
4424     translation_table = Vstandard_translation_table_for_decode;
4425   if (CHAR_TABLE_P (translation_table))
4426     {
4427       int i;
4428       for (i = 0; i < 128; i++)
4429         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4430           break;
4431       if (i < 128)
4432         /* Some ASCII character should be translated.  We give up
4433            shrinking.  */
4434         return;
4435     }
4436
4437   if (coding->heading_ascii >= 0)
4438     /* Detection routine has already found how much we can skip at the
4439        head.  */
4440     *beg += coding->heading_ascii;
4441
4442   if (str)
4443     {
4444       begp_orig = begp = str + *beg;
4445       endp_orig = endp = str + *end;
4446     }
4447   else
4448     {
4449       begp_orig = begp = BYTE_POS_ADDR (*beg);
4450       endp_orig = endp = begp + *end - *beg;
4451     }
4452
4453   eol_conversion = (coding->eol_type == CODING_EOL_CR
4454                     || coding->eol_type == CODING_EOL_CRLF);
4455
4456   switch (coding->type)
4457     {
4458     case coding_type_sjis:
4459     case coding_type_big5:
4460       /* We can skip all ASCII characters at the head.  */
4461       if (coding->heading_ascii < 0)
4462         {
4463           if (eol_conversion)
4464             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
4465           else
4466             while (begp < endp && *begp < 0x80) begp++;
4467         }
4468       /* We can skip all ASCII characters at the tail except for the
4469          second byte of SJIS or BIG5 code.  */
4470       if (eol_conversion)
4471         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
4472       else
4473         while (begp < endp && endp[-1] < 0x80) endp--;
4474       /* Do not consider LF as ascii if preceded by CR, since that
4475          confuses eol decoding. */
4476       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4477         endp++;
4478       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
4479         endp++;
4480       break;
4481
4482     case coding_type_iso2022:
4483       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4484         /* We can't skip any data.  */
4485         break;
4486       if (coding->heading_ascii < 0)
4487         {
4488           /* We can skip all ASCII characters at the head except for a
4489              few control codes.  */
4490           while (begp < endp && (c = *begp) < 0x80
4491                  && c != ISO_CODE_CR && c != ISO_CODE_SO
4492                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
4493                  && (!eol_conversion || c != ISO_CODE_LF))
4494             begp++;
4495         }
4496       switch (coding->category_idx)
4497         {
4498         case CODING_CATEGORY_IDX_ISO_8_1:
4499         case CODING_CATEGORY_IDX_ISO_8_2:
4500           /* We can skip all ASCII characters at the tail.  */
4501           if (eol_conversion)
4502             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
4503           else
4504             while (begp < endp && endp[-1] < 0x80) endp--;
4505           /* Do not consider LF as ascii if preceded by CR, since that
4506              confuses eol decoding. */
4507           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
4508             endp++;
4509           break;
4510
4511         case CODING_CATEGORY_IDX_ISO_7:
4512         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
4513           {
4514             /* We can skip all characters at the tail except for 8-bit
4515                codes and ESC and the following 2-byte at the tail.  */
4516             unsigned char *eight_bit = NULL;
4517
4518             if (eol_conversion)
4519               while (begp < endp
4520                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
4521                 {
4522                   if (!eight_bit && c & 0x80) eight_bit = endp;
4523                   endp--;
4524                 }
4525             else
4526               while (begp < endp
4527                      && (c = endp[-1]) != ISO_CODE_ESC)
4528                 {
4529                   if (!eight_bit && c & 0x80) eight_bit = endp;
4530                   endp--;
4531                 }
4532             /* Do not consider LF as ascii if preceded by CR, since that
4533                confuses eol decoding. */
4534             if (begp < endp && endp < endp_orig
4535                 && endp[-1] == '\r' && endp[0] == '\n')
4536               endp++;
4537             if (begp < endp && endp[-1] == ISO_CODE_ESC)
4538               {
4539                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
4540                   /* This is an ASCII designation sequence.  We can
4541                      surely skip the tail.  But, if we have
4542                      encountered an 8-bit code, skip only the codes
4543                      after that.  */
4544                   endp = eight_bit ? eight_bit : endp + 2;
4545                 else
4546                   /* Hmmm, we can't skip the tail.  */
4547                   endp = endp_orig;
4548               }
4549             else if (eight_bit)
4550               endp = eight_bit;
4551           }
4552         }
4553       break;
4554
4555     default:
4556       abort ();
4557     }
4558   *beg += begp - begp_orig;
4559   *end += endp - endp_orig;
4560   return;
4561 }
4562
4563 /* Like shrink_decoding_region but for encoding.  */
4564
4565 static void
4566 shrink_encoding_region (beg, end, coding, str)
4567      int *beg, *end;
4568      struct coding_system *coding;
4569      unsigned char *str;
4570 {
4571   unsigned char *begp_orig, *begp, *endp_orig, *endp;
4572   int eol_conversion;
4573   Lisp_Object translation_table;
4574
4575   if (coding->type == coding_type_ccl
4576       || coding->eol_type == CODING_EOL_CRLF
4577       || coding->eol_type == CODING_EOL_CR
4578       || coding->cmp_data && coding->cmp_data->used > 0)
4579     {
4580       /* We can't skip any data.  */
4581       return;
4582     }
4583   if (coding->type == coding_type_no_conversion
4584       || coding->type == coding_type_raw_text
4585       || coding->type == coding_type_emacs_mule
4586       || coding->type == coding_type_undecided)
4587     {
4588       /* We need no conversion, but don't have to skip any data here.
4589          Encoding routine handles them effectively anyway.  */
4590       return;
4591     }
4592
4593   translation_table = coding->translation_table_for_encode;
4594   if (NILP (translation_table) && !NILP (Venable_character_translation))
4595     translation_table = Vstandard_translation_table_for_encode;
4596   if (CHAR_TABLE_P (translation_table))
4597     {
4598       int i;
4599       for (i = 0; i < 128; i++)
4600         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
4601           break;
4602       if (i < 128)
4603         /* Some ASCII character should be tranlsated.  We give up
4604            shrinking.  */
4605         return;
4606     }
4607
4608   if (str)
4609     {
4610       begp_orig = begp = str + *beg;
4611       endp_orig = endp = str + *end;
4612     }
4613   else
4614     {
4615       begp_orig = begp = BYTE_POS_ADDR (*beg);
4616       endp_orig = endp = begp + *end - *beg;
4617     }
4618
4619   eol_conversion = (coding->eol_type == CODING_EOL_CR
4620                     || coding->eol_type == CODING_EOL_CRLF);
4621
4622   /* Here, we don't have to check coding->pre_write_conversion because
4623      the caller is expected to have handled it already.  */
4624   switch (coding->type)
4625     {
4626     case coding_type_iso2022:
4627       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
4628         /* We can't skip any data.  */
4629         break;
4630       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
4631         {
4632           unsigned char *bol = begp;
4633           while (begp < endp && *begp < 0x80)
4634             {
4635               begp++;
4636               if (begp[-1] == '\n')
4637                 bol = begp;
4638             }
4639           begp = bol;
4640           goto label_skip_tail;
4641         }
4642       /* fall down ... */
4643
4644     case coding_type_sjis:
4645     case coding_type_big5:
4646       /* We can skip all ASCII characters at the head and tail.  */
4647       if (eol_conversion)
4648         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
4649       else
4650         while (begp < endp && *begp < 0x80) begp++;
4651     label_skip_tail:
4652       if (eol_conversion)
4653         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
4654       else
4655         while (begp < endp && *(endp - 1) < 0x80) endp--;
4656       break;
4657
4658     default:
4659       abort ();
4660     }
4661
4662   *beg += begp - begp_orig;
4663   *end += endp - endp_orig;
4664   return;
4665 }
4666
/* Shrinking the conversion region is not free, so it is attempted
   only when the region is longer than this many bytes.  */
static int shrink_conversion_region_threshhold = 1024;

/* If the region *BEG .. *END is longer than
   shrink_conversion_region_threshhold, narrow it with
   shrink_encoding_region (when ENCODEP is nonzero) or
   shrink_decoding_region (when ENCODEP is zero).  BEG and END are
   pointers to int; they may be evaluated more than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
      {                                                                 \
        if (! (encodep))                                                \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
4680
4681 static Lisp_Object
4682 code_convert_region_unwind (dummy)
4683      Lisp_Object dummy;
4684 {
4685   inhibit_pre_post_conversion = 0;
4686   return Qnil;
4687 }
4688
4689 /* Store information about all compositions in the range FROM and TO
4690    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
4691    buffer or a string, defaults to the current buffer.  */
4692
4693 void
4694 coding_save_composition (coding, from, to, obj)
4695      struct coding_system *coding;
4696      int from, to;
4697      Lisp_Object obj;
4698 {
4699   Lisp_Object prop;
4700   int start, end;
4701
4702   if (coding->composing == COMPOSITION_DISABLED)
4703     return;
4704   if (!coding->cmp_data)
4705     coding_allocate_composition_data (coding, from);
4706   if (!find_composition (from, to, &start, &end, &prop, obj)
4707       || end > to)
4708     return;
4709   if (start < from
4710       && (!find_composition (end, to, &start, &end, &prop, obj)
4711           || end > to))
4712     return;
4713   coding->composing = COMPOSITION_NO;
4714   do
4715     {
4716       if (COMPOSITION_VALID_P (start, end, prop))
4717         {
4718           enum composition_method method = COMPOSITION_METHOD (prop);
4719           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
4720               >= COMPOSITION_DATA_SIZE)
4721             coding_allocate_composition_data (coding, from);
4722           /* For relative composition, we remember start and end
4723              positions, for the other compositions, we also remember
4724              components.  */
4725           CODING_ADD_COMPOSITION_START (coding, start - from, method);
4726           if (method != COMPOSITION_RELATIVE)
4727             {
4728               /* We must store a*/
4729               Lisp_Object val, ch;
4730
4731               val = COMPOSITION_COMPONENTS (prop);
4732               if (CONSP (val))
4733                 while (CONSP (val))
4734                   {
4735                     ch = XCAR (val), val = XCDR (val);
4736                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4737                   }
4738               else if (VECTORP (val) || STRINGP (val))
4739                 {
4740                   int len = (VECTORP (val)
4741                              ? XVECTOR (val)->size : XSTRING (val)->size);
4742                   int i;
4743                   for (i = 0; i < len; i++)
4744                     {
4745                       ch = (STRINGP (val)
4746                             ? Faref (val, make_number (i))
4747                             : XVECTOR (val)->contents[i]);
4748                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
4749                     }
4750                 }
4751               else              /* INTEGERP (val) */
4752                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
4753             }
4754           CODING_ADD_COMPOSITION_END (coding, end - from);
4755         }
4756       start = end;
4757     }
4758   while (start < to
4759          && find_composition (start, to, &start, &end, &prop, obj)
4760          && end <= to);
4761
4762   /* Make coding->cmp_data point to the first memory block.  */
4763   while (coding->cmp_data->prev)
4764     coding->cmp_data = coding->cmp_data->prev;
4765   coding->cmp_data_start = 0;
4766 }
4767
4768 /* Reflect the saved information about compositions to OBJ.
4769    CODING->cmp_data points to a memory block for the informaiton.  OBJ
4770    is a buffer or a string, defaults to the current buffer.  */
4771
4772 void
4773 coding_restore_composition (coding, obj)
4774      struct coding_system *coding;
4775      Lisp_Object obj;
4776 {
4777   struct composition_data *cmp_data = coding->cmp_data;
4778
4779   if (!cmp_data)
4780     return;
4781
4782   while (cmp_data->prev)
4783     cmp_data = cmp_data->prev;
4784
4785   while (cmp_data)
4786     {
4787       int i;
4788
4789       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
4790            i += cmp_data->data[i])
4791         {
4792           int *data = cmp_data->data + i;
4793           enum composition_method method = (enum composition_method) data[3];
4794           Lisp_Object components;
4795
4796           if (method == COMPOSITION_RELATIVE)
4797             components = Qnil;
4798           else
4799             {
4800               int len = data[0] - 4, j;
4801               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
4802
4803               for (j = 0; j < len; j++)
4804                 args[j] = make_number (data[4 + j]);
4805               components = (method == COMPOSITION_WITH_ALTCHARS
4806                             ? Fstring (len, args) : Fvector (len, args));
4807             }
4808           compose_text (data[1], data[2], components, Qnil, obj);
4809         }
4810       cmp_data = cmp_data->next;
4811     }
4812 }
4813
4814 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
4815    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
4816    coding system CODING, and return the status code of code conversion
4817    (currently, this value has no meaning).
4818
4819    How many characters (and bytes) are converted to how many
4820    characters (and bytes) are recorded in members of the structure
4821    CODING.
4822
4823    If REPLACE is nonzero, we do various things as if the original text
4824    is deleted and a new text is inserted.  See the comments in
4825    replace_range (insdel.c) to know what we are doing.
4826
4827    If REPLACE is zero, it is assumed that the source text is unibyte.
4828    Otherwize, it is assumed that the source text is multibyte.  */
4829
4830 int
4831 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
4832      int from, from_byte, to, to_byte, encodep, replace;
4833      struct coding_system *coding;
4834 {
4835   int len = to - from, len_byte = to_byte - from_byte;
4836   int require, inserted, inserted_byte;
4837   int head_skip, tail_skip, total_skip = 0;
4838   Lisp_Object saved_coding_symbol;
4839   int first = 1;
4840   unsigned char *src, *dst;
4841   Lisp_Object deletion;
4842   int orig_point = PT, orig_len = len;
4843   int prev_Z;
4844   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
4845
4846   deletion = Qnil;
4847   saved_coding_symbol = Qnil;
4848
4849   if (from < PT && PT < to)
4850     {
4851       TEMP_SET_PT_BOTH (from, from_byte);
4852       orig_point = from;
4853     }
4854
4855   if (replace)
4856     {
4857       int saved_from = from;
4858       int saved_inhibit_modification_hooks;
4859
4860       prepare_to_modify_buffer (from, to, &from);
4861       if (saved_from != from)
4862         {
4863           to = from + len;
4864           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
4865           len_byte = to_byte - from_byte;
4866         }
4867
4868       /* The code conversion routine can not preserve text properties
4869          for now.  So, we must remove all text properties in the
4870          region.  Here, we must suppress all modification hooks.  */
4871       saved_inhibit_modification_hooks = inhibit_modification_hooks;
4872       inhibit_modification_hooks = 1;
4873       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
4874       inhibit_modification_hooks = saved_inhibit_modification_hooks;
4875     }
4876
4877   if (! encodep && CODING_REQUIRE_DETECTION (coding))
4878     {
4879       /* We must detect encoding of text and eol format.  */
4880
4881       if (from < GPT && to > GPT)
4882         move_gap_both (from, from_byte);
4883       if (coding->type == coding_type_undecided)
4884         {
4885           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
4886           if (coding->type == coding_type_undecided)
4887             {
4888               /* It seems that the text contains only ASCII, but we
4889                  should not leave it undecided because the deeper
4890                  decoding routine (decode_coding) tries to detect the
4891                  encodings again in vain.  */
4892               coding->type = coding_type_emacs_mule;
4893               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
4894             }
4895         }
4896       if (coding->eol_type == CODING_EOL_UNDECIDED
4897           && coding->type != coding_type_ccl)
4898         {
4899           saved_coding_symbol = coding->symbol;
4900           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
4901           if (coding->eol_type == CODING_EOL_UNDECIDED)
4902             coding->eol_type = CODING_EOL_LF;
4903           /* We had better recover the original eol format if we
4904              encounter an inconsitent eol format while decoding.  */
4905           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4906         }
4907     }
4908
4909   /* Now we convert the text.  */
4910
4911   /* For encoding, we must process pre-write-conversion in advance.  */
4912   if (! inhibit_pre_post_conversion
4913       && encodep
4914       && SYMBOLP (coding->pre_write_conversion)
4915       && ! NILP (Ffboundp (coding->pre_write_conversion)))
4916     {
4917       /* The function in pre-write-conversion may put a new text in a
4918          new buffer.  */
4919       struct buffer *prev = current_buffer;
4920       Lisp_Object new;
4921       int count = specpdl_ptr - specpdl;
4922
4923       record_unwind_protect (code_convert_region_unwind, Qnil);
4924       /* We should not call any more pre-write/post-read-conversion
4925          functions while this pre-write-conversion is running.  */
4926       inhibit_pre_post_conversion = 1;
4927       call2 (coding->pre_write_conversion,
4928              make_number (from), make_number (to));
4929       inhibit_pre_post_conversion = 0;
4930       /* Discard the unwind protect.  */
4931       specpdl_ptr--;
4932
4933       if (current_buffer != prev)
4934         {
4935           len = ZV - BEGV;
4936           new = Fcurrent_buffer ();
4937           set_buffer_internal_1 (prev);
4938           del_range_2 (from, from_byte, to, to_byte, 0);
4939           TEMP_SET_PT_BOTH (from, from_byte);
4940           insert_from_buffer (XBUFFER (new), 1, len, 0);
4941           Fkill_buffer (new);
4942           if (orig_point >= to)
4943             orig_point += len - orig_len;
4944           else if (orig_point > from)
4945             orig_point = from;
4946           orig_len = len;
4947           to = from + len;
4948           from_byte = CHAR_TO_BYTE (from);
4949           to_byte = CHAR_TO_BYTE (to);
4950           len_byte = to_byte - from_byte;
4951           TEMP_SET_PT_BOTH (from, from_byte);
4952         }
4953     }
4954
4955   if (replace)
4956     deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
4957
4958   if (coding->composing != COMPOSITION_DISABLED)
4959     {
4960       if (encodep)
4961         coding_save_composition (coding, from, to, Fcurrent_buffer ());
4962       else
4963         coding_allocate_composition_data (coding, from);
4964     }
4965
4966   /* Try to skip the heading and tailing ASCIIs.  */
4967   if (coding->type != coding_type_ccl)
4968     {
4969       int from_byte_orig = from_byte, to_byte_orig = to_byte;
4970
4971       if (from < GPT && GPT < to)
4972         move_gap_both (from, from_byte);
4973       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
4974       if (from_byte == to_byte
4975           && (encodep || NILP (coding->post_read_conversion))
4976           && ! CODING_REQUIRE_FLUSHING (coding))
4977         {
4978           coding->produced = len_byte;
4979           coding->produced_char = len;
4980           if (!replace)
4981             /* We must record and adjust for this new text now.  */
4982             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
4983           return 0;
4984         }
4985
4986       head_skip = from_byte - from_byte_orig;
4987       tail_skip = to_byte_orig - to_byte;
4988       total_skip = head_skip + tail_skip;
4989       from += head_skip;
4990       to -= tail_skip;
4991       len -= total_skip; len_byte -= total_skip;
4992     }
4993
4994   /* For converion, we must put the gap before the text in addition to
4995      making the gap larger for efficient decoding.  The required gap
4996      size starts from 2000 which is the magic number used in make_gap.
4997      But, after one batch of conversion, it will be incremented if we
4998      find that it is not enough .  */
4999   require = 2000;
5000
5001   if (GAP_SIZE  < require)
5002     make_gap (require - GAP_SIZE);
5003   move_gap_both (from, from_byte);
5004
5005   inserted = inserted_byte = 0;
5006
5007   GAP_SIZE += len_byte;
5008   ZV -= len;
5009   Z -= len;
5010   ZV_BYTE -= len_byte;
5011   Z_BYTE -= len_byte;
5012
5013   if (GPT - BEG < BEG_UNCHANGED)
5014     BEG_UNCHANGED = GPT - BEG;
5015   if (Z - GPT < END_UNCHANGED)
5016     END_UNCHANGED = Z - GPT;
5017
5018   if (!encodep && coding->src_multibyte)
5019     {
5020       /* Decoding routines expects that the source text is unibyte.
5021          We must convert 8-bit characters of multibyte form to
5022          unibyte.  */
5023       int len_byte_orig = len_byte;
5024       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5025       if (len_byte < len_byte_orig)
5026         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5027                     len_byte);
5028       coding->src_multibyte = 0;
5029     }
5030
5031   for (;;)
5032     {
5033       int result;
5034
5035       /* The buffer memory is now:
5036          +--------+converted-text+---------+-------original-text-------+---+
5037          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5038                   |<---------------------- GAP ----------------------->|  */
5039       src = GAP_END_ADDR - len_byte;
5040       dst = GPT_ADDR + inserted_byte;
5041
5042       if (encodep)
5043         result = encode_coding (coding, src, dst, len_byte, 0);
5044       else
5045         result = decode_coding (coding, src, dst, len_byte, 0);
5046
5047       /* The buffer memory is now:
5048          +--------+-------converted-text----+--+------original-text----+---+
5049          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5050                   |<---------------------- GAP ----------------------->|  */
5051
5052       inserted += coding->produced_char;
5053       inserted_byte += coding->produced;
5054       len_byte -= coding->consumed;
5055
5056       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5057         {
5058           coding_allocate_composition_data (coding, from + inserted);
5059           continue;
5060         }
5061
5062       src += coding->consumed;
5063       dst += coding->produced;
5064
5065       if (result == CODING_FINISH_NORMAL)
5066         {
5067           src += len_byte;
5068           break;
5069         }
5070       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5071         {
5072           unsigned char *pend = dst, *p = pend - inserted_byte;
5073           Lisp_Object eol_type;
5074
5075           /* Encode LFs back to the original eol format (CR or CRLF).  */
5076           if (coding->eol_type == CODING_EOL_CR)
5077             {
5078               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5079             }
5080           else
5081             {
5082               int count = 0;
5083
5084               while (p < pend) if (*p++ == '\n') count++;
5085               if (src - dst < count)
5086                 {
5087                   /* We don't have sufficient room for encoding LFs
5088                      back to CRLF.  We must record converted and
5089                      not-yet-converted text back to the buffer
5090                      content, enlarge the gap, then record them out of
5091                      the buffer contents again.  */
5092                   int add = len_byte + inserted_byte;
5093
5094                   GAP_SIZE -= add;
5095                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5096                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5097                   make_gap (count - GAP_SIZE);
5098                   GAP_SIZE += add;
5099                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5100                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5101                   /* Don't forget to update SRC, DST, and PEND.  */
5102                   src = GAP_END_ADDR - len_byte;
5103                   dst = GPT_ADDR + inserted_byte;
5104                   pend = dst;
5105                 }
5106               inserted += count;
5107               inserted_byte += count;
5108               coding->produced += count;
5109               p = dst = pend + count;
5110               while (count)
5111                 {
5112                   *--p = *--pend;
5113                   if (*p == '\n') count--, *--p = '\r';
5114                 }
5115             }
5116
5117           /* Suppress eol-format conversion in the further conversion.  */
5118           coding->eol_type = CODING_EOL_LF;
5119
5120           /* Set the coding system symbol to that for Unix-like EOL.  */
5121           eol_type = Fget (saved_coding_symbol, Qeol_type);
5122           if (VECTORP (eol_type)
5123               && XVECTOR (eol_type)->size == 3
5124               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5125             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5126           else
5127             coding->symbol = saved_coding_symbol;
5128
5129           continue;
5130         }
5131       if (len_byte <= 0)
5132         {
5133           if (coding->type != coding_type_ccl
5134               || coding->mode & CODING_MODE_LAST_BLOCK)
5135             break;
5136           coding->mode |= CODING_MODE_LAST_BLOCK;
5137           continue;
5138         }
5139       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5140         {
5141           /* The source text ends in invalid codes.  Let's just
5142              make them valid buffer contents, and finish conversion.  */
5143           inserted += len_byte;
5144           inserted_byte += len_byte;
5145           while (len_byte--)
5146             *dst++ = *src++;
5147           break;
5148         }
5149       if (result == CODING_FINISH_INTERRUPT)
5150         {
5151           /* The conversion procedure was interrupted by a user.  */
5152           break;
5153         }
5154       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5155       if (coding->consumed < 1)
5156         {
5157           /* It's quite strange to require more memory without
5158              consuming any bytes.  Perhaps CCL program bug.  */
5159           break;
5160         }
5161       if (first)
5162         {
5163           /* We have just done the first batch of conversion which was
5164              stoped because of insufficient gap.  Let's reconsider the
5165              required gap size (i.e. SRT - DST) now.
5166
5167              We have converted ORIG bytes (== coding->consumed) into
5168              NEW bytes (coding->produced).  To convert the remaining
5169              LEN bytes, we may need REQUIRE bytes of gap, where:
5170                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5171                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5172              Here, we are sure that NEW >= ORIG.  */
5173           float ratio = coding->produced - coding->consumed;
5174           ratio /= coding->consumed;
5175           require = len_byte * ratio;
5176           first = 0;
5177         }
5178       if ((src - dst) < (require + 2000))
5179         {
5180           /* See the comment above the previous call of make_gap.  */
5181           int add = len_byte + inserted_byte;
5182
5183           GAP_SIZE -= add;
5184           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5185           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5186           make_gap (require + 2000);
5187           GAP_SIZE += add;
5188           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5189           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5190         }
5191     }
5192   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5193
5194   if (encodep && coding->dst_multibyte)
5195     {
5196       /* The output is unibyte.  We must convert 8-bit characters to
5197          multibyte form.  */
5198       if (inserted_byte * 2 > GAP_SIZE)
5199         {
5200           GAP_SIZE -= inserted_byte;
5201           ZV += inserted_byte; Z += inserted_byte;
5202           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5203           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5204           make_gap (inserted_byte - GAP_SIZE);
5205           GAP_SIZE += inserted_byte;
5206           ZV -= inserted_byte; Z -= inserted_byte;
5207           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5208           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5209         }
5210       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5211     }
5212
5213   /* If we have shrinked the conversion area, adjust it now.  */
5214   if (total_skip > 0)
5215     {
5216       if (tail_skip > 0)
5217         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5218       inserted += total_skip; inserted_byte += total_skip;
5219       GAP_SIZE += total_skip;
5220       GPT -= head_skip; GPT_BYTE -= head_skip;
5221       ZV -= total_skip; ZV_BYTE -= total_skip;
5222       Z -= total_skip; Z_BYTE -= total_skip;
5223       from -= head_skip; from_byte -= head_skip;
5224       to += tail_skip; to_byte += tail_skip;
5225     }
5226
5227   prev_Z = Z;
5228   adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5229   inserted = Z - prev_Z;
5230
5231   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5232     coding_restore_composition (coding, Fcurrent_buffer ());
5233   coding_free_composition_data (coding);
5234
5235   if (! inhibit_pre_post_conversion
5236       && ! encodep && ! NILP (coding->post_read_conversion))
5237     {
5238       Lisp_Object val;
5239       int count = specpdl_ptr - specpdl;
5240
5241       if (from != PT)
5242         TEMP_SET_PT_BOTH (from, from_byte);
5243       prev_Z = Z;
5244       record_unwind_protect (code_convert_region_unwind, Qnil);
5245       /* We should not call any more pre-write/post-read-conversion
5246          functions while this post-read-conversion is running.  */
5247       inhibit_pre_post_conversion = 1;
5248       val = call1 (coding->post_read_conversion, make_number (inserted));
5249       inhibit_pre_post_conversion = 0;
5250       /* Discard the unwind protect.  */
5251       specpdl_ptr--;
5252       CHECK_NUMBER (val, 0);
5253       inserted += Z - prev_Z;
5254     }
5255
5256   if (orig_point >= from)
5257     {
5258       if (orig_point >= from + orig_len)
5259         orig_point += inserted - orig_len;
5260       else
5261         orig_point = from;
5262       TEMP_SET_PT (orig_point);
5263     }
5264
5265   if (replace)
5266     {
5267       signal_after_change (from, to - from, inserted);
5268       update_compositions (from, from + inserted, CHECK_BORDER);
5269     }
5270
5271   {
5272     coding->consumed = to_byte - from_byte;
5273     coding->consumed_char = to - from;
5274     coding->produced = inserted_byte;
5275     coding->produced_char = inserted;
5276   }
5277
5278   return 0;
5279 }
5280
5281 Lisp_Object
5282 run_pre_post_conversion_on_str (str, coding, encodep)
5283      Lisp_Object str;
5284      struct coding_system *coding;
5285      int encodep;
5286 {
5287   int count = specpdl_ptr - specpdl;
5288   struct gcpro gcpro1;
5289   struct buffer *prev = current_buffer;
5290   int multibyte = STRING_MULTIBYTE (str);
5291
5292   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5293   record_unwind_protect (code_convert_region_unwind, Qnil);
5294   GCPRO1 (str);
5295   temp_output_buffer_setup (" *code-converting-work*");
5296   set_buffer_internal (XBUFFER (Vstandard_output));
5297   /* We must insert the contents of STR as is without
5298      unibyte<->multibyte conversion.  For that, we adjust the
5299      multibyteness of the working buffer to that of STR.  */
5300   Ferase_buffer ();
5301   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
5302   insert_from_string (str, 0, 0,
5303                       XSTRING (str)->size, STRING_BYTES (XSTRING (str)), 0);
5304   UNGCPRO;
5305   inhibit_pre_post_conversion = 1;
5306   if (encodep)
5307     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
5308   else
5309     {
5310       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
5311       call1 (coding->post_read_conversion, make_number (Z - BEG));
5312     }
5313   inhibit_pre_post_conversion = 0;
5314   str = make_buffer_string (BEG, Z, 1);
5315   return unbind_to (count, str);
5316 }
5317
5318 Lisp_Object
5319 decode_coding_string (str, coding, nocopy)
5320      Lisp_Object str;
5321      struct coding_system *coding;
5322      int nocopy;
5323 {
5324   int len;
5325   struct conversion_buffer buf;
5326   int from, to, to_byte;
5327   struct gcpro gcpro1;
5328   Lisp_Object saved_coding_symbol;
5329   int result;
5330   int require_decoding;
5331   int shrinked_bytes = 0;
5332   Lisp_Object newstr;
5333   int consumed, consumed_char, produced, produced_char;
5334
5335   from = 0;
5336   to = XSTRING (str)->size;
5337   to_byte = STRING_BYTES (XSTRING (str));
5338
5339   saved_coding_symbol = Qnil;
5340   coding->src_multibyte = STRING_MULTIBYTE (str);
5341   coding->dst_multibyte = 1;
5342   if (CODING_REQUIRE_DETECTION (coding))
5343     {
5344       /* See the comments in code_convert_region.  */
5345       if (coding->type == coding_type_undecided)
5346         {
5347           detect_coding (coding, XSTRING (str)->data, to_byte);
5348           if (coding->type == coding_type_undecided)
5349             coding->type = coding_type_emacs_mule;
5350         }
5351       if (coding->eol_type == CODING_EOL_UNDECIDED
5352           && coding->type != coding_type_ccl)
5353         {
5354           saved_coding_symbol = coding->symbol;
5355           detect_eol (coding, XSTRING (str)->data, to_byte);
5356           if (coding->eol_type == CODING_EOL_UNDECIDED)
5357             coding->eol_type = CODING_EOL_LF;
5358           /* We had better recover the original eol format if we
5359              encounter an inconsitent eol format while decoding.  */
5360           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5361         }
5362     }
5363
5364   if (coding->type == coding_type_no_conversion
5365       || coding->type == coding_type_raw_text)
5366     coding->dst_multibyte = 0;
5367
5368   require_decoding = CODING_REQUIRE_DECODING (coding);
5369
5370   if (STRING_MULTIBYTE (str))
5371     {
5372       /* Decoding routines expect the source text to be unibyte.  */
5373       str = Fstring_as_unibyte (str);
5374       to_byte = STRING_BYTES (XSTRING (str));
5375       nocopy = 1;
5376       coding->src_multibyte = 0;
5377     }
5378
5379   /* Try to skip the heading and tailing ASCIIs.  */
5380   if (require_decoding && coding->type != coding_type_ccl)
5381     {
5382       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5383                                 0);
5384       if (from == to_byte)
5385         require_decoding = 0;
5386       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5387     }
5388
5389   if (!require_decoding)
5390     {
5391       coding->consumed = STRING_BYTES (XSTRING (str));
5392       coding->consumed_char = XSTRING (str)->size;
5393       if (coding->dst_multibyte)
5394         {
5395           str = Fstring_as_multibyte (str);
5396           nocopy = 1;
5397         }
5398       coding->produced = STRING_BYTES (XSTRING (str));
5399       coding->produced_char = XSTRING (str)->size;
5400       return (nocopy ? str : Fcopy_sequence (str));
5401     }
5402
5403   if (coding->composing != COMPOSITION_DISABLED)
5404     coding_allocate_composition_data (coding, from);
5405   len = decoding_buffer_size (coding, to_byte - from);
5406   allocate_conversion_buffer (buf, len);
5407
5408   consumed = consumed_char = produced = produced_char = 0;
5409   while (1)
5410     {
5411       result = decode_coding (coding, XSTRING (str)->data + from + consumed,
5412                               buf.data + produced, to_byte - from - consumed,
5413                               buf.size - produced);
5414       consumed += coding->consumed;
5415       consumed_char += coding->consumed_char;
5416       produced += coding->produced;
5417       produced_char += coding->produced_char;
5418       if (result == CODING_FINISH_NORMAL
5419           || (result == CODING_FINISH_INSUFFICIENT_SRC
5420               && coding->consumed == 0))
5421         break;
5422       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5423         coding_allocate_composition_data (coding, from + produced_char);
5424       else if (result == CODING_FINISH_INSUFFICIENT_DST)
5425         extend_conversion_buffer (&buf);
5426       else if (result == CODING_FINISH_INCONSISTENT_EOL)
5427         {
5428           /* Recover the original EOL format.  */
5429           if (coding->eol_type == CODING_EOL_CR)
5430             {
5431               unsigned char *p;
5432               for (p = buf.data; p < buf.data + produced; p++)
5433                 if (*p == '\n') *p = '\r';
5434             }
5435           else if (coding->eol_type == CODING_EOL_CRLF)
5436             {
5437               int num_eol = 0;
5438               unsigned char *p0, *p1;
5439               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
5440                 if (*p0 == '\n') num_eol++;
5441               if (produced + num_eol >= buf.size)
5442                 extend_conversion_buffer (&buf);
5443               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
5444                 {
5445                   *--p1 = *--p0;
5446                   if (*p0 == '\n') *--p1 = '\r';
5447                 }
5448               produced += num_eol;
5449               produced_char += num_eol;
5450             }
5451           coding->eol_type = CODING_EOL_LF;
5452           coding->symbol = saved_coding_symbol;
5453         }
5454     }
5455
5456   coding->consumed = consumed;
5457   coding->consumed_char = consumed_char;
5458   coding->produced = produced;
5459   coding->produced_char = produced_char;
5460
5461   if (coding->dst_multibyte)
5462     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
5463                                            produced + shrinked_bytes);
5464   else
5465     newstr = make_uninit_string (produced + shrinked_bytes);
5466   if (from > 0)
5467     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5468   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5469   if (shrinked_bytes > from)
5470     bcopy (XSTRING (str)->data + to_byte,
5471            XSTRING (newstr)->data + from + produced,
5472            shrinked_bytes - from);
5473   free_conversion_buffer (&buf);
5474
5475   if (coding->cmp_data && coding->cmp_data->used)
5476     coding_restore_composition (coding, newstr);
5477   coding_free_composition_data (coding);
5478
5479   if (SYMBOLP (coding->post_read_conversion)
5480       && !NILP (Ffboundp (coding->post_read_conversion)))
5481     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
5482
5483   return newstr;
5484 }
5485
5486 Lisp_Object
5487 encode_coding_string (str, coding, nocopy)
5488      Lisp_Object str;
5489      struct coding_system *coding;
5490      int nocopy;
5491 {
5492   int len;
5493   struct conversion_buffer buf;
5494   int from, to, to_byte;
5495   struct gcpro gcpro1;
5496   Lisp_Object saved_coding_symbol;
5497   int result;
5498   int shrinked_bytes = 0;
5499   Lisp_Object newstr;
5500   int consumed, consumed_char, produced, produced_char;
5501
5502   if (SYMBOLP (coding->pre_write_conversion)
5503       && !NILP (Ffboundp (coding->pre_write_conversion)))
5504     str = run_pre_post_conversion_on_str (str, coding, 1);
5505
5506   from = 0;
5507   to = XSTRING (str)->size;
5508   to_byte = STRING_BYTES (XSTRING (str));
5509
5510   saved_coding_symbol = Qnil;
5511
5512   /* Encoding routines determine the multibyteness of the source text
5513      by coding->src_multibyte.  */
5514   coding->src_multibyte = STRING_MULTIBYTE (str);
5515   coding->dst_multibyte = 0;
5516   if (! CODING_REQUIRE_ENCODING (coding))
5517     {
5518       coding->consumed = STRING_BYTES (XSTRING (str));
5519       coding->consumed_char = XSTRING (str)->size;
5520       if (STRING_MULTIBYTE (str))
5521         {
5522           str = Fstring_as_unibyte (str);
5523           nocopy = 1;
5524         }
5525       coding->produced = STRING_BYTES (XSTRING (str));
5526       coding->produced_char = XSTRING (str)->size;
5527       return (nocopy ? str : Fcopy_sequence (str));
5528     }
5529
5530   if (coding->composing != COMPOSITION_DISABLED)
5531     coding_save_composition (coding, from, to, str);
5532
5533   /* Try to skip the heading and tailing ASCIIs.  */
5534   if (coding->type != coding_type_ccl)
5535     {
5536       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, XSTRING (str)->data,
5537                                 1);
5538       if (from == to_byte)
5539         return (nocopy ? str : Fcopy_sequence (str));
5540       shrinked_bytes = from + (STRING_BYTES (XSTRING (str)) - to_byte);
5541     }
5542
5543   len = encoding_buffer_size (coding, to_byte - from);
5544   allocate_conversion_buffer (buf, len);
5545
5546   consumed = consumed_char = produced = produced_char = 0;
5547   while (1)
5548     {
5549       result = encode_coding (coding, XSTRING (str)->data + from + consumed,
5550                               buf.data + produced, to_byte - from - consumed,
5551                               buf.size - produced);
5552       consumed += coding->consumed;
5553       consumed_char += coding->consumed_char;
5554       produced += coding->produced;
5555       produced_char += coding->produced_char;
5556       if (result == CODING_FINISH_NORMAL
5557           || (result == CODING_FINISH_INSUFFICIENT_SRC
5558               && coding->consumed == 0))
5559         break;
5560       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
5561       extend_conversion_buffer (&buf);
5562     }
5563
5564   coding->consumed = consumed;
5565   coding->consumed_char = consumed_char;
5566   coding->produced = produced;
5567   coding->produced_char = produced_char;
5568
5569   newstr = make_uninit_string (produced + shrinked_bytes);
5570   if (from > 0)
5571     bcopy (XSTRING (str)->data, XSTRING (newstr)->data, from);
5572   bcopy (buf.data, XSTRING (newstr)->data + from, produced);
5573   if (shrinked_bytes > from)
5574     bcopy (XSTRING (str)->data + to_byte,
5575            XSTRING (newstr)->data + from + produced,
5576            shrinked_bytes - from);
5577
5578   free_conversion_buffer (&buf);
5579   coding_free_composition_data (coding);
5580
5581   return newstr;
5582 }
5583
5584 \f
5585 #ifdef emacs
5586 /*** 8. Emacs Lisp library functions ***/
5587
5588 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
5589   "Return t if OBJECT is nil or a coding-system.\n\
5590 See the documentation of `make-coding-system' for information\n\
5591 about coding-system objects.")
5592   (obj)
5593      Lisp_Object obj;
5594 {
5595   if (NILP (obj))
5596     return Qt;
5597   if (!SYMBOLP (obj))
5598     return Qnil;
5599   /* Get coding-spec vector for OBJ.  */
5600   obj = Fget (obj, Qcoding_system);
5601   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
5602           ? Qt : Qnil);
5603 }
5604
5605 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
5606        Sread_non_nil_coding_system, 1, 1, 0,
5607   "Read a coding system from the minibuffer, prompting with string PROMPT.")
5608   (prompt)
5609      Lisp_Object prompt;
5610 {
5611   Lisp_Object val;
5612   do
5613     {
5614       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5615                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
5616     }
5617   while (XSTRING (val)->size == 0);
5618   return (Fintern (val, Qnil));
5619 }
5620
5621 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
5622   "Read a coding system from the minibuffer, prompting with string PROMPT.\n\
5623 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.")
5624   (prompt, default_coding_system)
5625      Lisp_Object prompt, default_coding_system;
5626 {
5627   Lisp_Object val;
5628   if (SYMBOLP (default_coding_system))
5629     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
5630   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
5631                           Qt, Qnil, Qcoding_system_history,
5632                           default_coding_system, Qnil);
5633   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
5634 }
5635
5636 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
5637        1, 1, 0,
5638   "Check validity of CODING-SYSTEM.\n\
5639 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.\n\
5640 It is valid if it is a symbol with a non-nil `coding-system' property.\n\
5641 The value of property should be a vector of length 5.")
5642   (coding_system)
5643      Lisp_Object coding_system;
5644 {
5645   CHECK_SYMBOL (coding_system, 0);
5646   if (!NILP (Fcoding_system_p (coding_system)))
5647     return coding_system;
5648   while (1)
5649     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
5650 }
5651 \f
5652 Lisp_Object
5653 detect_coding_system (src, src_bytes, highest, multibytep)
5654      unsigned char *src;
5655      int src_bytes, highest;
5656      int multibytep;
5657 {
5658   int coding_mask, eol_type;
5659   Lisp_Object val, tmp;
5660   int dummy;
5661
5662   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
5663   eol_type  = detect_eol_type (src, src_bytes, &dummy);
5664   if (eol_type == CODING_EOL_INCONSISTENT)
5665     eol_type = CODING_EOL_UNDECIDED;
5666
5667   if (!coding_mask)
5668     {
5669       val = Qundecided;
5670       if (eol_type != CODING_EOL_UNDECIDED)
5671         {
5672           Lisp_Object val2;
5673           val2 = Fget (Qundecided, Qeol_type);
5674           if (VECTORP (val2))
5675             val = XVECTOR (val2)->contents[eol_type];
5676         }
5677       return (highest ? val : Fcons (val, Qnil));
5678     }
5679
5680   /* At first, gather possible coding systems in VAL.  */
5681   val = Qnil;
5682   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
5683     {
5684       Lisp_Object category_val, category_index;
5685
5686       category_index = Fget (XCAR (tmp), Qcoding_category_index);
5687       category_val = Fsymbol_value (XCAR (tmp));
5688       if (!NILP (category_val)
5689           && NATNUMP (category_index)
5690           && (coding_mask & (1 << XFASTINT (category_index))))
5691         {
5692           val = Fcons (category_val, val);
5693           if (highest)
5694             break;
5695         }
5696     }
5697   if (!highest)
5698     val = Fnreverse (val);
5699
5700   /* Then, replace the elements with subsidiary coding systems.  */
5701   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
5702     {
5703       if (eol_type != CODING_EOL_UNDECIDED
5704           && eol_type != CODING_EOL_INCONSISTENT)
5705         {
5706           Lisp_Object eol;
5707           eol = Fget (XCAR (tmp), Qeol_type);
5708           if (VECTORP (eol))
5709             XCAR (tmp) = XVECTOR (eol)->contents[eol_type];
5710         }
5711     }
5712   return (highest ? XCAR (val) : val);
5713 }
5714
5715 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
5716        2, 3, 0,
5717   "Detect coding system of the text in the region between START and END.\n\
5718 Return a list of possible coding systems ordered by priority.\n\
5719 \n\
5720 If only ASCII characters are found, it returns a list of single element\n\
5721 `undecided' or its subsidiary coding system according to a detected\n\
5722 end-of-line format.\n\
5723 \n\
5724 If optional argument HIGHEST is non-nil, return the coding system of\n\
5725 highest priority.")
5726   (start, end, highest)
5727      Lisp_Object start, end, highest;
5728 {
5729   int from, to;
5730   int from_byte, to_byte;
5731
5732   CHECK_NUMBER_COERCE_MARKER (start, 0);
5733   CHECK_NUMBER_COERCE_MARKER (end, 1);
5734
5735   validate_region (&start, &end);
5736   from = XINT (start), to = XINT (end);
5737   from_byte = CHAR_TO_BYTE (from);
5738   to_byte = CHAR_TO_BYTE (to);
5739
5740   if (from < GPT && to >= GPT)
5741     move_gap_both (to, to_byte);
5742
5743   return detect_coding_system (BYTE_POS_ADDR (from_byte),
5744                                to_byte - from_byte,
5745                                !NILP (highest),
5746                                !NILP (current_buffer
5747                                       ->enable_multibyte_characters));
5748 }
5749
5750 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
5751        1, 2, 0,
5752   "Detect coding system of the text in STRING.\n\
5753 Return a list of possible coding systems ordered by priority.\n\
5754 \n\
5755 If only ASCII characters are found, it returns a list of single element\n\
5756 `undecided' or its subsidiary coding system according to a detected\n\
5757 end-of-line format.\n\
5758 \n\
5759 If optional argument HIGHEST is non-nil, return the coding system of\n\
5760 highest priority.")
5761   (string, highest)
5762      Lisp_Object string, highest;
5763 {
5764   CHECK_STRING (string, 0);
5765
5766   return detect_coding_system (XSTRING (string)->data,
5767                                STRING_BYTES (XSTRING (string)),
5768                                !NILP (highest),
5769                                STRING_MULTIBYTE (string));
5770 }
5771
5772 /* Return an intersection of lists L1 and L2.  */
5773
5774 static Lisp_Object
5775 intersection (l1, l2)
5776      Lisp_Object l1, l2;
5777 {
5778   Lisp_Object val;
5779
5780   for (val = Qnil; CONSP (l1); l1 = XCDR (l1))
5781     {
5782       if (!NILP (Fmemq (XCAR (l1), l2)))
5783         val = Fcons (XCAR (l1), val);
5784     }
5785   return val;
5786 }
5787
5788
5789 /*  Subroutine for Fsafe_coding_systems_region_internal.
5790
5791     Return a list of coding systems that safely encode the multibyte
5792     text between P and PEND.  SAFE_CODINGS, if non-nil, is a list of
5793     possible coding systems.  If it is nil, it means that we have not
5794     yet found any coding systems.
5795
5796     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
5797     element of WORK_TABLE is set to t once the element is looked up.
5798
5799     If a non-ASCII single byte char is found, set
5800     *single_byte_char_found to 1.  */
5801
5802 static Lisp_Object
5803 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
5804      unsigned char *p, *pend;
5805      Lisp_Object safe_codings, work_table;
5806      int *single_byte_char_found;
5807 {
5808   int c, len, idx;
5809   Lisp_Object val;
5810
5811   while (p < pend)
5812     {
5813       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
5814       p += len;
5815       if (ASCII_BYTE_P (c))
5816         /* We can ignore ASCII characters here.  */
5817         continue;
5818       if (SINGLE_BYTE_CHAR_P (c))
5819         *single_byte_char_found = 1;
5820       if (NILP (safe_codings))
5821         continue;
5822       /* Check the safe coding systems for C.  */
5823       val = char_table_ref_and_index (work_table, c, &idx);
5824       if (EQ (val, Qt))
5825         /* This element was already checked.  Ignore it.  */
5826         continue;
5827       /* Remember that we checked this element.  */
5828       CHAR_TABLE_SET (work_table, make_number (idx), Qt);
5829
5830       /* If there are some safe coding systems for C and we have
5831          already found the other set of coding systems for the
5832          different characters, get the intersection of them.  */
5833       if (!EQ (safe_codings, Qt) && !NILP (val))
5834         val = intersection (safe_codings, val);
5835       safe_codings = val;
5836     }
5837   return safe_codings;
5838 }
5839
5840
5841 /* Return a list of coding systems that safely encode the text between
5842    START and END.  If the text contains only ASCII or is unibyte,
5843    return t.  */
5844
5845 DEFUN ("find-coding-systems-region-internal",
5846        Ffind_coding_systems_region_internal,
5847        Sfind_coding_systems_region_internal, 2, 2, 0,
5848   "Internal use only.")
5849   (start, end)
5850      Lisp_Object start, end;
5851 {
5852   Lisp_Object work_table, safe_codings;
5853   int non_ascii_p = 0;
5854   int single_byte_char_found = 0;
5855   unsigned char *p1, *p1end, *p2, *p2end, *p;
5856   Lisp_Object args[2];
5857
5858   if (STRINGP (start))
5859     {
5860       if (!STRING_MULTIBYTE (start))
5861         return Qt;
5862       p1 = XSTRING (start)->data, p1end = p1 + STRING_BYTES (XSTRING (start));
5863       p2 = p2end = p1end;
5864       if (XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
5865         non_ascii_p = 1;
5866     }
5867   else
5868     {
5869       int from, to, stop;
5870
5871       CHECK_NUMBER_COERCE_MARKER (start, 0);
5872       CHECK_NUMBER_COERCE_MARKER (end, 1);
5873       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
5874         args_out_of_range (start, end);
5875       if (NILP (current_buffer->enable_multibyte_characters))
5876         return Qt;
5877       from = CHAR_TO_BYTE (XINT (start));
5878       to = CHAR_TO_BYTE (XINT (end));
5879       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
5880       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
5881       if (stop == to)
5882         p2 = p2end = p1end;
5883       else
5884         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
5885       if (XINT (end) - XINT (start) != to - from)
5886         non_ascii_p = 1;
5887     }
5888
5889   if (!non_ascii_p)
5890     {
5891       /* We are sure that the text contains no multibyte character.
5892          Check if it contains eight-bit-graphic.  */
5893       p = p1;
5894       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
5895       if (p == p1end)
5896         {
5897           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
5898           if (p == p2end)
5899             return Qt;
5900         }
5901     }
5902
5903   /* The text contains non-ASCII characters.  */
5904   work_table = Fcopy_sequence (Vchar_coding_system_table);
5905   safe_codings = find_safe_codings (p1, p1end, Qt, work_table,
5906                                     &single_byte_char_found);
5907   if (p2 < p2end)
5908     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
5909                                       &single_byte_char_found);
5910
5911   if (!single_byte_char_found)
5912     {
5913       /* Append generic coding systems.  */
5914       Lisp_Object args[2];
5915       args[0] = safe_codings;
5916       args[1] = Fchar_table_extra_slot (Vchar_coding_system_table,
5917                                         make_number (0));
5918       safe_codings = Fappend (2, args);
5919     }
5920   else
5921     safe_codings = Fcons (Qraw_text,
5922                           Fcons (Qemacs_mule,
5923                                  Fcons (Qno_conversion, safe_codings)));
5924   return safe_codings;
5925 }
5926
5927
5928 Lisp_Object
5929 code_convert_region1 (start, end, coding_system, encodep)
5930      Lisp_Object start, end, coding_system;
5931      int encodep;
5932 {
5933   struct coding_system coding;
5934   int from, to, len;
5935
5936   CHECK_NUMBER_COERCE_MARKER (start, 0);
5937   CHECK_NUMBER_COERCE_MARKER (end, 1);
5938   CHECK_SYMBOL (coding_system, 2);
5939
5940   validate_region (&start, &end);
5941   from = XFASTINT (start);
5942   to = XFASTINT (end);
5943
5944   if (NILP (coding_system))
5945     return make_number (to - from);
5946
5947   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
5948     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
5949
5950   coding.mode |= CODING_MODE_LAST_BLOCK;
5951   coding.src_multibyte = coding.dst_multibyte
5952     = !NILP (current_buffer->enable_multibyte_characters);
5953   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
5954                        &coding, encodep, 1);
5955   Vlast_coding_system_used = coding.symbol;
5956   return make_number (coding.produced_char);
5957 }
5958
5959 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
5960        3, 3, "r\nzCoding system: ",
5961   "Decode the current region by specified coding system.\n\
5962 When called from a program, takes three arguments:\n\
5963 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5964 This function sets `last-coding-system-used' to the precise coding system\n\
5965 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5966 not fully specified.)\n\
5967 It returns the length of the decoded text.")
5968   (start, end, coding_system)
5969      Lisp_Object start, end, coding_system;
5970 {
5971   return code_convert_region1 (start, end, coding_system, 0);
5972 }
5973
5974 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
5975        3, 3, "r\nzCoding system: ",
5976   "Encode the current region by specified coding system.\n\
5977 When called from a program, takes three arguments:\n\
5978 START, END, and CODING-SYSTEM.  START and END are buffer positions.\n\
5979 This function sets `last-coding-system-used' to the precise coding system\n\
5980 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
5981 not fully specified.)\n\
5982 It returns the length of the encoded text.")
5983   (start, end, coding_system)
5984      Lisp_Object start, end, coding_system;
5985 {
5986   return code_convert_region1 (start, end, coding_system, 1);
5987 }
5988
5989 Lisp_Object
5990 code_convert_string1 (string, coding_system, nocopy, encodep)
5991      Lisp_Object string, coding_system, nocopy;
5992      int encodep;
5993 {
5994   struct coding_system coding;
5995
5996   CHECK_STRING (string, 0);
5997   CHECK_SYMBOL (coding_system, 1);
5998
5999   if (NILP (coding_system))
6000     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6001
6002   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6003     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
6004
6005   coding.mode |= CODING_MODE_LAST_BLOCK;
6006   string = (encodep
6007             ? encode_coding_string (string, &coding, !NILP (nocopy))
6008             : decode_coding_string (string, &coding, !NILP (nocopy)));
6009   Vlast_coding_system_used = coding.symbol;
6010
6011   return string;
6012 }
6013
6014 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6015        2, 3, 0,
6016   "Decode STRING which is encoded in CODING-SYSTEM, and return the result.\n\
6017 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
6018 if the decoding operation is trivial.\n\
6019 This function sets `last-coding-system-used' to the precise coding system\n\
6020 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
6021 not fully specified.)")
6022   (string, coding_system, nocopy)
6023      Lisp_Object string, coding_system, nocopy;
6024 {
6025   return code_convert_string1 (string, coding_system, nocopy, 0);
6026 }
6027
6028 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6029        2, 3, 0,
6030   "Encode STRING to CODING-SYSTEM, and return the result.\n\
6031 Optional arg NOCOPY non-nil means it is ok to return STRING itself\n\
6032 if the encoding operation is trivial.\n\
6033 This function sets `last-coding-system-used' to the precise coding system\n\
6034 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is\n\
6035 not fully specified.)")
6036   (string, coding_system, nocopy)
6037      Lisp_Object string, coding_system, nocopy;
6038 {
6039   return code_convert_string1 (string, coding_system, nocopy, 1);
6040 }
6041
6042 /* Encode or decode STRING according to CODING_SYSTEM.
6043    Do not set Vlast_coding_system_used.
6044
6045    This function is called only from macros DECODE_FILE and
6046    ENCODE_FILE, thus we ignore character composition.  */
6047
6048 Lisp_Object
6049 code_convert_string_norecord (string, coding_system, encodep)
6050      Lisp_Object string, coding_system;
6051      int encodep;
6052 {
6053   struct coding_system coding;
6054
6055   CHECK_STRING (string, 0);
6056   CHECK_SYMBOL (coding_system, 1);
6057
6058   if (NILP (coding_system))
6059     return string;
6060
6061   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6062     error ("Invalid coding system: %s", XSYMBOL (coding_system)->name->data);
6063
6064   coding.composing = COMPOSITION_DISABLED;
6065   coding.mode |= CODING_MODE_LAST_BLOCK;
6066   return (encodep
6067           ? encode_coding_string (string, &coding, 1)
6068           : decode_coding_string (string, &coding, 1));
6069 }
6070 \f
6071 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6072   "Decode a Japanese character which has CODE in shift_jis encoding.\n\
6073 Return the corresponding character.")
6074   (code)
6075      Lisp_Object code;
6076 {
6077   unsigned char c1, c2, s1, s2;
6078   Lisp_Object val;
6079
6080   CHECK_NUMBER (code, 0);
6081   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
6082   if (s1 == 0)
6083     {
6084       if (s2 < 0x80)
6085         XSETFASTINT (val, s2);
6086       else if (s2 >= 0xA0 || s2 <= 0xDF)
6087         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
6088       else
6089         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6090     }
6091   else
6092     {
6093       if ((s1 < 0x80 || s1 > 0x9F && s1 < 0xE0 || s1 > 0xEF)
6094           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
6095         error ("Invalid Shift JIS code: %x", XFASTINT (code));
6096       DECODE_SJIS (s1, s2, c1, c2);
6097       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
6098     }
6099   return val;
6100 }
6101
6102 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6103   "Encode a Japanese character CHAR to shift_jis encoding.\n\
6104 Return the corresponding code in SJIS.")
6105   (ch)
6106      Lisp_Object ch;
6107 {
6108   int charset, c1, c2, s1, s2;
6109   Lisp_Object val;
6110
6111   CHECK_NUMBER (ch, 0);
6112   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6113   if (charset == CHARSET_ASCII)
6114     {
6115       val = ch;
6116     }
6117   else if (charset == charset_jisx0208
6118            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
6119     {
6120       ENCODE_SJIS (c1, c2, s1, s2);
6121       XSETFASTINT (val, (s1 << 8) | s2);
6122     }
6123   else if (charset == charset_katakana_jisx0201
6124            && c1 > 0x20 && c2 < 0xE0)
6125     {
6126       XSETFASTINT (val, c1 | 0x80);
6127     }
6128   else
6129     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
6130   return val;
6131 }
6132
6133 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6134   "Decode a Big5 character which has CODE in BIG5 coding system.\n\
6135 Return the corresponding character.")
6136   (code)
6137      Lisp_Object code;
6138 {
6139   int charset;
6140   unsigned char b1, b2, c1, c2;
6141   Lisp_Object val;
6142
6143   CHECK_NUMBER (code, 0);
6144   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
6145   if (b1 == 0)
6146     {
6147       if (b2 >= 0x80)
6148         error ("Invalid BIG5 code: %x", XFASTINT (code));
6149       val = code;
6150     }
6151   else
6152     {
6153       if ((b1 < 0xA1 || b1 > 0xFE)
6154           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
6155         error ("Invalid BIG5 code: %x", XFASTINT (code));
6156       DECODE_BIG5 (b1, b2, charset, c1, c2);
6157       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
6158     }
6159   return val;
6160 }
6161
6162 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6163   "Encode the Big5 character CHAR to BIG5 coding system.\n\
6164 Return the corresponding character code in Big5.")
6165   (ch)
6166      Lisp_Object ch;
6167 {
6168   int charset, c1, c2, b1, b2;
6169   Lisp_Object val;
6170
6171   CHECK_NUMBER (ch, 0);
6172   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
6173   if (charset == CHARSET_ASCII)
6174     {
6175       val = ch;
6176     }
6177   else if ((charset == charset_big5_1
6178             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
6179            || (charset == charset_big5_2
6180                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
6181     {
6182       ENCODE_BIG5 (charset, c1, c2, b1, b2);
6183       XSETFASTINT (val, (b1 << 8) | b2);
6184     }
6185   else
6186     error ("Can't encode to Big5: %d", XFASTINT (ch));
6187   return val;
6188 }
6189 \f
6190 DEFUN ("set-terminal-coding-system-internal",
6191        Fset_terminal_coding_system_internal,
6192        Sset_terminal_coding_system_internal, 1, 1, 0, "")
6193   (coding_system)
6194      Lisp_Object coding_system;
6195 {
6196   CHECK_SYMBOL (coding_system, 0);
6197   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
6198   /* We had better not send unsafe characters to terminal.  */
6199   terminal_coding.flags |= CODING_FLAG_ISO_SAFE;
6200   /* Characer composition should be disabled.  */
6201   terminal_coding.composing = COMPOSITION_DISABLED;
6202   terminal_coding.src_multibyte = 1;
6203   terminal_coding.dst_multibyte = 0;
6204   return Qnil;
6205 }
6206
6207 DEFUN ("set-safe-terminal-coding-system-internal",
6208        Fset_safe_terminal_coding_system_internal,
6209        Sset_safe_terminal_coding_system_internal, 1, 1, 0, "")
6210   (coding_system)
6211      Lisp_Object coding_system;
6212 {
6213   CHECK_SYMBOL (coding_system, 0);
6214   setup_coding_system (Fcheck_coding_system (coding_system),
6215                        &safe_terminal_coding);
6216   /* Characer composition should be disabled.  */
6217   safe_terminal_coding.composing = COMPOSITION_DISABLED;
6218   safe_terminal_coding.src_multibyte = 1;
6219   safe_terminal_coding.dst_multibyte = 0;
6220   return Qnil;
6221 }
6222
6223 DEFUN ("terminal-coding-system",
6224        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6225   "Return coding system specified for terminal output.")
6226   ()
6227 {
6228   return terminal_coding.symbol;
6229 }
6230
6231 DEFUN ("set-keyboard-coding-system-internal",
6232        Fset_keyboard_coding_system_internal,
6233        Sset_keyboard_coding_system_internal, 1, 1, 0, "")
6234   (coding_system)
6235      Lisp_Object coding_system;
6236 {
6237   CHECK_SYMBOL (coding_system, 0);
6238   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
6239   /* Characer composition should be disabled.  */
6240   keyboard_coding.composing = COMPOSITION_DISABLED;
6241   return Qnil;
6242 }
6243
6244 DEFUN ("keyboard-coding-system",
6245        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6246   "Return coding system specified for decoding keyboard input.")
6247   ()
6248 {
6249   return keyboard_coding.symbol;
6250 }
6251
6252 \f
6253 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
6254        Sfind_operation_coding_system,  1, MANY, 0,
6255   "Choose a coding system for an operation based on the target name.\n\
6256 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).\n\
6257 DECODING-SYSTEM is the coding system to use for decoding\n\
6258 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system\n\
6259 for encoding (in case OPERATION does encoding).\n\
6260 \n\
6261 The first argument OPERATION specifies an I/O primitive:\n\
6262   For file I/O, `insert-file-contents' or `write-region'.\n\
6263   For process I/O, `call-process', `call-process-region', or `start-process'.\n\
6264   For network I/O, `open-network-stream'.\n\
6265 \n\
6266 The remaining arguments should be the same arguments that were passed\n\
6267 to the primitive.  Depending on which primitive, one of those arguments\n\
6268 is selected as the TARGET.  For example, if OPERATION does file I/O,\n\
6269 whichever argument specifies the file name is TARGET.\n\
6270 \n\
6271 TARGET has a meaning which depends on OPERATION:\n\
6272   For file I/O, TARGET is a file name.\n\
6273   For process I/O, TARGET is a process name.\n\
6274   For network I/O, TARGET is a service name or a port number\n\
6275 \n\
6276 This function looks up what specified for TARGET in,\n\
6277 `file-coding-system-alist', `process-coding-system-alist',\n\
6278 or `network-coding-system-alist' depending on OPERATION.\n\
6279 They may specify a coding system, a cons of coding systems,\n\
6280 or a function symbol to call.\n\
6281 In the last case, we call the function with one argument,\n\
6282 which is a list of all the arguments given to this function.")
6283   (nargs, args)
6284      int nargs;
6285      Lisp_Object *args;
6286 {
6287   Lisp_Object operation, target_idx, target, val;
6288   register Lisp_Object chain;
6289
6290   if (nargs < 2)
6291     error ("Too few arguments");
6292   operation = args[0];
6293   if (!SYMBOLP (operation)
6294       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
6295     error ("Invalid first arguement");
6296   if (nargs < 1 + XINT (target_idx))
6297     error ("Too few arguments for operation: %s",
6298            XSYMBOL (operation)->name->data);
6299   target = args[XINT (target_idx) + 1];
6300   if (!(STRINGP (target)
6301         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
6302     error ("Invalid %dth argument", XINT (target_idx) + 1);
6303
6304   chain = ((EQ (operation, Qinsert_file_contents)
6305             || EQ (operation, Qwrite_region))
6306            ? Vfile_coding_system_alist
6307            : (EQ (operation, Qopen_network_stream)
6308               ? Vnetwork_coding_system_alist
6309               : Vprocess_coding_system_alist));
6310   if (NILP (chain))
6311     return Qnil;
6312
6313   for (; CONSP (chain); chain = XCDR (chain))
6314     {
6315       Lisp_Object elt;
6316       elt = XCAR (chain);
6317
6318       if (CONSP (elt)
6319           && ((STRINGP (target)
6320                && STRINGP (XCAR (elt))
6321                && fast_string_match (XCAR (elt), target) >= 0)
6322               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
6323         {
6324           val = XCDR (elt);
6325           /* Here, if VAL is both a valid coding system and a valid
6326              function symbol, we return VAL as a coding system.  */
6327           if (CONSP (val))
6328             return val;
6329           if (! SYMBOLP (val))
6330             return Qnil;
6331           if (! NILP (Fcoding_system_p (val)))
6332             return Fcons (val, val);
6333           if (! NILP (Ffboundp (val)))
6334             {
6335               val = call1 (val, Flist (nargs, args));
6336               if (CONSP (val))
6337                 return val;
6338               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
6339                 return Fcons (val, val);
6340             }
6341           return Qnil;
6342         }
6343     }
6344   return Qnil;
6345 }
6346
6347 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
6348        Supdate_coding_systems_internal, 0, 0, 0,
6349   "Update internal database for ISO2022 and CCL based coding systems.\n\
6350 When values of any coding categories are changed, you must\n\
6351 call this function")
6352   ()
6353 {
6354   int i;
6355
6356   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
6357     {
6358       Lisp_Object val;
6359
6360       val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value;
6361       if (!NILP (val))
6362         {
6363           if (! coding_system_table[i])
6364             coding_system_table[i] = ((struct coding_system *)
6365                                       xmalloc (sizeof (struct coding_system)));
6366           setup_coding_system (val, coding_system_table[i]);
6367         }
6368       else if (coding_system_table[i])
6369         {
6370           xfree (coding_system_table[i]);
6371           coding_system_table[i] = NULL;
6372         }
6373     }
6374
6375   return Qnil;
6376 }
6377
6378 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
6379        Sset_coding_priority_internal, 0, 0, 0,
6380   "Update internal database for the current value of `coding-category-list'.\n\
6381 This function is internal use only.")
6382   ()
6383 {
6384   int i = 0, idx;
6385   Lisp_Object val;
6386
6387   val = Vcoding_category_list;
6388
6389   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
6390     {
6391       if (! SYMBOLP (XCAR (val)))
6392         break;
6393       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
6394       if (idx >= CODING_CATEGORY_IDX_MAX)
6395         break;
6396       coding_priorities[i++] = (1 << idx);
6397       val = XCDR (val);
6398     }
6399   /* If coding-category-list is valid and contains all coding
6400      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
6401      the following code saves Emacs from crashing.  */
6402   while (i < CODING_CATEGORY_IDX_MAX)
6403     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
6404
6405   return Qnil;
6406 }
6407
6408 #endif /* emacs */
6409
6410 \f
6411 /*** 9. Post-amble ***/
6412
6413 void
6414 init_coding_once ()
6415 {
6416   int i;
6417
6418   /* Emacs' internal format specific initialize routine.  */
6419   for (i = 0; i <= 0x20; i++)
6420     emacs_code_class[i] = EMACS_control_code;
6421   emacs_code_class[0x0A] = EMACS_linefeed_code;
6422   emacs_code_class[0x0D] = EMACS_carriage_return_code;
6423   for (i = 0x21 ; i < 0x7F; i++)
6424     emacs_code_class[i] = EMACS_ascii_code;
6425   emacs_code_class[0x7F] = EMACS_control_code;
6426   for (i = 0x80; i < 0xFF; i++)
6427     emacs_code_class[i] = EMACS_invalid_code;
6428   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
6429   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
6430   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
6431   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
6432
6433   /* ISO2022 specific initialize routine.  */
6434   for (i = 0; i < 0x20; i++)
6435     iso_code_class[i] = ISO_control_0;
6436   for (i = 0x21; i < 0x7F; i++)
6437     iso_code_class[i] = ISO_graphic_plane_0;
6438   for (i = 0x80; i < 0xA0; i++)
6439     iso_code_class[i] = ISO_control_1;
6440   for (i = 0xA1; i < 0xFF; i++)
6441     iso_code_class[i] = ISO_graphic_plane_1;
6442   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
6443   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
6444   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
6445   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
6446   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
6447   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
6448   iso_code_class[ISO_CODE_ESC] = ISO_escape;
6449   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
6450   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
6451   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
6452
6453   setup_coding_system (Qnil, &keyboard_coding);
6454   setup_coding_system (Qnil, &terminal_coding);
6455   setup_coding_system (Qnil, &safe_terminal_coding);
6456   setup_coding_system (Qnil, &default_buffer_file_coding);
6457
6458   bzero (coding_system_table, sizeof coding_system_table);
6459
6460   bzero (ascii_skip_code, sizeof ascii_skip_code);
6461   for (i = 0; i < 128; i++)
6462     ascii_skip_code[i] = 1;
6463
6464 #if defined (MSDOS) || defined (WINDOWSNT)
6465   system_eol_type = CODING_EOL_CRLF;
6466 #else
6467   system_eol_type = CODING_EOL_LF;
6468 #endif
6469
6470   inhibit_pre_post_conversion = 0;
6471 }
6472
6473 #ifdef emacs
6474
6475 void
6476 syms_of_coding ()
6477 {
6478   Qtarget_idx = intern ("target-idx");
6479   staticpro (&Qtarget_idx);
6480
6481   Qcoding_system_history = intern ("coding-system-history");
6482   staticpro (&Qcoding_system_history);
6483   Fset (Qcoding_system_history, Qnil);
6484
6485   /* Target FILENAME is the first argument.  */
6486   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
6487   /* Target FILENAME is the third argument.  */
6488   Fput (Qwrite_region, Qtarget_idx, make_number (2));
6489
6490   Qcall_process = intern ("call-process");
6491   staticpro (&Qcall_process);
6492   /* Target PROGRAM is the first argument.  */
6493   Fput (Qcall_process, Qtarget_idx, make_number (0));
6494
6495   Qcall_process_region = intern ("call-process-region");
6496   staticpro (&Qcall_process_region);
6497   /* Target PROGRAM is the third argument.  */
6498   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
6499
6500   Qstart_process = intern ("start-process");
6501   staticpro (&Qstart_process);
6502   /* Target PROGRAM is the third argument.  */
6503   Fput (Qstart_process, Qtarget_idx, make_number (2));
6504
6505   Qopen_network_stream = intern ("open-network-stream");
6506   staticpro (&Qopen_network_stream);
6507   /* Target SERVICE is the fourth argument.  */
6508   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
6509
6510   Qcoding_system = intern ("coding-system");
6511   staticpro (&Qcoding_system);
6512
6513   Qeol_type = intern ("eol-type");
6514   staticpro (&Qeol_type);
6515
6516   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
6517   staticpro (&Qbuffer_file_coding_system);
6518
6519   Qpost_read_conversion = intern ("post-read-conversion");
6520   staticpro (&Qpost_read_conversion);
6521
6522   Qpre_write_conversion = intern ("pre-write-conversion");
6523   staticpro (&Qpre_write_conversion);
6524
6525   Qno_conversion = intern ("no-conversion");
6526   staticpro (&Qno_conversion);
6527
6528   Qundecided = intern ("undecided");
6529   staticpro (&Qundecided);
6530
6531   Qcoding_system_p = intern ("coding-system-p");
6532   staticpro (&Qcoding_system_p);
6533
6534   Qcoding_system_error = intern ("coding-system-error");
6535   staticpro (&Qcoding_system_error);
6536
6537   Fput (Qcoding_system_error, Qerror_conditions,
6538         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
6539   Fput (Qcoding_system_error, Qerror_message,
6540         build_string ("Invalid coding system"));
6541
6542   Qcoding_category = intern ("coding-category");
6543   staticpro (&Qcoding_category);
6544   Qcoding_category_index = intern ("coding-category-index");
6545   staticpro (&Qcoding_category_index);
6546
6547   Vcoding_category_table
6548     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
6549   staticpro (&Vcoding_category_table);
6550   {
6551     int i;
6552     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
6553       {
6554         XVECTOR (Vcoding_category_table)->contents[i]
6555           = intern (coding_category_name[i]);
6556         Fput (XVECTOR (Vcoding_category_table)->contents[i],
6557               Qcoding_category_index, make_number (i));
6558       }
6559   }
6560
6561   Qtranslation_table = intern ("translation-table");
6562   staticpro (&Qtranslation_table);
6563   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
6564
6565   Qtranslation_table_id = intern ("translation-table-id");
6566   staticpro (&Qtranslation_table_id);
6567
6568   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
6569   staticpro (&Qtranslation_table_for_decode);
6570
6571   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
6572   staticpro (&Qtranslation_table_for_encode);
6573
6574   Qsafe_chars = intern ("safe-chars");
6575   staticpro (&Qsafe_chars);
6576
6577   Qchar_coding_system = intern ("char-coding-system");
6578   staticpro (&Qchar_coding_system);
6579
6580   /* Intern this now in case it isn't already done.
6581      Setting this variable twice is harmless.
6582      But don't staticpro it here--that is done in alloc.c.  */
6583   Qchar_table_extra_slots = intern ("char-table-extra-slots");
6584   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
6585   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (1));
6586
6587   Qvalid_codes = intern ("valid-codes");
6588   staticpro (&Qvalid_codes);
6589
6590   Qemacs_mule = intern ("emacs-mule");
6591   staticpro (&Qemacs_mule);
6592
6593   Qraw_text = intern ("raw-text");
6594   staticpro (&Qraw_text);
6595
6596   defsubr (&Scoding_system_p);
6597   defsubr (&Sread_coding_system);
6598   defsubr (&Sread_non_nil_coding_system);
6599   defsubr (&Scheck_coding_system);
6600   defsubr (&Sdetect_coding_region);
6601   defsubr (&Sdetect_coding_string);
6602   defsubr (&Sfind_coding_systems_region_internal);
6603   defsubr (&Sdecode_coding_region);
6604   defsubr (&Sencode_coding_region);
6605   defsubr (&Sdecode_coding_string);
6606   defsubr (&Sencode_coding_string);
6607   defsubr (&Sdecode_sjis_char);
6608   defsubr (&Sencode_sjis_char);
6609   defsubr (&Sdecode_big5_char);
6610   defsubr (&Sencode_big5_char);
6611   defsubr (&Sset_terminal_coding_system_internal);
6612   defsubr (&Sset_safe_terminal_coding_system_internal);
6613   defsubr (&Sterminal_coding_system);
6614   defsubr (&Sset_keyboard_coding_system_internal);
6615   defsubr (&Skeyboard_coding_system);
6616   defsubr (&Sfind_operation_coding_system);
6617   defsubr (&Supdate_coding_systems_internal);
6618   defsubr (&Sset_coding_priority_internal);
6619
6620   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
6621     "List of coding systems.\n\
6622 \n\
6623 Do not alter the value of this variable manually.  This variable should be\n\
6624 updated by the functions `make-coding-system' and\n\
6625 `define-coding-system-alias'.");
6626   Vcoding_system_list = Qnil;
6627
6628   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
6629     "Alist of coding system names.\n\
6630 Each element is one element list of coding system name.\n\
6631 This variable is given to `completing-read' as TABLE argument.\n\
6632 \n\
6633 Do not alter the value of this variable manually.  This variable should be\n\
6634 updated by the functions `make-coding-system' and\n\
6635 `define-coding-system-alias'.");
6636   Vcoding_system_alist = Qnil;
6637
6638   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
6639     "List of coding-categories (symbols) ordered by priority.");
6640   {
6641     int i;
6642
6643     Vcoding_category_list = Qnil;
6644     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
6645       Vcoding_category_list
6646         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
6647                  Vcoding_category_list);
6648   }
6649
6650   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
6651     "Specify the coding system for read operations.\n\
6652 It is useful to bind this variable with `let', but do not set it globally.\n\
6653 If the value is a coding system, it is used for decoding on read operation.\n\
6654 If not, an appropriate element is used from one of the coding system alists:\n\
6655 There are three such tables, `file-coding-system-alist',\n\
6656 `process-coding-system-alist', and `network-coding-system-alist'.");
6657   Vcoding_system_for_read = Qnil;
6658
6659   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
6660     "Specify the coding system for write operations.\n\
6661 Programs bind this variable with `let', but you should not set it globally.\n\
6662 If the value is a coding system, it is used for encoding of output,\n\
6663 when writing it to a file and when sending it to a file or subprocess.\n\
6664 \n\
6665 If this does not specify a coding system, an appropriate element\n\
6666 is used from one of the coding system alists:\n\
6667 There are three such tables, `file-coding-system-alist',\n\
6668 `process-coding-system-alist', and `network-coding-system-alist'.\n\
6669 For output to files, if the above procedure does not specify a coding system,\n\
6670 the value of `buffer-file-coding-system' is used.");
6671   Vcoding_system_for_write = Qnil;
6672
6673   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
6674     "Coding system used in the latest file or process I/O.");
6675   Vlast_coding_system_used = Qnil;
6676
6677   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
6678     "*Non-nil means always inhibit code conversion of end-of-line format.\n\
6679 See info node `Coding Systems' and info node `Text and Binary' concerning\n\
6680 such conversion.");
6681   inhibit_eol_conversion = 0;
6682
6683   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
6684     "Non-nil means process buffer inherits coding system of process output.\n\
6685 Bind it to t if the process output is to be treated as if it were a file\n\
6686 read from some filesystem.");
6687   inherit_process_coding_system = 0;
6688
6689   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
6690     "Alist to decide a coding system to use for a file I/O operation.\n\
6691 The format is ((PATTERN . VAL) ...),\n\
6692 where PATTERN is a regular expression matching a file name,\n\
6693 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6694 If VAL is a coding system, it is used for both decoding and encoding\n\
6695 the file contents.\n\
6696 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6697 and the cdr part is used for encoding.\n\
6698 If VAL is a function symbol, the function must return a coding system\n\
6699 or a cons of coding systems which are used as above.\n\
6700 \n\
6701 See also the function `find-operation-coding-system'\n\
6702 and the variable `auto-coding-alist'.");
6703   Vfile_coding_system_alist = Qnil;
6704
6705   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
6706     "Alist to decide a coding system to use for a process I/O operation.\n\
6707 The format is ((PATTERN . VAL) ...),\n\
6708 where PATTERN is a regular expression matching a program name,\n\
6709 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6710 If VAL is a coding system, it is used for both decoding what received\n\
6711 from the program and encoding what sent to the program.\n\
6712 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6713 and the cdr part is used for encoding.\n\
6714 If VAL is a function symbol, the function must return a coding system\n\
6715 or a cons of coding systems which are used as above.\n\
6716 \n\
6717 See also the function `find-operation-coding-system'.");
6718   Vprocess_coding_system_alist = Qnil;
6719
6720   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
6721     "Alist to decide a coding system to use for a network I/O operation.\n\
6722 The format is ((PATTERN . VAL) ...),\n\
6723 where PATTERN is a regular expression matching a network service name\n\
6724 or is a port number to connect to,\n\
6725 VAL is a coding system, a cons of coding systems, or a function symbol.\n\
6726 If VAL is a coding system, it is used for both decoding what received\n\
6727 from the network stream and encoding what sent to the network stream.\n\
6728 If VAL is a cons of coding systems, the car part is used for decoding,\n\
6729 and the cdr part is used for encoding.\n\
6730 If VAL is a function symbol, the function must return a coding system\n\
6731 or a cons of coding systems which are used as above.\n\
6732 \n\
6733 See also the function `find-operation-coding-system'.");
6734   Vnetwork_coding_system_alist = Qnil;
6735
6736   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
6737     "Coding system to use with system messages.");
6738   Vlocale_coding_system = Qnil;
6739
6740   /* The eol mnemonics are reset in startup.el system-dependently.  */
6741   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
6742     "*String displayed in mode line for UNIX-like (LF) end-of-line format.");
6743   eol_mnemonic_unix = build_string (":");
6744
6745   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
6746     "*String displayed in mode line for DOS-like (CRLF) end-of-line format.");
6747   eol_mnemonic_dos = build_string ("\\");
6748
6749   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
6750     "*String displayed in mode line for MAC-like (CR) end-of-line format.");
6751   eol_mnemonic_mac = build_string ("/");
6752
6753   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
6754     "*String displayed in mode line when end-of-line format is not yet determined.");
6755   eol_mnemonic_undecided = build_string (":");
6756
6757   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
6758     "*Non-nil enables character translation while encoding and decoding.");
6759   Venable_character_translation = Qt;
6760
6761   DEFVAR_LISP ("standard-translation-table-for-decode",
6762     &Vstandard_translation_table_for_decode,
6763     "Table for translating characters while decoding.");
6764   Vstandard_translation_table_for_decode = Qnil;
6765
6766   DEFVAR_LISP ("standard-translation-table-for-encode",
6767     &Vstandard_translation_table_for_encode,
6768     "Table for translationg characters while encoding.");
6769   Vstandard_translation_table_for_encode = Qnil;
6770
6771   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
6772     "Alist of charsets vs revision numbers.\n\
6773 While encoding, if a charset (car part of an element) is found,\n\
6774 designate it with the escape sequence identifing revision (cdr part of the element).");
6775   Vcharset_revision_alist = Qnil;
6776
6777   DEFVAR_LISP ("default-process-coding-system",
6778                &Vdefault_process_coding_system,
6779     "Cons of coding systems used for process I/O by default.\n\
6780 The car part is used for decoding a process output,\n\
6781 the cdr part is used for encoding a text to be sent to a process.");
6782   Vdefault_process_coding_system = Qnil;
6783
6784   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
6785     "Table of extra Latin codes in the range 128..159 (inclusive).\n\
6786 This is a vector of length 256.\n\
6787 If Nth element is non-nil, the existence of code N in a file\n\
6788 \(or output of subprocess) doesn't prevent it to be detected as\n\
6789 a coding system of ISO 2022 variant which has a flag\n\
6790 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file\n\
6791 or reading output of a subprocess.\n\
6792 Only 128th through 159th elements has a meaning.");
6793   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
6794
6795   DEFVAR_LISP ("select-safe-coding-system-function",
6796                &Vselect_safe_coding_system_function,
6797     "Function to call to select safe coding system for encoding a text.\n\
6798 \n\
6799 If set, this function is called to force a user to select a proper\n\
6800 coding system which can encode the text in the case that a default\n\
6801 coding system used in each operation can't encode the text.\n\
6802 \n\
6803 The default value is `select-safe-coding-system' (which see).");
6804   Vselect_safe_coding_system_function = Qnil;
6805
6806   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
6807     "Char-table containing safe coding systems of each characters.\n\
6808 Each element doesn't include such generic coding systems that can\n\
6809 encode any characters.   They are in the first extra slot.");
6810   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
6811
6812   DEFVAR_BOOL ("inhibit-iso-escape-detection",
6813                &inhibit_iso_escape_detection,
6814     "If non-nil, Emacs ignores ISO2022's escape sequence on code detection.\n\
6815 \n\
6816 By default, on reading a file, Emacs tries to detect how the text is\n\
6817 encoded.  This code detection is sensitive to escape sequences.  If\n\
6818 the sequence is valid as ISO2022, the code is determined as one of\n\
6819 the ISO2022 encodings, and the file is decoded by the corresponding\n\
6820 coding system (e.g. `iso-2022-7bit').\n\
6821 \n\
6822 However, there may be a case that you want to read escape sequences in\n\
6823 a file as is.  In such a case, you can set this variable to non-nil.\n\
6824 Then, as the code detection ignores any escape sequences, no file is\n\
6825 detected as encoded in some ISO2022 encoding.  The result is that all\n\
6826 escape sequences become visible in a buffer.\n\
6827 \n\
6828 The default value is nil, and it is strongly recommended not to change\n\
6829 it.  That is because many Emacs Lisp source files that contain\n\
6830 non-ASCII characters are encoded by the coding system `iso-2022-7bit'\n\
6831 in Emacs's distribution, and they won't be decoded correctly on\n\
6832 reading if you suppress escape sequence detection.\n\
6833 \n\
6834 The other way to read escape sequences in a file without decoding is\n\
6835 to explicitly specify some coding system that doesn't use ISO2022's\n\
6836 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].");
6837   inhibit_iso_escape_detection = 0;
6838 }
6839
6840 char *
6841 emacs_strerror (error_number)
6842      int error_number;
6843 {
6844   char *str;
6845
6846   synchronize_system_messages_locale ();
6847   str = strerror (error_number);
6848
6849   if (! NILP (Vlocale_coding_system))
6850     {
6851       Lisp_Object dec = code_convert_string_norecord (build_string (str),
6852                                                       Vlocale_coding_system,
6853                                                       0);
6854       str = (char *) XSTRING (dec)->data;
6855     }
6856
6857   return str;
6858 }
6859
6860 #endif /* emacs */
6861