src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking up
  63   Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted,
 171          jump to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173       /* A non-conforming byte rejects XXX; a strong hint records it.  */
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted successfully.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size.
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source text, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it.  */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && (coding->mode & CODING_MODE_LAST_BLOCK))
 233     /* If the source ends with partial bytes of a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that the source and destination areas
 258   overlap, which means that we can produce encoded text only until it
 259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274   /* Stop early enough that one iteration cannot run past DST_END.  */
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292
 293 #include "lisp.h"
 294 #include "buffer.h"
 295 #include "character.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 307 Lisp_Object Qunix, Qdos;
 308 extern Lisp_Object Qmac;        /* frame.c */
 309 Lisp_Object Qbuffer_file_coding_system;
 310 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 311 Lisp_Object Qdefault_char;
 312 Lisp_Object Qno_conversion, Qundecided;
 313 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 314 Lisp_Object Qbig, Qlittle;
 315 Lisp_Object Qcoding_system_history;
 316 Lisp_Object Qvalid_codes;
 317 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 318 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 319 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 320 Lisp_Object QCascii_compatible_p;
 321
 322 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 323 Lisp_Object Qcall_process, Qcall_process_region;
 324 Lisp_Object Qstart_process, Qopen_network_stream;
 325 Lisp_Object Qtarget_idx;
 326
 327 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 328 Lisp_Object Qinterrupted, Qinsufficient_memory;
 329
 330 extern Lisp_Object Qcompletion_ignore_case;
 331
 332 /* If a symbol has this property, evaluate the value to define the
 333    symbol as a coding system.  */
 334 static Lisp_Object Qcoding_system_define_form;
 335
 336 int coding_system_require_warning;
 337
 338 Lisp_Object Vselect_safe_coding_system_function;
 339
 340 /* Mnemonic string for each format of end-of-line.  */
 341 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 342 /* Mnemonic string to indicate format of end-of-line is not yet
 343    decided.  */
 344 Lisp_Object eol_mnemonic_undecided;
 345
 346 /* Format of end-of-line decided by system.  This is Qunix on
 347    Unix and Mac, Qdos on DOS/Windows.
 348    This has an effect only for external encoding (i.e. for output to
 349    file and process), not for in-buffer or Lisp string encoding.  */
 350 static Lisp_Object system_eol_type;
 351
 352 #ifdef emacs
 353
 354 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 355
 356 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 357
 358 /* Coding system emacs-mule and raw-text are for converting only
 359    end-of-line format.  */
 360 Lisp_Object Qemacs_mule, Qraw_text;
 361 Lisp_Object Qutf_8_emacs;
 362
 363 /* Coding-systems are handed between Emacs Lisp programs and C internal
 364    routines by the following three variables.  */
 365 /* Coding-system for reading files and receiving data from process.  */
 366 Lisp_Object Vcoding_system_for_read;
 367 /* Coding-system for writing files and sending data to process.  */
 368 Lisp_Object Vcoding_system_for_write;
 369 /* Coding-system actually used in the latest I/O.  */
 370 Lisp_Object Vlast_coding_system_used;
 371 /* Set to non-nil when an error is detected during code conversion.  */
 372 Lisp_Object Vlast_code_conversion_error;
 373 /* A vector of length 256 which contains information about special
 374    Latin codes (especially for dealing with Microsoft codes).  */
 375 Lisp_Object Vlatin_extra_code_table;
 376
 377 /* Flag to inhibit code conversion of end-of-line format.  */
 378 int inhibit_eol_conversion;
 379
 380 /* Flag to inhibit ISO2022 escape sequence detection.  */
 381 int inhibit_iso_escape_detection;
 382
 383 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 384 int inherit_process_coding_system;
 385
 386 /* Coding system to be used to encode text for terminal display when
 387    terminal coding system is nil.  */
 388 struct coding_system safe_terminal_coding;
 389
 390 Lisp_Object Vfile_coding_system_alist;
 391 Lisp_Object Vprocess_coding_system_alist;
 392 Lisp_Object Vnetwork_coding_system_alist;
 393
 394 Lisp_Object Vlocale_coding_system;
 395
 396 #endif /* emacs */
 397
 398 /* Flag to tell if we look up translation table on character code
 399    conversion.  */
 400 Lisp_Object Venable_character_translation;
 401 /* Standard translation table to look up on decoding (reading).  */
 402 Lisp_Object Vstandard_translation_table_for_decode;
 403 /* Standard translation table to look up on encoding (writing).  */
 404 Lisp_Object Vstandard_translation_table_for_encode;
 405
 406 Lisp_Object Qtranslation_table;
 407 Lisp_Object Qtranslation_table_id;
 408 Lisp_Object Qtranslation_table_for_decode;
 409 Lisp_Object Qtranslation_table_for_encode;
 410
 411 /* Alist of charsets vs revision number.  */
 412 static Lisp_Object Vcharset_revision_table;
 413
 414 /* Default coding systems used for process I/O.  */
 415 Lisp_Object Vdefault_process_coding_system;
 416
 417 /* Char table for translating Quail and self-inserting input.  */
 418 Lisp_Object Vtranslation_table_for_input;
 419
 420 /* Two special coding systems.  */
 421 Lisp_Object Vsjis_coding_system;
 422 Lisp_Object Vbig5_coding_system;
 423
 424 /* ISO2022 section */
 425 /* Element REG of CODING's `coding_attr_iso_initial' attribute, converted by XINT.  */
 426 #define CODING_ISO_INITIAL(coding, reg)                 \
 427   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 428                      coding_attr_iso_initial),          \
 429                reg)))
 430
 431 /* safe_charsets[CHARSET_ID] of CODING, or -1 if CHARSET_ID exceeds max_charset_id.  Evaluates CHARSET_ID twice.  */
 432 #define CODING_ISO_REQUEST(coding, charset_id)  \
 433   (((charset_id) <= (coding)->max_charset_id    \
 434     ? (coding)->safe_charsets[charset_id]       \
 435     : -1))
 436
 437 /* Accessors for CODING->spec.iso_2022; INVOKED_CHARSET is current_designation[current_invocation[PLANE]].  */
 438 #define CODING_ISO_FLAGS(coding)        \
 439   ((coding)->spec.iso_2022.flags)
 440 #define CODING_ISO_DESIGNATION(coding, reg)     \
 441   ((coding)->spec.iso_2022.current_designation[reg])
 442 #define CODING_ISO_INVOCATION(coding, plane)    \
 443   ((coding)->spec.iso_2022.current_invocation[plane])
 444 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 445   ((coding)->spec.iso_2022.single_shifting)
 446 #define CODING_ISO_BOL(coding)  \
 447   ((coding)->spec.iso_2022.bol)
 448 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 449   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 450
 451 /* Control characters of ISO2022.  */
 452                         /* code */      /* function */
 453 #define ISO_CODE_LF     0x0A            /* line-feed */
 454 #define ISO_CODE_CR     0x0D            /* carriage-return */
 455 #define ISO_CODE_SO     0x0E            /* shift-out */
 456 #define ISO_CODE_SI     0x0F            /* shift-in */
 457 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 458 #define ISO_CODE_ESC    0x1B            /* escape */
 459 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 460 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 461 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 462
 463 /* Every 1-byte code of ISO2022 is classified into one of the
 464    following classes.  */
 465 enum iso_code_class_type
 466   {
 467     ISO_control_0,              /* Control codes in the range
 468                                    0x00..0x1F and 0x7F, except for the
 469                                    following 4 codes.  */
 470     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 471     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 472     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 473     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 474     ISO_control_1,              /* Control codes in the range
 475                                    0x80..0x9F, except for the
 476                                    following 3 codes.  */
 477     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 478     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 479     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 480     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 481     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 482     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 483     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 484   };
 485
 486 /** Each CODING_ISO_FLAG_XXX macro defines a flag bit of the
 487     `iso-flags' attribute of an iso2022 coding system.  */
 488
 489 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 490    instead of the correct short-form sequence (e.g. ESC $ A).  */
 491 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 492
 493 /* If set, reset graphic planes and registers at end-of-line to the
 494    initial state.  */
 495 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 496
 497 /* If set, reset graphic planes and registers before any control
 498    characters to the initial state.  */
 499 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 500
 501 /* If set, encode by 7-bit environment.  */
 502 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 503
 504 /* If set, use locking-shift function.  */
 505 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 506
 507 /* If set, use single-shift function.  Overwrite
 508    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 509 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 510
 511 /* If set, use designation escape sequence.  */
 512 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 513
 514 /* If set, produce revision number sequence.  */
 515 #define CODING_ISO_FLAG_REVISION        0x0080
 516
 517 /* If set, produce ISO6429's direction specifying sequence.  */
 518 #define CODING_ISO_FLAG_DIRECTION       0x0100
 519
 520 /* If set, assume designation states are reset at beginning of line on
 521    output.  */
 522 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 523
 524 /* If set, designation sequence should be placed at beginning of line
 525    on output.  */
 526 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 527
 528 /* If set, do not encode unsafe characters on output.  */
 529 #define CODING_ISO_FLAG_SAFE            0x0800
 530
 531 /* If set, extra latin codes (128..159) are accepted as a valid code
 532    on input.  */
 533 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 534
 535 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 536
 537 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 538
 539 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 540
 541 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 542
 543 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 544
 545 /* A character to be produced on output if encoding of the original
 546    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 547 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 548
 549 /* UTF-8 section */
 550 #define CODING_UTF_8_BOM(coding)        \
 551   ((coding)->spec.utf_8_bom)
 552
 553 /* UTF-16 section */
 554 #define CODING_UTF_16_BOM(coding)       \
 555   ((coding)->spec.utf_16.bom)
 556
 557 #define CODING_UTF_16_ENDIAN(coding)    \
 558   ((coding)->spec.utf_16.endian)
 559
 560 #define CODING_UTF_16_SURROGATE(coding) \
 561   ((coding)->spec.utf_16.surrogate)
 562
 563
 564 /* CCL section */
 565 #define CODING_CCL_DECODER(coding)      \
 566   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 567 #define CODING_CCL_ENCODER(coding)      \
 568   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 569 #define CODING_CCL_VALIDS(coding)                                          \
 570   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 571
 572 /* Index for each coding category in the `coding_categories' array.  */
 573 /* Each value up to coding_category_raw_text is also the shift count of its CATEGORY_MASK_XXX bit.  */
 574 enum coding_category
 575   {
 576     coding_category_iso_7,
 577     coding_category_iso_7_tight,
 578     coding_category_iso_8_1,
 579     coding_category_iso_8_2,
 580     coding_category_iso_7_else,
 581     coding_category_iso_8_else,
 582     coding_category_utf_8_auto,
 583     coding_category_utf_8_nosig,
 584     coding_category_utf_8_sig,
 585     coding_category_utf_16_auto,
 586     coding_category_utf_16_be,
 587     coding_category_utf_16_le,
 588     coding_category_utf_16_be_nosig,
 589     coding_category_utf_16_le_nosig,
 590     coding_category_charset,
 591     coding_category_sjis,
 592     coding_category_big5,
 593     coding_category_ccl,
 594     coding_category_emacs_mule,
 595     /* All above are targets of code detection.  */
 596     coding_category_raw_text,
 597     coding_category_undecided,
 598     coding_category_max         /* Number of categories; sizes the per-category arrays.  */
 599   };
 600
 601 /* Definitions of flag bits used in detect_coding_XXXX.  */
 602 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 603 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 604 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 605 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 606 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 607 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 608 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 609 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 610 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 611 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 612 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 613 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 614 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 615 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 616 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 617 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 618 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 619 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 620 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 621 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 622
 623 /* This value is returned if detect_coding_mask () finds nothing other
 624    than ASCII characters.  */
 625 #define CATEGORY_MASK_ANY               \
 626   (CATEGORY_MASK_ISO_7                  \
 627    | CATEGORY_MASK_ISO_7_TIGHT          \
 628    | CATEGORY_MASK_ISO_8_1              \
 629    | CATEGORY_MASK_ISO_8_2              \
 630    | CATEGORY_MASK_ISO_7_ELSE           \
 631    | CATEGORY_MASK_ISO_8_ELSE           \
 632    | CATEGORY_MASK_UTF_8_AUTO           \
 633    | CATEGORY_MASK_UTF_8_NOSIG          \
 634    | CATEGORY_MASK_UTF_8_SIG            \
 635    | CATEGORY_MASK_UTF_16_AUTO          \
 636    | CATEGORY_MASK_UTF_16_BE            \
 637    | CATEGORY_MASK_UTF_16_LE            \
 638    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 639    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 640    | CATEGORY_MASK_CHARSET              \
 641    | CATEGORY_MASK_SJIS                 \
 642    | CATEGORY_MASK_BIG5                 \
 643    | CATEGORY_MASK_CCL                  \
 644    | CATEGORY_MASK_EMACS_MULE)
 645
 646
 647 #define CATEGORY_MASK_ISO_7BIT \
 648   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 649
 650 #define CATEGORY_MASK_ISO_8BIT \
 651   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 652
 653 #define CATEGORY_MASK_ISO_ELSE \
 654   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 655
 656 #define CATEGORY_MASK_ISO_ESCAPE        \
 657   (CATEGORY_MASK_ISO_7                  \
 658    | CATEGORY_MASK_ISO_7_TIGHT          \
 659    | CATEGORY_MASK_ISO_7_ELSE           \
 660    | CATEGORY_MASK_ISO_8_ELSE)
 661
 662 #define CATEGORY_MASK_ISO       \
 663   (  CATEGORY_MASK_ISO_7BIT     \
 664      | CATEGORY_MASK_ISO_8BIT   \
 665      | CATEGORY_MASK_ISO_ELSE)
 666
 667 #define CATEGORY_MASK_UTF_16            \
 668   (CATEGORY_MASK_UTF_16_AUTO            \
 669    | CATEGORY_MASK_UTF_16_BE            \
 670    | CATEGORY_MASK_UTF_16_LE            \
 671    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 672    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 673
 674 #define CATEGORY_MASK_UTF_8     \
 675   (CATEGORY_MASK_UTF_8_AUTO     \
 676    | CATEGORY_MASK_UTF_8_NOSIG  \
 677    | CATEGORY_MASK_UTF_8_SIG)
 678
 679 /* List of symbols `coding-category-xxx' ordered by priority.  This
 680    variable is exposed to Emacs Lisp.  */
 681 static Lisp_Object Vcoding_category_list;
 682
 683 /* Table of coding categories (Lisp symbols).  This variable is for
 684    internal use only.  */
 685 static Lisp_Object Vcoding_category_table;
 686
 687 /* Table of coding-categories ordered by priority.  */
 688 static enum coding_category coding_priorities[coding_category_max];
 689
 690 /* Nth element is a coding context for the coding system bound to the
 691    Nth coding category.  */
 692 static struct coding_system coding_categories[coding_category_max];
 693
 694 /*** Commonly used macros and functions ***/
 695 /* Fallback definitions of min and max; each may evaluate an argument twice.  */
 696 #ifndef min
 697 #define min(a, b) ((a) < (b) ? (a) : (b))
 698 #endif
 699 #ifndef max
 700 #define max(a, b) ((a) > (b) ? (a) : (b))
 701 #endif
 702 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to the charset list of ATTRS.  */
 703 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 704   do {                                                  \
 705     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 706     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 707   } while (0)
 708
 709
 710 /* Safely get one byte from the source text pointed to by SRC, which
 711    ends at SRC_END, set C to it, and increment CONSUMED_CHARS.  If no
 712    byte is left, jump to `no_more_source', first recording
 713    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If MULTIBYTEP
 714    is nonzero and C has bit 0x80 set: a lead byte 0xC0 or 0xC1 is merged
 715    with the next byte into one value; otherwise C is set to the negated
 716    character code and CODING_RESULT_INVALID_SRC is recorded.  */
 717 /* The caller must declare and set: src, src_end, src_base, multibytep, consumed_chars, coding.  */
 718 #define ONE_MORE_BYTE(c)                                \
 719   do {                                                  \
 720     if (src == src_end)                                 \
 721       {                                                 \
 722         if (src_base < src)                             \
 723           record_conversion_result                      \
 724             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 725         goto no_more_source;                            \
 726       }                                                 \
 727     c = *src++;                                         \
 728     if (multibytep && (c & 0x80))                       \
 729       {                                                 \
 730         if ((c & 0xFE) == 0xC0)                         \
 731           c = ((c & 1) << 6) | *src++;                  \
 732         else                                            \
 733           {                                             \
 734             src--;                                      \
 735             c = - string_char (src, &src, NULL);        \
 736             record_conversion_result                    \
 737               (coding, CODING_RESULT_INVALID_SRC);      \
 738           }                                             \
 739       }                                                 \
 740     consumed_chars++;                                   \
 741   } while (0)
 742
 743 /* Like ONE_MORE_BYTE, but without the SRC == SRC_END check; never jumps to `no_more_source'.  */
 744 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 745   do {                                                  \
 746     c = *src++;                                         \
 747     if (multibytep && (c & 0x80))                       \
 748       {                                                 \
 749         if ((c & 0xFE) == 0xC0)                         \
 750           c = ((c & 1) << 6) | *src++;                  \
 751         else                                            \
 752           {                                             \
 753             src--;                                      \
 754             c = - string_char (src, &src, NULL);        \
 755             record_conversion_result                    \
 756               (coding, CODING_RESULT_INVALID_SRC);      \
 757           }                                             \
 758       }                                                 \
 759     consumed_chars++;                                   \
 760   } while (0)
 761
 762
 763 /* Store a byte C in the place pointed to by DST, advance DST to the
 764    next free point, and increment PRODUCED_CHARS.  The caller should
 765    ensure that C is 0..127, and declare and set the variables `dst'
 766    and `produced_chars' appropriately in advance.
 767 */
 768
 769
 770 #define EMIT_ONE_ASCII_BYTE(c)  \
 771   do {                          \
 772     produced_chars++;           \
 773     *dst++ = (c);               \
 774   } while (0)
 775
 776
 777 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 778
 779 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 780   do {                                  \
 781     produced_chars += 2;                \
 782     *dst++ = (c1), *dst++ = (c2);       \
 783   } while (0)
 784
 785
 786 /* Store a byte C in the place pointed by DST and increment DST to the
 787    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 788    nonzero, store in an appropriate multibyte from.  The caller should
 789    declare and set the variables `dst' and `multibytep' appropriately
 790    in advance.  */
 791
 792 #define EMIT_ONE_BYTE(c)                \
 793   do {                                  \
 794     produced_chars++;                   \
 795     if (multibytep)                     \
 796       {                                 \
 797         int ch = (c);                   \
 798         if (ch >= 0x80)                 \
 799           ch = BYTE8_TO_CHAR (ch);      \
 800         CHAR_STRING_ADVANCE (ch, dst);  \
 801       }                                 \
 802     else                                \
 803       *dst++ = (c);                     \
 804   } while (0)
 805
 806
 807 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 808
 809 #define EMIT_TWO_BYTES(c1, c2)          \
 810   do {                                  \
 811     produced_chars += 2;                \
 812     if (multibytep)                     \
 813       {                                 \
 814         int ch;                         \
 815                                         \
 816         ch = (c1);                      \
 817         if (ch >= 0x80)                 \
 818           ch = BYTE8_TO_CHAR (ch);      \
 819         CHAR_STRING_ADVANCE (ch, dst);  \
 820         ch = (c2);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824       }                                 \
 825     else                                \
 826       {                                 \
 827         *dst++ = (c1);                  \
 828         *dst++ = (c2);                  \
 829       }                                 \
 830   } while (0)
 831
 832
/* Emit the three bytes C1, C2 and C3 in that order, by way of
   EMIT_ONE_BYTE followed by EMIT_TWO_BYTES.  The requirements on the
   caller are those of the two macros it expands to.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
 838
 839
/* Emit the four bytes C1, C2, C3 and C4 in that order, by way of two
   EMIT_TWO_BYTES invocations.  The requirements on the caller are
   those of EMIT_TWO_BYTES.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 845
 846
/* Prototypes for static functions.  */

/* Bookkeeping of the result of a conversion.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* Detector, decoder and encoder of each kind of coding system.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Refreshing the source and destination pointers, and enlarging the
   destination.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT, EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));

/* Remaining helpers.  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
                                        int, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_composition P_ ((struct coding_system *, int *,
                                            EMACS_INT));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 925
 926 static void
 927 record_conversion_result (struct coding_system *coding,
 928                           enum coding_result_code result)
 929 {
 930   coding->result = result;
 931   switch (result)
 932     {
 933     case CODING_RESULT_INSUFFICIENT_SRC:
 934       Vlast_code_conversion_error = Qinsufficient_source;
 935       break;
 936     case CODING_RESULT_INCONSISTENT_EOL:
 937       Vlast_code_conversion_error = Qinconsistent_eol;
 938       break;
 939     case CODING_RESULT_INVALID_SRC:
 940       Vlast_code_conversion_error = Qinvalid_source;
 941       break;
 942     case CODING_RESULT_INTERRUPT:
 943       Vlast_code_conversion_error = Qinterrupted;
 944       break;
 945     case CODING_RESULT_INSUFFICIENT_MEM:
 946       Vlast_code_conversion_error = Qinsufficient_memory;
 947       break;
 948     default:
 949       Vlast_code_conversion_error = intern ("Unknown error");
 950     }
 951 }
 952
/* Decode CODE in CHARSET with DECODE_CHAR and store the result in C.

   `charset_map_loaded' is cleared before the call.  If it is nonzero
   afterwards, CODING->source is recomputed with coding_set_source and
   SRC, SRC_BASE and SRC_END are all shifted by the distance the
   source moved, so that they keep pointing at the same source bytes.
   NOTE(review): presumably the flag is set when decoding had to load
   a charset map, which may relocate the source text -- confirm where
   charset_map_loaded is set.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *orig = coding->source;                          \
        EMACS_INT offset;                                                    \
                                                                             \
        coding_set_source (coding);                                          \
        offset = coding->source - orig;                                      \
        src += offset;                                                       \
        src_base += offset;                                                  \
        src_end += offset;                                                   \
      }                                                                      \
  } while (0)
 969
 970
 971 /* If there are at least BYTES length of room at dst, allocate memory
 972    for coding->destination and update dst and dst_end.  We don't have
 973    to take care of coding->source which will be relocated.  It is
 974    handled by calling coding_set_source in encode_coding.  */
 975
 976 #define ASSURE_DESTINATION(bytes)                               \
 977   do {                                                          \
 978     if (dst + (bytes) >= dst_end)                               \
 979       {                                                         \
 980         int more_bytes = charbuf_end - charbuf + (bytes);       \
 981                                                                 \
 982         dst = alloc_destination (coding, more_bytes, dst);      \
 983         dst_end = coding->destination + coding->dst_bytes;      \
 984       }                                                         \
 985   } while (0)
 986
 987
 988 /* Store multibyte form of the character C in P, and advance P to the
 989    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 990    never calls MAYBE_UNIFY_CHAR.  */
 991
 992 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 993   do {                                          \
 994     if ((c) <= MAX_1_BYTE_CHAR)                 \
 995       *(p)++ = (c);                             \
 996     else if ((c) <= MAX_2_BYTE_CHAR)            \
 997       *(p)++ = (0xC0 | ((c) >> 6)),             \
 998         *(p)++ = (0x80 | ((c) & 0x3F));         \
 999     else if ((c) <= MAX_3_BYTE_CHAR)            \
1000       *(p)++ = (0xE0 | ((c) >> 12)),            \
1001         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
1002         *(p)++ = (0x80 | ((c) & 0x3F));         \
1003     else if ((c) <= MAX_4_BYTE_CHAR)            \
1004       *(p)++ = (0xF0 | (c >> 18)),              \
1005         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
1006         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
1007         *(p)++ = (0x80 | (c & 0x3F));           \
1008     else if ((c) <= MAX_5_BYTE_CHAR)            \
1009       *(p)++ = 0xF8,                            \
1010         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
1011         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
1012         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
1013         *(p)++ = (0x80 | (c & 0x3F));           \
1014     else                                        \
1015       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
1016   } while (0)
1017
1018
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length of the form is taken from the first byte alone: bit 0x80
   clear means one byte, then bit 0x20 clear means two bytes, bit 0x10
   clear three, bit 0x08 clear four, and anything else five.  The
   trailing bytes are not validated.  A two-byte form whose first
   byte is below 0xC2 additionally gets 0x3FFF80 or'ed into the
   result.  In the five-byte form the first byte contributes no bits.

   P is evaluated many times and is modified, so it must be an lvalue
   without side effects.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (!((p)[0] & 0x80)                                             \
   ? *(p)++                                                     \
   : ! ((p)[0] & 0x20)                                          \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   : ! ((p)[0] & 0x10)                                          \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ! ((p)[0] & 0x08)                                          \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1047
1048
1049 static void
1050 coding_set_source (coding)
1051      struct coding_system *coding;
1052 {
1053   if (BUFFERP (coding->src_object))
1054     {
1055       struct buffer *buf = XBUFFER (coding->src_object);
1056
1057       if (coding->src_pos < 0)
1058         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1059       else
1060         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1061     }
1062   else if (STRINGP (coding->src_object))
1063     {
1064       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1065     }
1066   else
1067     /* Otherwise, the source is C string and is never relocated
1068        automatically.  Thus we don't have to update anything.  */
1069     ;
1070 }
1071
1072 static void
1073 coding_set_destination (coding)
1074      struct coding_system *coding;
1075 {
1076   if (BUFFERP (coding->dst_object))
1077     {
1078       if (coding->src_pos < 0)
1079         {
1080           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1081           coding->dst_bytes = (GAP_END_ADDR
1082                                - (coding->src_bytes - coding->consumed)
1083                                - coding->destination);
1084         }
1085       else
1086         {
1087           /* We are sure that coding->dst_pos_byte is before the gap
1088              of the buffer. */
1089           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1090                                  + coding->dst_pos_byte - BEG_BYTE);
1091           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1092                                - coding->destination);
1093         }
1094     }
1095   else
1096     /* Otherwise, the destination is C string and is never relocated
1097        automatically.  Thus we don't have to update anything.  */
1098     ;
1099 }
1100
1101
1102 static void
1103 coding_alloc_by_realloc (coding, bytes)
1104      struct coding_system *coding;
1105      EMACS_INT bytes;
1106 {
1107   coding->destination = (unsigned char *) xrealloc (coding->destination,
1108                                                     coding->dst_bytes + bytes);
1109   coding->dst_bytes += bytes;
1110 }
1111
1112 static void
1113 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1114      struct coding_system *coding;
1115      EMACS_INT gap_head_used, bytes;
1116 {
1117   if (EQ (coding->src_object, coding->dst_object))
1118     {
1119       /* The gap may contain the produced data at the head and not-yet
1120          consumed data at the tail.  To preserve those data, we at
1121          first make the gap size to zero, then increase the gap
1122          size.  */
1123       EMACS_INT add = GAP_SIZE;
1124
1125       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1126       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1127       make_gap (bytes);
1128       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1129       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1130     }
1131   else
1132     {
1133       Lisp_Object this_buffer;
1134
1135       this_buffer = Fcurrent_buffer ();
1136       set_buffer_internal (XBUFFER (coding->dst_object));
1137       make_gap (bytes);
1138       set_buffer_internal (XBUFFER (this_buffer));
1139     }
1140 }
1141
1142
1143 static unsigned char *
1144 alloc_destination (coding, nbytes, dst)
1145      struct coding_system *coding;
1146      EMACS_INT nbytes;
1147      unsigned char *dst;
1148 {
1149   EMACS_INT offset = dst - coding->destination;
1150
1151   if (BUFFERP (coding->dst_object))
1152     {
1153       struct buffer *buf = XBUFFER (coding->dst_object);
1154
1155       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1156     }
1157   else
1158     coding_alloc_by_realloc (coding, nbytes);
1159   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1160   coding_set_destination (coding);
1161   dst = coding->destination + offset;
1162   return dst;
1163 }
1164
/** Macros for annotations.  */

/* Maximum length of annotation data (sum of annotations for
   composition and charset).
   NOTE(review): the terms presumably stand for a 4-element
   composition annotation header, up to
   MAX_COMPOSITION_COMPONENTS * 2 - 1 components and rules, and a
   4-element charset annotation -- confirm against the code that
   produces the annotations.  */
#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1170
1171 /* An annotation data is stored in the array coding->charbuf in this
1172    format:
1173      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1174    LENGTH is the number of elements in the annotation.
1175    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1176    NCHARS is the number of characters in the text annotated.
1177
1178    The format of the following elements depend on ANNOTATION_MASK.
1179
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
1186
1187    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1188    follows.  */
1189
/* Append the three leading elements of an annotation to BUF, namely
   -LEN, MASK and NCHARS, advancing BUF past them, and set
   coding->annotated.  The caller must have `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1197
/* Append a composition annotation to BUF: the annotation header
   written by ADD_ANNOTATION_DATA with length 4 and
   CODING_ANNOTATE_COMPOSITION_MASK, followed by METHOD.  BUF is
   advanced past all four elements.  BUF and METHOD are parenthesized
   so that an argument such as `*bufp' advances the intended
   pointer.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1203
1204
/* Append a charset annotation to BUF: the annotation header written
   by ADD_ANNOTATION_DATA with length 4 and
   CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.  BUF is
   advanced past all four elements.  BUF and ID are parenthesized so
   that an argument such as `*bufp' advances the intended pointer.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1210
1211 \f
1212 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1213
1214
1215
1216 \f
1217 /*** 3. UTF-8 ***/
1218
1219 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1220    Check if a text is encoded in UTF-8.  If it is, return 1, else
1221    return 0.  */
1222
/* Classification of one octet C of a UTF-8 byte sequence.  The first
   predicate accepts an octet that is a whole character by itself,
   the second a trailing octet, and the rest the leading octet of a
   sequence of the stated length.  */
#define UTF_8_1_OCTET_P(c)         ((c) <= 0x7F)
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == ((c) & 0xC0))
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))

/* The byte order mark as a character code, and the three octets of
   its UTF-8 encoding.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1234
/* Check whether the source of CODING may be UTF-8 and record the
   verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
   scan starts after the first CODING->head_ascii bytes and accepts
   sequences of one to five octets made of a leading octet and the
   right number of trailing octets.  On the first malformed sequence
   the category is rejected and 0 is returned.

   The label no_more_source has no explicit goto in this function, so
   it is reached through ONE_MORE_BYTE (defined earlier in this file)
   when the source is used up.  There, a sequence cut short in the
   last block rejects the category and returns 0.  Otherwise 1 is
   returned after recording whether the text starts with the UTF-8
   signature and whether any multi-octet sequence was seen.  */

static int
detect_coding_utf_8 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int bom_found = 0;
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts; no_more_source compares
         it with src to notice a sequence that was cut short.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a signature at the very beginning of the source
             counts as a BOM.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      break;
    }
  /* The loop is left with `break' only on a malformed sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* A sequence that is still incomplete at the end of the last block
     can never be completed.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* The first character U+FEFF doesn't necessarily mean a BOM.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1318
1319
1320 static void
1321 decode_coding_utf_8 (coding)
1322      struct coding_system *coding;
1323 {
1324   const unsigned char *src = coding->source + coding->consumed;
1325   const unsigned char *src_end = coding->source + coding->src_bytes;
1326   const unsigned char *src_base;
1327   int *charbuf = coding->charbuf + coding->charbuf_used;
1328   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1329   int consumed_chars = 0, consumed_chars_base;
1330   int multibytep = coding->src_multibyte;
1331   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1332   Lisp_Object attr, charset_list;
1333   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1334   int byte_after_cr = -1;
1335
1336   CODING_GET_INFO (coding, attr, charset_list);
1337
1338   if (bom != utf_without_bom)
1339     {
1340       int c1, c2, c3;
1341
1342       src_base = src;
1343       ONE_MORE_BYTE (c1);
1344       if (! UTF_8_3_OCTET_LEADING_P (c1))
1345         src = src_base;
1346       else
1347         {
1348           ONE_MORE_BYTE (c2);
1349           if (! UTF_8_EXTRA_OCTET_P (c2))
1350             src = src_base;
1351           else
1352             {
1353               ONE_MORE_BYTE (c3);
1354               if (! UTF_8_EXTRA_OCTET_P (c3))
1355                 src = src_base;
1356               else
1357                 {
1358                   if ((c1 != UTF_8_BOM_1)
1359                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1360                     src = src_base;
1361                   else
1362                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1363                 }
1364             }
1365         }
1366     }
1367   CODING_UTF_8_BOM (coding) = utf_without_bom;
1368
1369
1370
1371   while (1)
1372     {
1373       int c, c1, c2, c3, c4, c5;
1374
1375       src_base = src;
1376       consumed_chars_base = consumed_chars;
1377
1378       if (charbuf >= charbuf_end)
1379         break;
1380
1381       if (byte_after_cr >= 0)
1382         c1 = byte_after_cr, byte_after_cr = -1;
1383       else
1384         ONE_MORE_BYTE (c1);
1385       if (c1 < 0)
1386         {
1387           c = - c1;
1388         }
1389       else if (UTF_8_1_OCTET_P(c1))
1390         {
1391           if (eol_crlf && c1 == '\r')
1392             ONE_MORE_BYTE (byte_after_cr);
1393           c = c1;
1394         }
1395       else
1396         {
1397           ONE_MORE_BYTE (c2);
1398           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1399             goto invalid_code;
1400           if (UTF_8_2_OCTET_LEADING_P (c1))
1401             {
1402               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1403               /* Reject overlong sequences here and below.  Encoders
1404                  producing them are incorrect, they can be misleading,
1405                  and they mess up read/write invariance.  */
1406               if (c < 128)
1407                 goto invalid_code;
1408             }
1409           else
1410             {
1411               ONE_MORE_BYTE (c3);
1412               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1413                 goto invalid_code;
1414               if (UTF_8_3_OCTET_LEADING_P (c1))
1415                 {
1416                   c = (((c1 & 0xF) << 12)
1417                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1418                   if (c < 0x800
1419                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1420                     goto invalid_code;
1421                 }
1422               else
1423                 {
1424                   ONE_MORE_BYTE (c4);
1425                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1426                     goto invalid_code;
1427                   if (UTF_8_4_OCTET_LEADING_P (c1))
1428                     {
1429                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1430                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1431                     if (c < 0x10000)
1432                       goto invalid_code;
1433                     }
1434                   else
1435                     {
1436                       ONE_MORE_BYTE (c5);
1437                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1438                         goto invalid_code;
1439                       if (UTF_8_5_OCTET_LEADING_P (c1))
1440                         {
1441                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1442                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1443                                | (c5 & 0x3F));
1444                           if ((c > MAX_CHAR) || (c < 0x200000))
1445                             goto invalid_code;
1446                         }
1447                       else
1448                         goto invalid_code;
1449                     }
1450                 }
1451             }
1452         }
1453
1454       *charbuf++ = c;
1455       continue;
1456
1457     invalid_code:
1458       src = src_base;
1459       consumed_chars = consumed_chars_base;
1460       ONE_MORE_BYTE (c);
1461       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1462       coding->errors++;
1463     }
1464
1465  no_more_source:
1466   coding->consumed_char += consumed_chars_base;
1467   coding->consumed = src_base - coding->source;
1468   coding->charbuf_used = charbuf - coding->charbuf;
1469 }
1470
1471
1472 static int
1473 encode_coding_utf_8 (coding)
1474      struct coding_system *coding;
1475 {
1476   int multibytep = coding->dst_multibyte;
1477   int *charbuf = coding->charbuf;
1478   int *charbuf_end = charbuf + coding->charbuf_used;
1479   unsigned char *dst = coding->destination + coding->produced;
1480   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1481   int produced_chars = 0;
1482   int c;
1483
1484   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1485     {
1486       ASSURE_DESTINATION (3);
1487       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1488       CODING_UTF_8_BOM (coding) = utf_without_bom;
1489     }
1490
1491   if (multibytep)
1492     {
1493       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1494
1495       while (charbuf < charbuf_end)
1496         {
1497           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1498
1499           ASSURE_DESTINATION (safe_room);
1500           c = *charbuf++;
1501           if (CHAR_BYTE8_P (c))
1502             {
1503               c = CHAR_TO_BYTE8 (c);
1504               EMIT_ONE_BYTE (c);
1505             }
1506           else
1507             {
1508               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1509               for (p = str; p < pend; p++)
1510                 EMIT_ONE_BYTE (*p);
1511             }
1512         }
1513     }
1514   else
1515     {
1516       int safe_room = MAX_MULTIBYTE_LENGTH;
1517
1518       while (charbuf < charbuf_end)
1519         {
1520           ASSURE_DESTINATION (safe_room);
1521           c = *charbuf++;
1522           if (CHAR_BYTE8_P (c))
1523             *dst++ = CHAR_TO_BYTE8 (c);
1524           else
1525             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1526           produced_chars++;
1527         }
1528     }
1529   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1530   coding->produced_char += produced_chars;
1531   coding->produced = dst - coding->destination;
1532   return 0;
1533 }
1534
1535
1536 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1537    Check if a text is encoded in one of UTF-16 based coding systems.
1538    If it is, return 1, else return 0.  */
1539
/* Nonzero if the 16-bit code unit VAL is a high surrogate
   (0xD800..0xDBFF).  */
#define UTF_16_HIGH_SURROGATE_P(val)    (0xD800 == ((val) & 0xFC00))

/* Nonzero if the 16-bit code unit VAL is a low surrogate
   (0xDC00..0xDFFF).  */
#define UTF_16_LOW_SURROGATE_P(val)     (0xDC00 == ((val) & 0xFC00))

/* Nonzero if VAL is 0xFFFE, 0xFFFF or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  ((0xFFFE == (val))                    \
   || (0xFFFF == (val))                 \
   || UTF_16_LOW_SURROGATE_P (val))
1550
1551
     /* Check whether the source text of CODING may be encoded in one of
        the UTF-16 based coding systems, and record the verdict in
        DETECT_INFO (bit masks `checked', `found' and `rejected').

        If the text starts with a byte order mark, the matching
        category (little or big endian, plus the auto-detecting one) is
        marked as found and the remaining signature/no-signature
        categories are rejected.  Otherwise the categories that require
        a signature are rejected and the bytes are scanned in pairs,
        counting how many distinct byte values occur at even and at odd
        positions; if either count reaches 128 the text is taken to be
        binary data and every UTF-16 category is rejected.

        Return 1 if the text may be UTF-16, 0 if it was rejected.  */
1552 static int
1553 detect_coding_utf_16 (coding, detect_info)
1554      struct coding_system *coding;
1555      struct coding_detection_info *detect_info;
1556 {
1557   const unsigned char *src = coding->source, *src_base = src;
1558   const unsigned char *src_end = coding->source + coding->src_bytes;
1559   int multibytep = coding->src_multibyte;
1560   int consumed_chars = 0;
1561   int c1, c2;
1562
1563   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* A complete text with an odd length can not consist of 16-bit
          units.  */
1564   if (coding->mode & CODING_MODE_LAST_BLOCK
1565       && (coding->src_chars & 1))
1566     {
1567       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1568       return 0;
1569     }
1570
       /* ONE_MORE_BYTE presumably jumps to `no_more_source' when the
          input is exhausted -- defined outside this section.  */
1571   ONE_MORE_BYTE (c1);
1572   ONE_MORE_BYTE (c2);
1573   if ((c1 == 0xFF) && (c2 == 0xFE))
1574     {
           /* Little endian byte order mark.  */
1575       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1576                              | CATEGORY_MASK_UTF_16_AUTO);
1577       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1578                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1579                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1580     }
1581   else if ((c1 == 0xFE) && (c2 == 0xFF))
1582     {
           /* Big endian byte order mark.  */
1583       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1584                              | CATEGORY_MASK_UTF_16_AUTO);
1585       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1586                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1587                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1588     }
1589   else
1590     {
1591       /* We check the dispersion of Eth and Oth bytes where E is even and
1592          O is odd.  If both are high, we assume binary data.*/
           /* E[B] / O[B] is 1 once byte value B has been seen at an
              even / odd position; E_NUM / O_NUM count the distinct
              values seen so far.  NOTE(review): other code in this file
              tests the result of ONE_MORE_BYTE for a negative value; if
              that can happen here, C1/C2 would index outside E/O --
              confirm against the macro definition.  */
1593       unsigned char e[256], o[256];
1594       unsigned e_num = 1, o_num = 1;
1595
1596       memset (e, 0, 256);
1597       memset (o, 0, 256);
1598       e[c1] = 1;
1599       o[c2] = 1;
1600
           /* No signature, so the with-signature categories are out.  */
1601       detect_info->rejected
1602         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1603
1604       while (1)
1605         {
1606           ONE_MORE_BYTE (c1);
1607           ONE_MORE_BYTE (c2);
1608           if (! e[c1])
1609             {
1610               e[c1] = 1;
1611               e_num++;
1612               if (e_num >= 128)
1613                 break;
1614             }
1615           if (! o[c2])
1616             {
                   /* Mark the odd-position byte C2 itself.  Marking
                      O[C1] here would leave O[C2] clear, so the same
                      odd byte value would be counted again on every
                      pair and ordinary text would soon be rejected.  */
1617               o[c2] = 1;
1618               o_num++;
1619               if (o_num >= 128)
1620                 break;
1621             }
1622         }
           /* Reached only via `break': too many distinct byte values.  */
1623       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1624       return 0;
1625     }
1626
1627  no_more_source:
1628   return 1;
1629 }
1630
     /* Decode the UTF-16 bytes of CODING->source, starting at offset
        CODING->consumed, into character codes appended to
        CODING->charbuf.  The byte order and BOM policy come from the
        coding system; a pending high surrogate is kept in the coding
        system between calls.  On exit CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated to
        the position of the last completely processed code unit.  */
1631 static void
1632 decode_coding_utf_16 (coding)
1633      struct coding_system *coding;
1634 {
1635   const unsigned char *src = coding->source + coding->consumed;
1636   const unsigned char *src_end = coding->source + coding->src_bytes;
1637   const unsigned char *src_base;
1638   int *charbuf = coding->charbuf + coding->charbuf_used;
1639   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1640   int consumed_chars = 0, consumed_chars_base;
1641   int multibytep = coding->src_multibyte;
1642   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1643   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1644   int surrogate = CODING_UTF_16_SURROGATE (coding);
1645   Lisp_Object attr, charset_list;
1646   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Bytes read ahead after a CR when decoding CRLF text; -1 means
          nothing is pending.  */
1647   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1648
1649   CODING_GET_INFO (coding, attr, charset_list);
1650
1651   if (bom == utf_with_bom)
1652     {
1653       int c, c1, c2;
1654
1655       src_base = src;
1656       ONE_MORE_BYTE (c1);
1657       ONE_MORE_BYTE (c2);
           /* Always combined in big endian order, hence the two
              different expected values below.  */
1658       c = (c1 << 8) | c2;
1659
1660       if (endian == utf_16_big_endian
1661           ? c != 0xFEFF : c != 0xFFFE)
1662         {
1663           /* The first two bytes are not BOM.  Treat them as bytes
1664              for a normal character.  */
1665           src = src_base;
1666           coding->errors++;
1667         }
           /* Do not look for a BOM again on later calls.  */
1668       CODING_UTF_16_BOM (coding) = utf_without_bom;
1669     }
1670   else if (bom == utf_detect_bom)
1671     {
1672       /* We have already tried to detect BOM and failed in
1673          detect_coding.  */
1674       CODING_UTF_16_BOM (coding) = utf_without_bom;
1675     }
1676
1677   while (1)
1678     {
1679       int c, c1, c2;
1680
           /* Remember where this code unit starts so that an incomplete
              unit is not reported as consumed.  */
1681       src_base = src;
1682       consumed_chars_base = consumed_chars;
1683
           /* Stop while there is still room: one iteration may store
              more than one element.  */
1684       if (charbuf + 2 >= charbuf_end)
1685         break;
1686
1687       if (byte_after_cr1 >= 0)
1688         c1 = byte_after_cr1, byte_after_cr1 = -1;
1689       else
1690         ONE_MORE_BYTE (c1);
           /* A negative value is stored negated as a character.
              Presumably ONE_MORE_BYTE yields -C for a multibyte
              character C found in the source -- TODO confirm against
              the macro definition.  */
1691       if (c1 < 0)
1692         {
1693           *charbuf++ = -c1;
1694           continue;
1695         }
1696       if (byte_after_cr2 >= 0)
1697         c2 = byte_after_cr2, byte_after_cr2 = -1;
1698       else
1699         ONE_MORE_BYTE (c2);
1700       if (c2 < 0)
1701         {
               /* The first byte has no partner; emit it on its own,
                  followed by the negated value.  */
1702           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1703           *charbuf++ = -c2;
1704           continue;
1705         }
1706       c = (endian == utf_16_big_endian
1707            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1708
1709       if (surrogate)
1710         {
               /* A high surrogate is pending from an earlier unit.  */
1711           if (! UTF_16_LOW_SURROGATE_P (c))
1712             {
                   /* Unpaired high surrogate: emit its two bytes in
                      source order and count an error.  */
1713               if (endian == utf_16_big_endian)
1714                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1715               else
1716                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1717               *charbuf++ = c1;
1718               *charbuf++ = c2;
1719               coding->errors++;
1720               if (UTF_16_HIGH_SURROGATE_P (c))
1721                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1722               else
1723                 *charbuf++ = c;
1724             }
1725           else
1726             {
                   /* Combine the pair into a code point above the
                      BMP.  */
1727               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1728               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1729               *charbuf++ = 0x10000 + c;
1730             }
1731         }
1732       else
1733         {
1734           if (UTF_16_HIGH_SURROGATE_P (c))
1735             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1736           else
1737             {
1738               if (eol_crlf && c == '\r')
1739                 {
                       /* Read the next unit before storing the CR.  If
                          the source ends here the CR has not been
                          stored and, since src_base still points at
                          it, it is not reported as consumed.  */
1740                   ONE_MORE_BYTE (byte_after_cr1);
1741                   ONE_MORE_BYTE (byte_after_cr2);
1742                 }
1743               *charbuf++ = c;
1744             }
1745         }
1746     }
1747
1748  no_more_source:
1749   coding->consumed_char += consumed_chars_base;
1750   coding->consumed = src_base - coding->source;
1751   coding->charbuf_used = charbuf - coding->charbuf;
1752 }
1753
     /* Encode the character codes in CODING->charbuf as UTF-16 and
        append the bytes to CODING->destination.  A byte order mark is
        written first unless the coding system says `utf_without_bom';
        after that the coding system is switched to `utf_without_bom'
        so the mark is written only once.  Updates CODING->produced and
        CODING->produced_char, records CODING_RESULT_SUCCESS and
        returns 0.  */
1754 static int
1755 encode_coding_utf_16 (coding)
1756      struct coding_system *coding;
1757 {
1758   int multibytep = coding->dst_multibyte;
1759   int *charbuf = coding->charbuf;
1760   int *charbuf_end = charbuf + coding->charbuf_used;
1761   unsigned char *dst = coding->destination + coding->produced;
1762   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each write; presumably
          the number of free destination bytes to guarantee -- the
          macro is defined outside this section.  */
1763   int safe_room = 8;
1764   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1765   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1766   int produced_chars = 0;
1767   Lisp_Object attrs, charset_list;
1768   int c;
1769
1770   CODING_GET_INFO (coding, attrs, charset_list);
1771
1772   if (bom != utf_without_bom)
1773     {
1774       ASSURE_DESTINATION (safe_room);
1775       if (big_endian)
1776         EMIT_TWO_BYTES (0xFE, 0xFF);
1777       else
1778         EMIT_TWO_BYTES (0xFF, 0xFE);
1779       CODING_UTF_16_BOM (coding) = utf_without_bom;
1780     }
1781
1782   while (charbuf < charbuf_end)
1783     {
1784       ASSURE_DESTINATION (safe_room);
1785       c = *charbuf++;
           /* Characters that can not be encoded are replaced by the
              default character.  NOTE(review): `>=' also replaces
              MAX_UNICODE_CHAR itself -- confirm that is intended.  */
1786       if (c >= MAX_UNICODE_CHAR)
1787         c = coding->default_char;
1788
1789       if (c < 0x10000)
1790         {
               /* One 16-bit unit.  */
1791           if (big_endian)
1792             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1793           else
1794             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1795         }
1796       else
1797         {
               /* Surrogate pair: C1 is the high surrogate, C2 the low
                  one.  */
1798           int c1, c2;
1799
1800           c -= 0x10000;
1801           c1 = (c >> 10) + 0xD800;
1802           c2 = (c & 0x3FF) + 0xDC00;
1803           if (big_endian)
1804             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1805           else
1806             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1807         }
1808     }
1809   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1810   coding->produced = dst - coding->destination;
1811   coding->produced_char += produced_chars;
1812   return 0;
1813 }
1814
1815 \f
1816 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1817
1818 /* Emacs' internal format for representation of multiple character
1819    sets is a kind of multi-byte encoding, i.e. characters are
1820    represented by variable-length sequences of one-byte codes.
1821
1822    ASCII characters and control characters (e.g. `tab', `newline') are
1823    represented by one-byte sequences which are their ASCII codes, in
1824    the range 0x00 through 0x7F.
1825
1826    8-bit characters of the range 0x80..0x9F are represented by
1827    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1828    code + 0x20).
1829
1830    8-bit characters of the range 0xA0..0xFF are represented by
1831    one-byte sequences which are their 8-bit code.
1832
1833    The other characters are represented by a sequence of `base
1834    leading-code', optional `extended leading-code', and one or two
1835    `position-code's.  The length of the sequence is determined by the
1836    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1837    whereas extended leading-code and position-code take the range 0xA0
1838    through 0xFF.  See `charset.h' for more details about leading-code
1839    and position-code.
1840
1841    --- CODE RANGE of Emacs' internal format ---
1842    character set        range
1843    -------------        -----
1844    ascii                0x00..0x7F
1845    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1846    eight-bit-graphic    0xA0..0xFF
1847    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1848    ---------------------------------------------
1849
1850    As this is the internal character representation, the format is
1851    usually not used externally (i.e. in a file or in a data sent to a
1852    process).  But, it is possible to have a text externally in this
1853    format (i.e. by encoding by the coding system `emacs-mule').
1854
1855    In that case, a sequence of one-byte codes has a slightly different
1856    form.
1857
1858    At first, all characters in eight-bit-control are represented by
1859    one-byte sequences which are their 8-bit code.
1860
1861    Next, character composition data are represented by the byte
1862    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1863    where,
1864         METHOD is 0xF0 plus one of composition method (enum
1865         composition_method),
1866
1867         BYTES is 0xA0 plus a byte length of this composition data,
1868
1869         CHARS is 0x20 plus a number of characters composed by this
1870         data,
1871
1872         COMPONENTs are characters in multibyte form or composition
1873         rules encoded by two bytes of ASCII codes.
1874
1875    In addition, for backward compatibility, the following formats are
1876    also recognized as composition data on decoding.
1877
1878    0x80 MSEQ ...
1879    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1880
1881    Here,
1882         MSEQ is a multibyte form but in these special format:
1883           ASCII: 0xA0 ASCII_CODE+0x80,
1884           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1885         RULE is a one byte code of the range 0xA0..0xF0 that
1886         represents a composition rule.
1887   */
1888
     /* Table indexed by a byte value.  The code in this section uses
        the entry for a leading byte as the total number of bytes (1 to
        4) of the emacs-mule sequence that the byte starts.  The table
        is filled in outside this section.  */
1889 char emacs_mule_bytes[256];
1890
     /* Decode one character in emacs-mule format at SRC, which must
        point into the source text of CODING.

        On success return the character code, store in *NBYTES the
        number of bytes the sequence occupies, in *NCHARS the value
        that ONE_MORE_BYTE accumulated in `consumed_chars', and, if ID
        is not NULL, in *ID the id of the charset the character belongs
        to.

        Return -1 if the byte sequence is invalid.  Return -2 via the
        `no_more_source' label, which is presumably where ONE_MORE_BYTE
        jumps when the source ends in the middle of a sequence.  In
        both failure cases the output arguments are left untouched.  */
1891 int
1892 emacs_mule_char (coding, src, nbytes, nchars, id)
1893      struct coding_system *coding;
1894      const unsigned char *src;
1895      int *nbytes, *nchars, *id;
1896 {
1897   const unsigned char *src_end = coding->source + coding->src_bytes;
1898   const unsigned char *src_base = src;
1899   int multibytep = coding->src_multibyte;
1900   struct charset *charset;
1901   unsigned code;
1902   int c;
1903   int consumed_chars = 0;
1904
1905   ONE_MORE_BYTE (c);
1906   if (c < 0)
1907     {
           /* A negative value is returned negated as the character,
              with the charset of table slot 0.  */
1908       c = -c;
1909       charset = emacs_mule_charset[0];
1910     }
1911   else
1912     {
1913       if (c >= 0xA0)
1914         {
1915           /* Old style component character of a composition.  */
               /* 0xA0 introduces an ASCII character stored with 0x80
                  added; any other leading code is stored with 0x20
                  added.  */
1916           if (c == 0xA0)
1917             {
1918               ONE_MORE_BYTE (c);
1919               c -= 0x80;
1920             }
1921           else
1922             c -= 0x20;
1923         }
1924
           /* Dispatch on the total length of the sequence led by C.  */
1925       switch (emacs_mule_bytes[c])
1926         {
1927         case 2:
               /* Leading code + one position code.  */
1928           if (! (charset = emacs_mule_charset[c]))
1929             goto invalid_code;
1930           ONE_MORE_BYTE (c);
1931           if (c < 0xA0)
1932             goto invalid_code;
1933           code = c & 0x7F;
1934           break;
1935
1936         case 3:
1937           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1938               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1939             {
                   /* Private leading code + extended leading code +
                      one position code.  */
1940               ONE_MORE_BYTE (c);
1941               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1942                 goto invalid_code;
1943               ONE_MORE_BYTE (c);
1944               if (c < 0xA0)
1945                 goto invalid_code;
1946               code = c & 0x7F;
1947             }
1948           else
1949             {
                   /* Leading code + two position codes.  */
1950               if (! (charset = emacs_mule_charset[c]))
1951                 goto invalid_code;
1952               ONE_MORE_BYTE (c);
1953               if (c < 0xA0)
1954                 goto invalid_code;
1955               code = (c & 0x7F) << 8;
1956               ONE_MORE_BYTE (c);
1957               if (c < 0xA0)
1958                 goto invalid_code;
1959               code |= c & 0x7F;
1960             }
1961           break;
1962
1963         case 4:
               /* Leading code + a byte selecting the charset + two
                  position codes.  */
1964           ONE_MORE_BYTE (c);
1965           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1966             goto invalid_code;
1967           ONE_MORE_BYTE (c);
1968           if (c < 0xA0)
1969             goto invalid_code;
1970           code = (c & 0x7F) << 8;
1971           ONE_MORE_BYTE (c);
1972           if (c < 0xA0)
1973             goto invalid_code;
1974           code |= c & 0x7F;
1975           break;
1976
1977         case 1:
               /* A single byte: ASCII or a raw eight-bit byte.  */
1978           code = c;
1979           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1980                                      ? charset_ascii : charset_eight_bit);
1981           break;
1982
1983         default:
               /* The table holds no other length.  */
1984           abort ();
1985         }
1986       c = DECODE_CHAR (charset, code);
1987       if (c < 0)
1988         goto invalid_code;
1989     }
1990   *nbytes = src - src_base;
1991   *nchars = consumed_chars;
1992   if (id)
1993     *id = charset->id;
1994   return c;
1995
1996  no_more_source:
1997   return -2;
1998
1999  invalid_code:
2000   return -1;
2001 }
2002
2003
2004 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2005    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
2006    else return 0.  The verdict is also recorded in DETECT_INFO: the
        category is always marked as checked, marked as rejected when 0
        is returned, and marked as found when 1 is returned and at
        least one multibyte sequence or composition was seen.  */
2007
2008 static int
2009 detect_coding_emacs_mule (coding, detect_info)
2010      struct coding_system *coding;
2011      struct coding_detection_info *detect_info;
2012 {
2013   const unsigned char *src = coding->source, *src_base;
2014   const unsigned char *src_end = coding->source + coding->src_bytes;
2015   int multibytep = coding->src_multibyte;
2016   int consumed_chars = 0;
2017   int c;
2018   int found = 0;
2019
2020   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
2021   /* A coding system of this category is always ASCII compatible.  */
2022   src += coding->head_ascii;
2023
2024   while (1)
2025     {
           /* Start of the sequence being examined; compared with SRC
              at `no_more_source' to notice a truncated sequence.  */
2026       src_base = src;
2027       ONE_MORE_BYTE (c);
2028       if (c < 0)
2029         continue;
2030       if (c == 0x80)
2031         {
2032           /* Perhaps the start of composite character.  We simply skip
2033              it because analyzing it is too heavy for detecting.  But,
2034              at least, we check that the composite character
2035              consists of more than 4 bytes.  */
               /* NOTE(review): this declaration shadows the outer
                  SRC_BASE, so the outer one still points at the 0x80
                  byte while the composition is being skipped.  */
2036           const unsigned char *src_base;
2037
2038         repeat:
2039           src_base = src;
2040           do
2041             {
2042               ONE_MORE_BYTE (c);
2043             }
2044           while (c >= 0xA0);
2045
2046           if (src - src_base <= 4)
2047             break;
2048           found = CATEGORY_MASK_EMACS_MULE;
2049           if (c == 0x80)
2050             goto repeat;
2051         }
2052
2053       if (c < 0x80)
2054         {
               /* These control codes make the text be rejected as
                  emacs-mule.  */
2055           if (c < 0x20
2056               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2057             break;
2058         }
2059       else
2060         {
               /* Number of bytes that must follow the leading byte.  */
2061           int more_bytes = emacs_mule_bytes[*src_base] - 1;
2062
2063           while (more_bytes > 0)
2064             {
2065               ONE_MORE_BYTE (c);
2066               if (c < 0xA0)
2067                 {
2068                   src--;        /* Unread the last byte.  */
2069                   break;
2070                 }
2071               more_bytes--;
2072             }
               /* Too few following bytes: not a valid sequence.  */
2073           if (more_bytes != 0)
2074             break;
2075           found = CATEGORY_MASK_EMACS_MULE;
2076         }
2077     }
       /* Reached only via `break' from the loop above.  */
2078   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2079   return 0;
2080
2081  no_more_source:
       /* The source ended in the middle of a sequence.  That is an
          error only if no more text will follow.  */
2082   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2083     {
2084       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2085       return 0;
2086     }
2087   detect_info->found |= found;
2088   return 1;
2089 }
2090
2091
2092 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2093
2094 /* Decode a character represented as a component of composition
2095    sequence of Emacs 20/21 style at SRC by calling emacs_mule_char.
2096    On success store the character in *BUF, increment BUF, advance SRC
2097    to the head of the next character (or an encoded composition rule)
2098    and add the consumed count to `consumed_chars'.  If SRC is at
2099    SRC_END, or emacs_mule_char returns -2, do nothing.  For any other
        negative result jump to the label `invalid_code'.

        The macro uses the variables `coding', `src', `src_end' and
        `consumed_chars' of the expanding function.  Note that the
        `break' statements leave only this macro's own do-while.  */
2100
2101 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
2102   do                                                            \
2103     {                                                           \
2104       int c;                                                    \
2105       int nbytes, nchars;                                       \
2106                                                                 \
2107       if (src == src_end)                                       \
2108         break;                                                  \
2109       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
2110       if (c < 0)                                                \
2111         {                                                       \
2112           if (c == -2)                                          \
2113             break;                                              \
2114           goto invalid_code;                                    \
2115         }                                                       \
2116       *buf++ = c;                                               \
2117       src += nbytes;                                            \
2118       consumed_chars += nchars;                                 \
2119     }                                                           \
2120   while (0)
2121
2122
2123 /* Decode a composition rule represented as a component of composition
2124    sequence of Emacs 20 style at SRC.  The rule is a single byte whose
2125    value minus 0xA0 must be in the range 0..80; it is split into
2126    GREF (quotient by 9) and NREF (remainder), and the encoded rule is
        stored in *BUF, incrementing BUF.  If SRC is at SRC_END or the
        byte is out of range, jump to the label `invalid_code'.  */
2127
2128 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
2129   do {                                                  \
2130     int c, gref, nref;                                  \
2131                                                         \
2132     if (src >= src_end)                                 \
2133       goto invalid_code;                                \
2134     ONE_MORE_BYTE_NO_CHECK (c);                         \
2135     c -= 0xA0;                                          \
2136     if (c < 0 || c >= 81)                               \
2137       goto invalid_code;                                \
2138                                                         \
2139     gref = c / 9, nref = c % 9;                         \
2140     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
2141   } while (0)
2142
2143
2144 /* Decode a composition rule represented as a component of composition
2145    sequence of Emacs 21 style at SRC.  The rule occupies two bytes,
2146    GREF + 0x20 and NREF + 0x20; the encoded rule is stored in *BUF and
2147    BUF is incremented.  If fewer than two bytes remain before SRC_END,
        or either value is outside the range 0..80, jump to the label
        `invalid_code'.  */
2148
2149 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
2150   do {                                                  \
2151     int gref, nref;                                     \
2152                                                         \
2153     if (src + 1>= src_end)                              \
2154       goto invalid_code;                                \
2155     ONE_MORE_BYTE_NO_CHECK (gref);                      \
2156     gref -= 0x20;                                       \
2157     ONE_MORE_BYTE_NO_CHECK (nref);                      \
2158     nref -= 0x20;                                       \
2159     if (gref < 0 || gref >= 81                          \
2160         || nref < 0 || nref >= 81)                      \
2161       goto invalid_code;                                \
2162     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
2163   } while (0)
2164
2165
     /* Decode the header, and for methods other than
        COMPOSITION_RELATIVE also the components, of an Emacs 21 style
        composition.  C is the METHOD byte that followed 0x80; the
        macro reads the BYTES and CHARS bytes itself and overwrites C.
        BYTES - 0xA0 must be at least 3.  The composition annotation is
        added to `charbuf' by ADD_COMPOSITION_DATA.  Components are
        then decoded until `consumed_chars' reaches
        `consumed_chars_base' + BYTES - 0xA0; components at odd
        positions are rules unless the method is
        COMPOSITION_WITH_ALTCHARS.  Finally the number of decoded
        components is subtracted from the first annotation element.
        Malformed input jumps to the label `invalid_code'.

        NOTE(review): the test after the `while' loop repeats the loop
        condition, so it looks like it can never be true; and since
        DECODE_EMACS_MULE_COMPOSITION_CHAR does nothing at the end of
        the source, the loop may not make progress there -- verify.  */
2166 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
2167   do {                                                                  \
2168     /* Emacs 21 style format.  The first three bytes at SRC are         \
2169        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
2170        the byte length of this composition information, CHARS is the    \
2171        number of characters composed by this composition.  */           \
2172     enum composition_method method = c - 0xF2;                          \
2173     int *charbuf_base = charbuf;                                        \
2174     int consumed_chars_limit;                                           \
2175     int nbytes, nchars;                                                 \
2176                                                                         \
2177     ONE_MORE_BYTE (c);                                                  \
2178     if (c < 0)                                                          \
2179       goto invalid_code;                                                \
2180     nbytes = c - 0xA0;                                                  \
2181     if (nbytes < 3)                                                     \
2182       goto invalid_code;                                                \
2183     ONE_MORE_BYTE (c);                                                  \
2184     if (c < 0)                                                          \
2185       goto invalid_code;                                                \
2186     nchars = c - 0xA0;                                                  \
2187     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2188     consumed_chars_limit = consumed_chars_base + nbytes;                \
2189     if (method != COMPOSITION_RELATIVE)                                 \
2190       {                                                                 \
2191         int i = 0;                                                      \
2192         while (consumed_chars < consumed_chars_limit)                   \
2193           {                                                             \
2194             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
2195               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
2196             else                                                        \
2197               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
2198             i++;                                                        \
2199           }                                                             \
2200         if (consumed_chars < consumed_chars_limit)                      \
2201           goto invalid_code;                                            \
2202         charbuf_base[0] -= i;                                           \
2203       }                                                                 \
2204   } while (0)
2205
2206
     /* Decode an Emacs 20 style relative composition.  SRC is reset to
        `src_base' and the 0x80 byte is skipped; then up to
        MAX_COMPOSITION_COMPONENTS component characters are decoded for
        as long as the byte at SRC is 0xA0 or greater.  Fewer than two
        components is an error (jump to `invalid_code').  The
        composition annotation is added to `charbuf', followed by the
        decoded characters.  C is overwritten.

        NOTE(review): `*src' is read in the loop condition without
        comparing SRC against SRC_END first -- confirm that the source
        buffer is guaranteed to be readable there.  */
2207 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)                    \
2208   do {                                                                  \
2209     /* Emacs 20 style format for relative composition.  */              \
2210     /* Store multibyte form of characters to be composed.  */           \
2211     enum composition_method method = COMPOSITION_RELATIVE;              \
2212     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
2213     int *buf = components;                                              \
2214     int i, j;                                                           \
2215                                                                         \
2216     src = src_base;                                                     \
2217     ONE_MORE_BYTE (c);          /* skip 0x80 */                         \
2218     for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++)    \
2219       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                         \
2220     if (i < 2)                                                          \
2221       goto invalid_code;                                                \
2222     ADD_COMPOSITION_DATA (charbuf, i, method);                          \
2223     for (j = 0; j < i; j++)                                             \
2224       *charbuf++ = components[j];                                       \
2225   } while (0)
2226
2227
     /* Decode an Emacs 20 style rule-base composition, which has the
        form CHAR RULE CHAR RULE ... CHAR.  The components are first
        collected in a local array: one character, then RULE/CHAR pairs
        for as long as the byte at SRC is 0xA0 or greater, up to
        MAX_COMPOSITION_COMPONENTS characters.  It is an error (jump to
        `invalid_code') unless at least two characters were found and
        the number of collected components is odd.  If `charbuf' does
        not have enough room, jump to `no_more_source'.  Otherwise add
        the composition annotation, append all components (characters
        and rules), subtract their count from the first annotation
        element, and then append the characters alone once more.  The
        argument C is not used.  */
2228 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2229   do {                                                          \
2230     /* Emacs 20 style format for rule-base composition.  */     \
2231     /* Store multibyte form of characters to be composed.  */   \
2232     enum composition_method method = COMPOSITION_WITH_RULE;     \
2233     int *charbuf_base = charbuf;                                \
2234     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2235     int *buf = components;                                      \
2236     int i, j;                                                   \
2237                                                                 \
2238     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
2239     for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2240       {                                                         \
2241         if (*src < 0xA0)                                        \
2242           break;                                                \
2243         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2244         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2245       }                                                         \
2246     if (i <= 1 || (buf - components) % 2 == 0)                  \
2247       goto invalid_code;                                        \
2248     if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
2249       goto no_more_source;                                      \
2250     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2251     i = i * 2 - 1;                                              \
2252     for (j = 0; j < i; j++)                                     \
2253       *charbuf++ = components[j];                               \
2254     charbuf_base[0] -= i;                                       \
2255     for (j = 0; j < i; j += 2)                                  \
2256       *charbuf++ = components[j];                               \
2257   } while (0)
2258
2259
     /* Decode the emacs-mule bytes of CODING->source, starting at offset
        CODING->consumed, into character codes appended to
        CODING->charbuf.  Composition sequences introduced by 0x80 are
        turned into annotations by the DECODE_EMACS_MULE_* macros, and
        runs of characters of the same non-ASCII charset are recorded
        with ADD_CHARSET_DATA.  A byte that does not start a valid
        sequence is emitted on its own (as a raw byte if it is not
        ASCII) and counted in CODING->errors.  On exit CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated.  */
2260 static void
2261 decode_coding_emacs_mule (coding)
2262      struct coding_system *coding;
2263 {
2264   const unsigned char *src = coding->source + coding->consumed;
2265   const unsigned char *src_end = coding->source + coding->src_bytes;
2266   const unsigned char *src_base;
2267   int *charbuf = coding->charbuf + coding->charbuf_used;
       /* Keep MAX_ANNOTATION_LENGTH elements in reserve at the end of
          the buffer.  */
2268   int *charbuf_end
2269     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2270   int consumed_chars = 0, consumed_chars_base;
2271   int multibytep = coding->src_multibyte;
2272   Lisp_Object attrs, charset_list;
       /* CHAR_OFFSET counts the characters produced so far; LAST_OFFSET
          is its value where the current run of charset LAST_ID
          began.  */
2273   int char_offset = coding->produced_char;
2274   int last_offset = char_offset;
2275   int last_id = charset_ascii;
2276   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when decoding CRLF text; -1 means
          nothing is pending.  */
2277   int byte_after_cr = -1;
2278
2279   CODING_GET_INFO (coding, attrs, charset_list);
2280
2281   while (1)
2282     {
2283       int c;
2284
           /* Remember where this sequence starts, both for error
              recovery and for reporting how much was consumed.  */
2285       src_base = src;
2286       consumed_chars_base = consumed_chars;
2287
2288       if (charbuf >= charbuf_end)
2289         break;
2290
2291       if (byte_after_cr >= 0)
2292         c = byte_after_cr, byte_after_cr = -1;
2293       else
2294         ONE_MORE_BYTE (c);
2295       if (c < 0)
2296         {
               /* A negative value is stored negated as a character.  */
2297           *charbuf++ = -c;
2298           char_offset++;
2299         }
2300       else if (c < 0x80)
2301         {
               /* ASCII.  After a CR in CRLF text, read the next byte
                  before storing the CR.  */
2302           if (eol_crlf && c == '\r')
2303             ONE_MORE_BYTE (byte_after_cr);
2304           *charbuf++ = c;
2305           char_offset++;
2306         }
2307       else if (c == 0x80)
2308         {
               /* Composition data; the next byte selects the format.  */
2309           ONE_MORE_BYTE (c);
2310           if (c < 0)
2311             goto invalid_code;
2312           if (c - 0xF2 >= COMPOSITION_RELATIVE
2313               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2314             DECODE_EMACS_MULE_21_COMPOSITION (c);
2315           else if (c < 0xC0)
2316             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2317           else if (c == 0xFF)
2318             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2319           else
2320             goto invalid_code;
2321         }
2322       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2323         {
               /* A leading code of a multibyte sequence: rewind and let
                  emacs_mule_char decode the whole sequence.  */
2324           int nbytes, nchars;
2325           int id;
2326
2327           src = src_base;
2328           consumed_chars = consumed_chars_base;
2329           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2330           if (c < 0)
2331             {
                   /* -2 means the sequence is incomplete; stop here and
                      leave it unconsumed.  */
2332               if (c == -2)
2333                 break;
2334               goto invalid_code;
2335             }
2336           if (last_id != id)
2337             {
                   /* The charset changed: close the previous run.  */
2338               if (last_id != charset_ascii)
2339                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2340               last_id = id;
2341               last_offset = char_offset;
2342             }
2343           *charbuf++ = c;
2344           src += nbytes;
2345           consumed_chars += nchars;
2346           char_offset++;
2347         }
2348       else
2349         goto invalid_code;
2350       continue;
2351
2352     invalid_code:
           /* Rewind to the start of the bad sequence and emit just its
              first byte; decoding resumes at the following byte.  */
2353       src = src_base;
2354       consumed_chars = consumed_chars_base;
2355       ONE_MORE_BYTE (c);
2356       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2357       char_offset++;
2358       coding->errors++;
2359     }
2360
2361  no_more_source:
       /* Close the charset run that is still open, if any.  */
2362   if (last_id != charset_ascii)
2363     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2364   coding->consumed_char += consumed_chars_base;
2365   coding->consumed = src_base - coding->source;
2366   coding->charbuf_used = charbuf - coding->charbuf;
2367 }
2368
2369
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the emacs-mule charset ID.

   An ID below 0xA0 is its own single leading code; CODES[1] is then
   set to 0 to say that no second leading code follows.  For a larger
   ID, CODES[0] gets a prefix byte selected by the range of ID (0x9A
   for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for 0xF0..0xF4, 0x9D for
   0xF5 and above) and CODES[1] gets ID itself.

   ID is evaluated more than once, so it must be free of side effects.
   The expansion intentionally has no trailing semicolon; the caller
   supplies it, which keeps the macro usable as the body of an
   `if ... else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2383
2384
2385 static int
2386 encode_coding_emacs_mule (coding)
2387      struct coding_system *coding;
2388 {
2389   int multibytep = coding->dst_multibyte;
2390   int *charbuf = coding->charbuf;
2391   int *charbuf_end = charbuf + coding->charbuf_used;
2392   unsigned char *dst = coding->destination + coding->produced;
2393   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2394   int safe_room = 8;
2395   int produced_chars = 0;
2396   Lisp_Object attrs, charset_list;
2397   int c;
2398   int preferred_charset_id = -1;
2399
2400   CODING_GET_INFO (coding, attrs, charset_list);
2401   if (! EQ (charset_list, Vemacs_mule_charset_list))
2402     {
2403       CODING_ATTR_CHARSET_LIST (attrs)
2404         = charset_list = Vemacs_mule_charset_list;
2405     }
2406
2407   while (charbuf < charbuf_end)
2408     {
2409       ASSURE_DESTINATION (safe_room);
2410       c = *charbuf++;
2411
2412       if (c < 0)
2413         {
2414           /* Handle an annotation.  */
2415           switch (*charbuf)
2416             {
2417             case CODING_ANNOTATE_COMPOSITION_MASK:
2418               /* Not yet implemented.  */
2419               break;
2420             case CODING_ANNOTATE_CHARSET_MASK:
2421               preferred_charset_id = charbuf[3];
2422               if (preferred_charset_id >= 0
2423                   && NILP (Fmemq (make_number (preferred_charset_id),
2424                                   charset_list)))
2425                 preferred_charset_id = -1;
2426               break;
2427             default:
2428               abort ();
2429             }
2430           charbuf += -c - 1;
2431           continue;
2432         }
2433
2434       if (ASCII_CHAR_P (c))
2435         EMIT_ONE_ASCII_BYTE (c);
2436       else if (CHAR_BYTE8_P (c))
2437         {
2438           c = CHAR_TO_BYTE8 (c);
2439           EMIT_ONE_BYTE (c);
2440         }
2441       else
2442         {
2443           struct charset *charset;
2444           unsigned code;
2445           int dimension;
2446           int emacs_mule_id;
2447           unsigned char leading_codes[2];
2448
2449           if (preferred_charset_id >= 0)
2450             {
2451               charset = CHARSET_FROM_ID (preferred_charset_id);
2452               if (! CHAR_CHARSET_P (c, charset))
2453                 charset = char_charset (c, charset_list, NULL);
2454             }
2455           else
2456             charset = char_charset (c, charset_list, &code);
2457           if (! charset)
2458             {
2459               c = coding->default_char;
2460               if (ASCII_CHAR_P (c))
2461                 {
2462                   EMIT_ONE_ASCII_BYTE (c);
2463                   continue;
2464                 }
2465               charset = char_charset (c, charset_list, &code);
2466             }
2467           dimension = CHARSET_DIMENSION (charset);
2468           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2469           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2470           EMIT_ONE_BYTE (leading_codes[0]);
2471           if (leading_codes[1])
2472             EMIT_ONE_BYTE (leading_codes[1]);
2473           if (dimension == 1)
2474             EMIT_ONE_BYTE (code | 0x80);
2475           else
2476             {
2477               code |= 0x8080;
2478               EMIT_ONE_BYTE (code >> 8);
2479               EMIT_ONE_BYTE (code & 0xFF);
2480             }
2481         }
2482     }
2483   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2484   coding->produced_char += produced_chars;
2485   coding->produced = dst - coding->destination;
2486   return 0;
2487 }
2488
2489 \f
2490 /*** 7. ISO2022 handlers ***/
2491
2492 /* The following note describes the coding system ISO2022 briefly.
2493    Since the intention of this note is to help understand the
2494    functions in this file, some parts are NOT ACCURATE or are OVERLY
2495    SIMPLIFIED.  For thorough understanding, please refer to the
2496    original document of ISO2022.  This is equivalent to the standard
2497    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2498
2499    ISO2022 provides many mechanisms to encode several character sets
2500    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2501    is encoded using bytes less than 128.  This may make the encoded
2502    text a little bit longer, but the text passes more easily through
2503    several types of gateway, some of which strip off the MSB (Most
2504    Significant Bit).
2505
2506    There are two kinds of character sets: control character sets and
2507    graphic character sets.  The former contain control characters such
2508    as `newline' and `escape' to provide control functions (control
2509    functions are also provided by escape sequences).  The latter
2510    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2511    two control character sets and many graphic character sets.
2512
2513    Graphic character sets are classified into one of the following
2514    four classes, according to the number of bytes (DIMENSION) and
2515    number of characters in one dimension (CHARS) of the set:
2516    - DIMENSION1_CHARS94
2517    - DIMENSION1_CHARS96
2518    - DIMENSION2_CHARS94
2519    - DIMENSION2_CHARS96
2520
2521    In addition, each character set is assigned an identification tag,
2522    unique for each set, called the "final character" (denoted as <F>
2523    hereafter).  The <F> of each character set is decided by ECMA(*)
2524    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2525    (0x30..0x3F are for private use only).
2526
2527    Note (*): ECMA = European Computer Manufacturers Association
2528
2529    Here are examples of graphic character sets [NAME(<F>)]:
2530         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2531         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2532         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2533         o DIMENSION2_CHARS96 -- none for the moment
2534
2535    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2536         C0 [0x00..0x1F] -- control character plane 0
2537         GL [0x20..0x7F] -- graphic character plane 0
2538         C1 [0x80..0x9F] -- control character plane 1
2539         GR [0xA0..0xFF] -- graphic character plane 1
2540
2541    A control character set is directly designated and invoked to C0 or
2542    C1 by an escape sequence.  The most common case is that:
2543    - ISO646's  control character set is designated/invoked to C0, and
2544    - ISO6429's control character set is designated/invoked to C1,
2545    and usually these designations/invocations are omitted in encoded
2546    text.  In a 7-bit environment, only C0 can be used, and a control
2547    character for C1 is encoded by an appropriate escape sequence to
2548    fit into the environment.  All control characters for C1 are
2549    defined to have corresponding escape sequences.
2550
2551    A graphic character set is at first designated to one of four
2552    graphic registers (G0 through G3), then these graphic registers are
2553    invoked to GL or GR.  These designations and invocations can be
2554    done independently.  The most common case is that G0 is invoked to
2555    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2556    these invocations and designations are omitted in encoded text.
2557    In a 7-bit environment, only GL can be used.
2558
2559    When a graphic character set of CHARS94 is invoked to GL, codes
2560    0x20 and 0x7F of the GL area work as control characters SPACE and
2561    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2562    be used.
2563
2564    There are two ways of invocation: locking-shift and single-shift.
2565    With locking-shift, the invocation lasts until the next different
2566    invocation, whereas with single-shift, the invocation affects the
2567    following character only and doesn't affect the locking-shift
2568    state.  Invocations are done by the following control characters or
2569    escape sequences:
2570
2571    ----------------------------------------------------------------------
2572    abbrev  function                  cntrl escape seq   description
2573    ----------------------------------------------------------------------
2574    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2575    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2576    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2577    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2578    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2579    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2580    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2581    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2582    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2583    ----------------------------------------------------------------------
2584    (*) These are not used by any known coding system.
2585
2586    Control characters for these functions are defined by macros
2587    ISO_CODE_XXX in `coding.h'.
2588
2589    Designations are done by the following escape sequences:
2590    ----------------------------------------------------------------------
2591    escape sequence      description
2592    ----------------------------------------------------------------------
2593    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2594    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2595    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2596    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2597    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2598    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2599    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2600    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2601    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2602    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2603    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2604    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2605    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2606    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2607    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2608    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2609    ----------------------------------------------------------------------
2610
2611    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2612    of dimension 1, chars 94, and final character <F>, etc...
2613
2614    Note (*): Although these designations are not allowed in ISO2022,
2615    Emacs accepts them on decoding, and produces them on encoding
2616    CHARS96 character sets in a coding system which is characterized as
2617    7-bit environment, non-locking-shift, and non-single-shift.
2618
2619    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2620    '(' must be omitted.  We refer to this as "short-form" hereafter.
2621
2622    Now you may notice that there are a lot of ways of encoding the
2623    same multilingual text in ISO2022.  Actually, there exist many
2624    coding systems such as Compound Text (used in X11's inter client
2625    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2626    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2627    localized platforms), and all of these are variants of ISO2022.
2628
2629    In addition to the above, Emacs handles two more kinds of escape
2630    sequences: ISO6429's direction specification and Emacs' private
2631    sequence for specifying character composition.
2632
2633    ISO6429's direction specification takes the following form:
2634         o CSI ']'      -- end of the current direction
2635         o CSI '0' ']'  -- end of the current direction
2636         o CSI '1' ']'  -- start of left-to-right text
2637         o CSI '2' ']'  -- start of right-to-left text
2638    The control character CSI (0x9B: control sequence introducer) is
2639    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2640
2641    Character composition specification takes the following form:
2642         o ESC '0' -- start relative composition
2643         o ESC '1' -- end composition
2644         o ESC '2' -- start rule-base composition (*)
2645         o ESC '3' -- start relative composition with alternate chars  (**)
2646         o ESC '4' -- start rule-base composition with alternate chars  (**)
2647   Since these are not standard escape sequences of any ISO standard,
2648   the use of them with these meanings is restricted to Emacs only.
2649
2650   (*) This form is used only in Emacs 20.7 and older versions,
2651   but newer versions can safely decode it.
2652   (**) This form is used only in Emacs 21.1 and newer versions,
2653   and older versions can't decode it.
2654
2655   Here's a list of example usages of these composition escape
2656   sequences (categorized by `enum composition_method').
2657
2658   COMPOSITION_RELATIVE:
2659         ESC 0 CHAR [ CHAR ] ESC 1
2660   COMPOSITION_WITH_RULE:
2661         ESC 2 CHAR [ RULE CHAR ] ESC 1
2662   COMPOSITION_WITH_ALTCHARS:
2663         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2664   COMPOSITION_WITH_RULE_ALTCHARS:
2665         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2666
     /* Table of 256 `enum iso_code_class_type' entries, i.e. one per
        possible byte value.  NOTE(review): presumably it maps each byte
        to its ISO 2022 code class and is filled in elsewhere in this
        file -- confirm.  */
2667 enum iso_code_class_type iso_code_class[256];
2668
/* Return nonzero if the charset whose ID is ID is safe for CODING.

   CODING->safe_charsets is a byte table indexed by charset ID with
   CODING->max_charset_id as its last valid index.  An entry holds the
   graphic register to use for the charset, or 255 if the charset is
   not safe.  The entry is examined as an `unsigned char' so that the
   255 marker is recognized even where plain `char' is unsigned.  An ID
   outside 0..max_charset_id is never safe.

   ID is evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0                                                    \
   && (id) <= (coding)->max_charset_id                          \
   && (unsigned char) (coding)->safe_charsets[(id)] < 0x80)
2672
2673
     /* Nonzero if, for the coding system registered in
        coding_categories[CATEGORY], CODING_ISO_INITIAL for graphic
        register 1 is non-negative.  NOTE(review): presumably this means
        a charset is initially designated to G1, so a shift-out (SO) is
        meaningful for that category -- confirm.  */
2674 #define SHIFT_OUT_OK(category)  \
2675   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2676
2677 static void
2678 setup_iso_safe_charsets (attrs)
2679      Lisp_Object attrs;
2680 {
2681   Lisp_Object charset_list, safe_charsets;
2682   Lisp_Object request;
2683   Lisp_Object reg_usage;
2684   Lisp_Object tail;
2685   int reg94, reg96;
2686   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2687   int max_charset_id;
2688
2689   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2690   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2691       && ! EQ (charset_list, Viso_2022_charset_list))
2692     {
2693       CODING_ATTR_CHARSET_LIST (attrs)
2694         = charset_list = Viso_2022_charset_list;
2695       ASET (attrs, coding_attr_safe_charsets, Qnil);
2696     }
2697
2698   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2699     return;
2700
2701   max_charset_id = 0;
2702   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2703     {
2704       int id = XINT (XCAR (tail));
2705       if (max_charset_id < id)
2706         max_charset_id = id;
2707     }
2708
2709   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2710                                 make_number (255));
2711   request = AREF (attrs, coding_attr_iso_request);
2712   reg_usage = AREF (attrs, coding_attr_iso_usage);
2713   reg94 = XINT (XCAR (reg_usage));
2714   reg96 = XINT (XCDR (reg_usage));
2715
2716   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2717     {
2718       Lisp_Object id;
2719       Lisp_Object reg;
2720       struct charset *charset;
2721
2722       id = XCAR (tail);
2723       charset = CHARSET_FROM_ID (XINT (id));
2724       reg = Fcdr (Fassq (id, request));
2725       if (! NILP (reg))
2726         SSET (safe_charsets, XINT (id), XINT (reg));
2727       else if (charset->iso_chars_96)
2728         {
2729           if (reg96 < 4)
2730             SSET (safe_charsets, XINT (id), reg96);
2731         }
2732       else
2733         {
2734           if (reg94 < 4)
2735             SSET (safe_charsets, XINT (id), reg94);
2736         }
2737     }
2738   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2739 }
2740
2741
2742 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2743    Check if a text is encoded in one of ISO-2022 based codig systems.
2744    If it is, return 1, else return 0.  */
2745
2746 static int
2747 detect_coding_iso_2022 (coding, detect_info)
2748      struct coding_system *coding;
2749      struct coding_detection_info *detect_info;
2750 {
2751   const unsigned char *src = coding->source, *src_base = src;
2752   const unsigned char *src_end = coding->source + coding->src_bytes;
2753   int multibytep = coding->src_multibyte;
2754   int single_shifting = 0;
2755   int id;
2756   int c, c1;
2757   int consumed_chars = 0;
2758   int i;
2759   int rejected = 0;
2760   int found = 0;
2761
2762   detect_info->checked |= CATEGORY_MASK_ISO;
2763
2764   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2765     {
2766       struct coding_system *this = &(coding_categories[i]);
2767       Lisp_Object attrs, val;
2768
2769       if (this->id < 0)
2770         continue;
2771       attrs = CODING_ID_ATTRS (this->id);
2772       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2773           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2774         setup_iso_safe_charsets (attrs);
2775       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2776       this->max_charset_id = SCHARS (val) - 1;
2777       this->safe_charsets = (char *) SDATA (val);
2778     }
2779
2780   /* A coding system of this category is always ASCII compatible.  */
2781   src += coding->head_ascii;
2782
2783   while (rejected != CATEGORY_MASK_ISO)
2784     {
2785       src_base = src;
2786       ONE_MORE_BYTE (c);
2787       switch (c)
2788         {
2789         case ISO_CODE_ESC:
2790           if (inhibit_iso_escape_detection)
2791             break;
2792           single_shifting = 0;
2793           ONE_MORE_BYTE (c);
2794           if (c >= '(' && c <= '/')
2795             {
2796               /* Designation sequence for a charset of dimension 1.  */
2797               ONE_MORE_BYTE (c1);
2798               if (c1 < ' ' || c1 >= 0x80
2799                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2800                 /* Invalid designation sequence.  Just ignore.  */
2801                 break;
2802             }
2803           else if (c == '$')
2804             {
2805               /* Designation sequence for a charset of dimension 2.  */
2806               ONE_MORE_BYTE (c);
2807               if (c >= '@' && c <= 'B')
2808                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2809                 id = iso_charset_table[1][0][c];
2810               else if (c >= '(' && c <= '/')
2811                 {
2812                   ONE_MORE_BYTE (c1);
2813                   if (c1 < ' ' || c1 >= 0x80
2814                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2815                     /* Invalid designation sequence.  Just ignore.  */
2816                     break;
2817                 }
2818               else
2819                 /* Invalid designation sequence.  Just ignore it.  */
2820                 break;
2821             }
2822           else if (c == 'N' || c == 'O')
2823             {
2824               /* ESC <Fe> for SS2 or SS3.  */
2825               single_shifting = 1;
2826               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2827               break;
2828             }
2829           else if (c >= '0' && c <= '4')
2830             {
2831               /* ESC <Fp> for start/end composition.  */
2832               found |= CATEGORY_MASK_ISO;
2833               break;
2834             }
2835           else
2836             {
2837               /* Invalid escape sequence.  Just ignore it.  */
2838               break;
2839             }
2840
2841           /* We found a valid designation sequence for CHARSET.  */
2842           rejected |= CATEGORY_MASK_ISO_8BIT;
2843           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2844                               id))
2845             found |= CATEGORY_MASK_ISO_7;
2846           else
2847             rejected |= CATEGORY_MASK_ISO_7;
2848           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2849                               id))
2850             found |= CATEGORY_MASK_ISO_7_TIGHT;
2851           else
2852             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2853           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2854                               id))
2855             found |= CATEGORY_MASK_ISO_7_ELSE;
2856           else
2857             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2858           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2859                               id))
2860             found |= CATEGORY_MASK_ISO_8_ELSE;
2861           else
2862             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2863           break;
2864
2865         case ISO_CODE_SO:
2866         case ISO_CODE_SI:
2867           /* Locking shift out/in.  */
2868           if (inhibit_iso_escape_detection)
2869             break;
2870           single_shifting = 0;
2871           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2872           break;
2873
2874         case ISO_CODE_CSI:
2875           /* Control sequence introducer.  */
2876           single_shifting = 0;
2877           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2878           found |= CATEGORY_MASK_ISO_8_ELSE;
2879           goto check_extra_latin;
2880
2881         case ISO_CODE_SS2:
2882         case ISO_CODE_SS3:
2883           /* Single shift.   */
2884           if (inhibit_iso_escape_detection)
2885             break;
2886           single_shifting = 0;
2887           rejected |= CATEGORY_MASK_ISO_7BIT;
2888           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2889               & CODING_ISO_FLAG_SINGLE_SHIFT)
2890             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2891           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2892               & CODING_ISO_FLAG_SINGLE_SHIFT)
2893             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2894           if (single_shifting)
2895             break;
2896           goto check_extra_latin;
2897
2898         default:
2899           if (c < 0)
2900             continue;
2901           if (c < 0x80)
2902             {
2903               single_shifting = 0;
2904               break;
2905             }
2906           if (c >= 0xA0)
2907             {
2908               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2909               found |= CATEGORY_MASK_ISO_8_1;
2910               /* Check the length of succeeding codes of the range
2911                  0xA0..0FF.  If the byte length is even, we include
2912                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2913                  only when we are not single shifting.  */
2914               if (! single_shifting
2915                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2916                 {
2917                   int i = 1;
2918                   while (src < src_end)
2919                     {
2920                       ONE_MORE_BYTE (c);
2921                       if (c < 0xA0)
2922                         break;
2923                       i++;
2924                     }
2925
2926                   if (i & 1 && src < src_end)
2927                     rejected |= CATEGORY_MASK_ISO_8_2;
2928                   else
2929                     found |= CATEGORY_MASK_ISO_8_2;
2930                 }
2931               break;
2932             }
2933         check_extra_latin:
2934           single_shifting = 0;
2935           if (! VECTORP (Vlatin_extra_code_table)
2936               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2937             {
2938               rejected = CATEGORY_MASK_ISO;
2939               break;
2940             }
2941           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2942               & CODING_ISO_FLAG_LATIN_EXTRA)
2943             found |= CATEGORY_MASK_ISO_8_1;
2944           else
2945             rejected |= CATEGORY_MASK_ISO_8_1;
2946           rejected |= CATEGORY_MASK_ISO_8_2;
2947         }
2948     }
2949   detect_info->rejected |= CATEGORY_MASK_ISO;
2950   return 0;
2951
2952  no_more_source:
2953   detect_info->rejected |= rejected;
2954   detect_info->found |= (found & ~rejected);
2955   return 1;
2956 }
2957
2958
2959 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2960    escape sequence should be kept.
     
        REG is the graphic register being designated; DIM, CHARS_96 and
        FINAL are the keys handed to ISO_CHARSET_TABLE to find the charset
        ID.  The macro uses a variable `coding' from the enclosing scope,
        and CHARS_96 must be an lvalue because it is assigned to.
     
        If FINAL is below '0' or not below 128, or no charset is found, or
        the charset is not safe for CODING (SAFE_CHARSET_P), the
        designation of REG is set to -2, CHARS_96 is set to -1 and nothing
        else is done.
     
        Otherwise the charset ID is stored as the designation of REG, after
        replacing charset_jisx0201_roman with charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set and charset_jisx0208_1978 with
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.
     
        NOTE(review): FINAL is used unparenthesized in comparisons, so
        callers should pass a simple expression.  */
2961 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2962   do {                                                                  \
2963     int id, prev;                                                       \
2964                                                                         \
2965     if (final < '0' || final >= 128                                     \
2966         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2967         || !SAFE_CHARSET_P (coding, id))                                \
2968       {                                                                 \
2969         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2970         chars_96 = -1;                                                  \
2971         break;                                                          \
2972       }                                                                 \
2973     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2974     if (id == charset_jisx0201_roman)                                   \
2975       {                                                                 \
2976         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2977           id = charset_ascii;                                           \
2978       }                                                                 \
2979     else if (id == charset_jisx0208_1978)                               \
2980       {                                                                 \
2981         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2982           id = charset_jisx0208;                                        \
2983       }                                                                 \
2984     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2985     /* If there was an invalid designation to REG previously, and this  \
2986        designation is ASCII to REG, we should keep this designation     \
2987        sequence.  */                                                    \
2988     if (prev == -2 && id == charset_ascii)                              \
2989       chars_96 = -1;                                                    \
2990   } while (0)
2991
2992
     /* If a composition is in progress (composition_state is not
        COMPOSING_NO), abandon it: reset composition_state to COMPOSING_NO
        and flush the components collected so far into the char buffer as
        ordinary characters.
     
        The macro works on variables of the enclosing scope:
        composition_state, method, components, component_idx, charbuf,
        charbuf_end and char_offset.  It jumps to the label
        `no_more_source' if CHARBUF lacks room for COMPONENT_IDX elements.
     
        For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every
        component is copied.  For the other methods only the components at
        even indices are copied; presumably the odd ones are composition
        rules.
     
        NOTE(review): in the latter case CHAR_OFFSET is advanced by
        COMPONENT_IDX / 2 + 1, which equals the number of elements copied
        only when COMPONENT_IDX is odd -- confirm that this is intended.  */
2993 #define MAYBE_FINISH_COMPOSITION()                              \
2994   do {                                                          \
2995     int i;                                                      \
2996     if (composition_state == COMPOSING_NO)                      \
2997       break;                                                    \
2998     /* It is assured that we have enough room for producing     \
2999        characters stored in the table `components'.  */         \
3000     if (charbuf + component_idx > charbuf_end)                  \
3001       goto no_more_source;                                      \
3002     composition_state = COMPOSING_NO;                           \
3003     if (method == COMPOSITION_RELATIVE                          \
3004         || method == COMPOSITION_WITH_ALTCHARS)                 \
3005       {                                                         \
3006         for (i = 0; i < component_idx; i++)                     \
3007           *charbuf++ = components[i];                           \
3008         char_offset += component_idx;                           \
3009       }                                                         \
3010     else                                                        \
3011       {                                                         \
3012         for (i = 0; i < component_idx; i += 2)                  \
3013           *charbuf++ = components[i];                           \
3014         char_offset += (component_idx / 2) + 1;                 \
3015       }                                                         \
3016   } while (0)
3017
3018
3019 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3020    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3021    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3022    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3023    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3024   */
     
     /* C1 is the character that followed ESC ('0', '2', '3' or '4').  The
        macro works on variables of the enclosing scope (composition_state,
        method, component_idx, component_len, charbuf, charbuf_end, src,
        src_end, coding) and may jump to the label `no_more_source'.
     
        If C1 is '0' while composition_state is COMPOSING_COMPONENT_RULE,
        the sequence does not start a new composition: the number of
        components read so far is saved in COMPONENT_LEN and the state
        becomes COMPOSING_CHAR.
     
        Otherwise any composition in progress is finished first
        (MAYBE_FINISH_COMPOSITION).  Then control jumps to `no_more_source'
        if CHARBUF has no room for MAX_COMPOSITION_COMPONENTS elements, or,
        after recording CODING_RESULT_INSUFFICIENT_SRC, if no terminating
        ESC '1' is found in the rest of the source.  Finally METHOD and
        COMPOSITION_STATE are set according to C1 and the component
        counters are reset.
     
        NOTE(review): the "not found" test is P == SRC_END - 1, so if SRC
        is already at SRC_END the search loop does not run and the
        sequence is accepted without a terminator being seen -- confirm
        whether callers can reach that state.  */
3025
3026 #define DECODE_COMPOSITION_START(c1)                                    \
3027   do {                                                                  \
3028     if (c1 == '0'                                                       \
3029         && composition_state == COMPOSING_COMPONENT_RULE)               \
3030       {                                                                 \
3031         component_len = component_idx;                                  \
3032         composition_state = COMPOSING_CHAR;                             \
3033       }                                                                 \
3034     else                                                                \
3035       {                                                                 \
3036         const unsigned char *p;                                         \
3037                                                                         \
3038         MAYBE_FINISH_COMPOSITION ();                                    \
3039         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
3040           goto no_more_source;                                          \
3041         for (p = src; p < src_end - 1; p++)                             \
3042           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
3043             break;                                                      \
3044         if (p == src_end - 1)                                           \
3045           {                                                             \
3046             /* The current composition doesn't end in the current       \
3047                source.  */                                              \
3048             record_conversion_result                                    \
3049               (coding, CODING_RESULT_INSUFFICIENT_SRC);                 \
3050             goto no_more_source;                                        \
3051           }                                                             \
3052                                                                         \
3053         /* This is surely the start of a composition.  */               \
3054         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
3055                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
3056                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
3057                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
3058         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
3059                              : COMPOSING_COMPONENT_CHAR);               \
3060         component_idx = component_len = 0;                              \
3061       }                                                                 \
3062   } while (0)
3063
3064
/* Handle the composition end sequence ESC 1.

   Append to CHARBUF a composition annotation (produced by
   ADD_COMPOSITION_DATA), then, unless the method is
   COMPOSITION_RELATIVE, the collected components, and finally the
   composed characters themselves.  CHAR_OFFSET is advanced once per
   composed character, CODING->annotated is set, and
   COMPOSITION_STATE returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    /* First slot of the annotation; its value is rewritten below      \
       once the components have been appended.  */                     \
    int *annotation_head = charbuf;                                     \
    int nchars, i;                                                      \
                                                                        \
    /* Number of characters being composed.  */                         \
    if (component_len > 0)                                              \
      nchars = component_idx - component_len;                           \
    else if (method == COMPOSITION_RELATIVE)                            \
      nchars = component_idx;                                           \
    else                                                                \
      nchars = (component_idx + 1) / 2;                                 \
                                                                        \
    ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
    if (method != COMPOSITION_RELATIVE)                                 \
      {                                                                 \
        /* Without an explicit component list, everything collected    \
           so far serves as the components.  */                        \
        int ncomponents = (component_len == 0                           \
                           ? component_idx : component_len);            \
                                                                        \
        for (i = 0; i < ncomponents; i++)                               \
          *charbuf++ = components[i];                                   \
        *annotation_head = annotation_head - charbuf;                   \
      }                                                                 \
    {                                                                   \
      /* With COMPOSITION_WITH_RULE, characters and rules alternate    \
         in COMPONENTS, so take every second entry from the start;     \
         otherwise the characters follow the component list.  */       \
      int with_rule = method == COMPOSITION_WITH_RULE;                  \
      int step = with_rule ? 2 : 1;                                     \
                                                                        \
      for (i = with_rule ? 0 : component_len; i < component_idx;        \
           i += step)                                                   \
        {                                                               \
          *charbuf++ = components[i];                                   \
          char_offset++;                                                \
        }                                                               \
    }                                                                   \
    coding->annotated = 1;                                              \
    composition_state = COMPOSING_NO;                                   \
  } while (0)
3095
3096
/* Decode a composition rule from the byte C1 (and, for the new
   format, one more byte read from SRC into the variable C2 of the
   expanding function).  The encoded rule, as produced by
   COMPOSITION_ENCODE_RULE, is stored back in C1; C1 becomes 0 if
   the byte is outside both rule formats.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    (c1) -= 32;                                                         \
    if (c1 >= 93)                                                       \
      c1 = 0;                                                           \
    else if (c1 >= 81)          /* new format (after ver.21) */         \
      {                                                                 \
        ONE_MORE_BYTE (c2);                                             \
        c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        /* The old format packs two reference points, each in 0..8,    \
           into one byte; the value 4 is mapped to 10.  */             \
        int global_ref = (c1) / 9;                                      \
        int new_ref = (c1) % 9;                                         \
                                                                        \
        if (global_ref == 4)                                            \
          global_ref = 10;                                              \
        if (new_ref == 4)                                               \
          new_ref = 10;                                                 \
        c1 = COMPOSITION_ENCODE_RULE (global_ref, new_ref);             \
      }                                                                 \
  } while (0)
3120
3121
3122 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3123
3124 static void
3125 decode_coding_iso_2022 (coding)
3126      struct coding_system *coding;
3127 {
3128   const unsigned char *src = coding->source + coding->consumed;
3129   const unsigned char *src_end = coding->source + coding->src_bytes;
3130   const unsigned char *src_base;
3131   int *charbuf = coding->charbuf + coding->charbuf_used;
3132   int *charbuf_end
3133     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
3134   int consumed_chars = 0, consumed_chars_base;
3135   int multibytep = coding->src_multibyte;
3136   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3137   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3138   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3139   int charset_id_2, charset_id_3;
3140   struct charset *charset;
3141   int c;
3142   /* For handling composition sequence.  */
3143 #define COMPOSING_NO                    0
3144 #define COMPOSING_CHAR                  1
3145 #define COMPOSING_RULE                  2
3146 #define COMPOSING_COMPONENT_CHAR        3
3147 #define COMPOSING_COMPONENT_RULE        4
3148
3149   int composition_state = COMPOSING_NO;
3150   enum composition_method method;
3151   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
3152   int component_idx;
3153   int component_len;
3154   Lisp_Object attrs, charset_list;
3155   int char_offset = coding->produced_char;
3156   int last_offset = char_offset;
3157   int last_id = charset_ascii;
3158   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3159   int byte_after_cr = -1;
3160
3161   CODING_GET_INFO (coding, attrs, charset_list);
3162   setup_iso_safe_charsets (attrs);
3163   /* Charset list may have been changed.  */
3164   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3165   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3166
3167   while (1)
3168     {
3169       int c1, c2;
3170
3171       src_base = src;
3172       consumed_chars_base = consumed_chars;
3173
3174       if (charbuf >= charbuf_end)
3175         break;
3176
3177       if (byte_after_cr >= 0)
3178         c1 = byte_after_cr, byte_after_cr = -1;
3179       else
3180         ONE_MORE_BYTE (c1);
3181       if (c1 < 0)
3182         goto invalid_code;
3183
3184       /* We produce at most one character.  */
3185       switch (iso_code_class [c1])
3186         {
3187         case ISO_0x20_or_0x7F:
3188           if (composition_state != COMPOSING_NO)
3189             {
3190               if (composition_state == COMPOSING_RULE
3191                   || composition_state == COMPOSING_COMPONENT_RULE)
3192                 {
3193                   DECODE_COMPOSITION_RULE (c1);
3194                   components[component_idx++] = c1;
3195                   composition_state--;
3196                   continue;
3197                 }
3198             }
3199           if (charset_id_0 < 0
3200               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3201             /* This is SPACE or DEL.  */
3202             charset = CHARSET_FROM_ID (charset_ascii);
3203           else
3204             charset = CHARSET_FROM_ID (charset_id_0);
3205           break;
3206
3207         case ISO_graphic_plane_0:
3208           if (composition_state != COMPOSING_NO)
3209             {
3210               if (composition_state == COMPOSING_RULE
3211                   || composition_state == COMPOSING_COMPONENT_RULE)
3212                 {
3213                   DECODE_COMPOSITION_RULE (c1);
3214                   components[component_idx++] = c1;
3215                   composition_state--;
3216                   continue;
3217                 }
3218             }
3219           if (charset_id_0 < 0)
3220             charset = CHARSET_FROM_ID (charset_ascii);
3221           else
3222             charset = CHARSET_FROM_ID (charset_id_0);
3223           break;
3224
3225         case ISO_0xA0_or_0xFF:
3226           if (charset_id_1 < 0
3227               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3228               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3229             goto invalid_code;
3230           /* This is a graphic character, we fall down ... */
3231
3232         case ISO_graphic_plane_1:
3233           if (charset_id_1 < 0)
3234             goto invalid_code;
3235           charset = CHARSET_FROM_ID (charset_id_1);
3236           break;
3237
3238         case ISO_control_0:
3239           if (eol_crlf && c1 == '\r')
3240             ONE_MORE_BYTE (byte_after_cr);
3241           MAYBE_FINISH_COMPOSITION ();
3242           charset = CHARSET_FROM_ID (charset_ascii);
3243           break;
3244
3245         case ISO_control_1:
3246           MAYBE_FINISH_COMPOSITION ();
3247           goto invalid_code;
3248
3249         case ISO_shift_out:
3250           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3251               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3252             goto invalid_code;
3253           CODING_ISO_INVOCATION (coding, 0) = 1;
3254           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3255           continue;
3256
3257         case ISO_shift_in:
3258           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3259             goto invalid_code;
3260           CODING_ISO_INVOCATION (coding, 0) = 0;
3261           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3262           continue;
3263
3264         case ISO_single_shift_2_7:
3265         case ISO_single_shift_2:
3266           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3267             goto invalid_code;
3268           /* SS2 is handled as an escape sequence of ESC 'N' */
3269           c1 = 'N';
3270           goto label_escape_sequence;
3271
3272         case ISO_single_shift_3:
3273           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3274             goto invalid_code;
3275           /* SS2 is handled as an escape sequence of ESC 'O' */
3276           c1 = 'O';
3277           goto label_escape_sequence;
3278
3279         case ISO_control_sequence_introducer:
3280           /* CSI is handled as an escape sequence of ESC '[' ...  */
3281           c1 = '[';
3282           goto label_escape_sequence;
3283
3284         case ISO_escape:
3285           ONE_MORE_BYTE (c1);
3286         label_escape_sequence:
3287           /* Escape sequences handled here are invocation,
3288              designation, direction specification, and character
3289              composition specification.  */
3290           switch (c1)
3291             {
3292             case '&':           /* revision of following character set */
3293               ONE_MORE_BYTE (c1);
3294               if (!(c1 >= '@' && c1 <= '~'))
3295                 goto invalid_code;
3296               ONE_MORE_BYTE (c1);
3297               if (c1 != ISO_CODE_ESC)
3298                 goto invalid_code;
3299               ONE_MORE_BYTE (c1);
3300               goto label_escape_sequence;
3301
3302             case '$':           /* designation of 2-byte character set */
3303               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3304                 goto invalid_code;
3305               {
3306                 int reg, chars96;
3307
3308                 ONE_MORE_BYTE (c1);
3309                 if (c1 >= '@' && c1 <= 'B')
3310                   {     /* designation of JISX0208.1978, GB2312.1980,
3311                            or JISX0208.1980 */
3312                     reg = 0, chars96 = 0;
3313                   }
3314                 else if (c1 >= 0x28 && c1 <= 0x2B)
3315                   { /* designation of DIMENSION2_CHARS94 character set */
3316                     reg = c1 - 0x28, chars96 = 0;
3317                     ONE_MORE_BYTE (c1);
3318                   }
3319                 else if (c1 >= 0x2C && c1 <= 0x2F)
3320                   { /* designation of DIMENSION2_CHARS96 character set */
3321                     reg = c1 - 0x2C, chars96 = 1;
3322                     ONE_MORE_BYTE (c1);
3323                   }
3324                 else
3325                   goto invalid_code;
3326                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3327                 /* We must update these variables now.  */
3328                 if (reg == 0)
3329                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3330                 else if (reg == 1)
3331                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3332                 if (chars96 < 0)
3333                   goto invalid_code;
3334               }
3335               continue;
3336
3337             case 'n':           /* invocation of locking-shift-2 */
3338               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3339                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3340                 goto invalid_code;
3341               CODING_ISO_INVOCATION (coding, 0) = 2;
3342               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3343               continue;
3344
3345             case 'o':           /* invocation of locking-shift-3 */
3346               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3347                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3348                 goto invalid_code;
3349               CODING_ISO_INVOCATION (coding, 0) = 3;
3350               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3351               continue;
3352
3353             case 'N':           /* invocation of single-shift-2 */
3354               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3355                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3356                 goto invalid_code;
3357               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3358               if (charset_id_2 < 0)
3359                 charset = CHARSET_FROM_ID (charset_ascii);
3360               else
3361                 charset = CHARSET_FROM_ID (charset_id_2);
3362               ONE_MORE_BYTE (c1);
3363               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3364                 goto invalid_code;
3365               break;
3366
3367             case 'O':           /* invocation of single-shift-3 */
3368               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3369                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3370                 goto invalid_code;
3371               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3372               if (charset_id_3 < 0)
3373                 charset = CHARSET_FROM_ID (charset_ascii);
3374               else
3375                 charset = CHARSET_FROM_ID (charset_id_3);
3376               ONE_MORE_BYTE (c1);
3377               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3378                 goto invalid_code;
3379               break;
3380
3381             case '0': case '2': case '3': case '4': /* start composition */
3382               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3383                 goto invalid_code;
3384               DECODE_COMPOSITION_START (c1);
3385               continue;
3386
3387             case '1':           /* end composition */
3388               if (composition_state == COMPOSING_NO)
3389                 goto invalid_code;
3390               DECODE_COMPOSITION_END ();
3391               continue;
3392
3393             case '[':           /* specification of direction */
3394               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3395                 goto invalid_code;
3396               /* For the moment, nested direction is not supported.
3397                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3398                  left-to-right, and nozero means right-to-left.  */
3399               ONE_MORE_BYTE (c1);
3400               switch (c1)
3401                 {
3402                 case ']':       /* end of the current direction */
3403                   coding->mode &= ~CODING_MODE_DIRECTION;
3404
3405                 case '0':       /* end of the current direction */
3406                 case '1':       /* start of left-to-right direction */
3407                   ONE_MORE_BYTE (c1);
3408                   if (c1 == ']')
3409                     coding->mode &= ~CODING_MODE_DIRECTION;
3410                   else
3411                     goto invalid_code;
3412                   break;
3413
3414                 case '2':       /* start of right-to-left direction */
3415                   ONE_MORE_BYTE (c1);
3416                   if (c1 == ']')
3417                     coding->mode |= CODING_MODE_DIRECTION;
3418                   else
3419                     goto invalid_code;
3420                   break;
3421
3422                 default:
3423                   goto invalid_code;
3424                 }
3425               continue;
3426
3427             case '%':
3428               ONE_MORE_BYTE (c1);
3429               if (c1 == '/')
3430                 {
3431                   /* CTEXT extended segment:
3432                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3433                      We keep these bytes as is for the moment.
3434                      They may be decoded by post-read-conversion.  */
3435                   int dim, M, L;
3436                   int size;
3437
3438                   ONE_MORE_BYTE (dim);
3439                   ONE_MORE_BYTE (M);
3440                   ONE_MORE_BYTE (L);
3441                   size = ((M - 128) * 128) + (L - 128);
3442                   if (charbuf + 8 + size > charbuf_end)
3443                     goto break_loop;
3444                   *charbuf++ = ISO_CODE_ESC;
3445                   *charbuf++ = '%';
3446                   *charbuf++ = '/';
3447                   *charbuf++ = dim;
3448                   *charbuf++ = BYTE8_TO_CHAR (M);
3449                   *charbuf++ = BYTE8_TO_CHAR (L);
3450                   while (size-- > 0)
3451                     {
3452                       ONE_MORE_BYTE (c1);
3453                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3454                     }
3455                 }
3456               else if (c1 == 'G')
3457                 {
3458                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3459                      ESC % G --UTF-8-BYTES-- ESC % @
3460                      We keep these bytes as is for the moment.
3461                      They may be decoded by post-read-conversion.  */
3462                   int *p = charbuf;
3463
3464                   if (p + 6 > charbuf_end)
3465                     goto break_loop;
3466                   *p++ = ISO_CODE_ESC;
3467                   *p++ = '%';
3468                   *p++ = 'G';
3469                   while (p < charbuf_end)
3470                     {
3471                       ONE_MORE_BYTE (c1);
3472                       if (c1 == ISO_CODE_ESC
3473                           && src + 1 < src_end
3474                           && src[0] == '%'
3475                           && src[1] == '@')
3476                         {
3477                           src += 2;
3478                           break;
3479                         }
3480                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3481                     }
3482                   if (p + 3 > charbuf_end)
3483                     goto break_loop;
3484                   *p++ = ISO_CODE_ESC;
3485                   *p++ = '%';
3486                   *p++ = '@';
3487                   charbuf = p;
3488                 }
3489               else
3490                 goto invalid_code;
3491               continue;
3492               break;
3493
3494             default:
3495               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3496                 goto invalid_code;
3497               {
3498                 int reg, chars96;
3499
3500                 if (c1 >= 0x28 && c1 <= 0x2B)
3501                   { /* designation of DIMENSION1_CHARS94 character set */
3502                     reg = c1 - 0x28, chars96 = 0;
3503                     ONE_MORE_BYTE (c1);
3504                   }
3505                 else if (c1 >= 0x2C && c1 <= 0x2F)
3506                   { /* designation of DIMENSION1_CHARS96 character set */
3507                     reg = c1 - 0x2C, chars96 = 1;
3508                     ONE_MORE_BYTE (c1);
3509                   }
3510                 else
3511                   goto invalid_code;
3512                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3513                 /* We must update these variables now.  */
3514                 if (reg == 0)
3515                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3516                 else if (reg == 1)
3517                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3518                 if (chars96 < 0)
3519                   goto invalid_code;
3520               }
3521               continue;
3522             }
3523         }
3524
3525       if (charset->id != charset_ascii
3526           && last_id != charset->id)
3527         {
3528           if (last_id != charset_ascii)
3529             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3530           last_id = charset->id;
3531           last_offset = char_offset;
3532         }
3533
3534       /* Now we know CHARSET and 1st position code C1 of a character.
3535          Produce a decoded character while getting 2nd position code
3536          C2 if necessary.  */
3537       c1 &= 0x7F;
3538       if (CHARSET_DIMENSION (charset) > 1)
3539         {
3540           ONE_MORE_BYTE (c2);
3541           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3542             /* C2 is not in a valid range.  */
3543             goto invalid_code;
3544           c1 = (c1 << 8) | (c2 & 0x7F);
3545           if (CHARSET_DIMENSION (charset) > 2)
3546             {
3547               ONE_MORE_BYTE (c2);
3548               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3549                 /* C2 is not in a valid range.  */
3550                 goto invalid_code;
3551               c1 = (c1 << 8) | (c2 & 0x7F);
3552             }
3553         }
3554
3555       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3556       if (c < 0)
3557         {
3558           MAYBE_FINISH_COMPOSITION ();
3559           for (; src_base < src; src_base++, char_offset++)
3560             {
3561               if (ASCII_BYTE_P (*src_base))
3562                 *charbuf++ = *src_base;
3563               else
3564                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3565             }
3566         }
3567       else if (composition_state == COMPOSING_NO)
3568         {
3569           *charbuf++ = c;
3570           char_offset++;
3571         }
3572       else
3573         {
3574           components[component_idx++] = c;
3575           if (method == COMPOSITION_WITH_RULE
3576               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3577                   && composition_state == COMPOSING_COMPONENT_CHAR))
3578             composition_state++;
3579         }
3580       continue;
3581
3582     invalid_code:
3583       MAYBE_FINISH_COMPOSITION ();
3584       src = src_base;
3585       consumed_chars = consumed_chars_base;
3586       ONE_MORE_BYTE (c);
3587       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3588       char_offset++;
3589       coding->errors++;
3590       continue;
3591
3592     break_loop:
3593       break;
3594     }
3595
3596  no_more_source:
3597   if (last_id != charset_ascii)
3598     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3599   coding->consumed_char += consumed_chars_base;
3600   coding->consumed = src_base - coding->source;
3601   coding->charbuf_used = charbuf - coding->charbuf;
3602 }
3603
3604
3605 /* ISO2022 encoding stuff.  */
3606
3607 /*
3608    It is not enough to say just "ISO2022" on encoding, we have to
3609    specify more details.  In Emacs, each coding system of ISO2022
3610    variant has the following specifications:
3611         1. Initial designation to G0 thru G3.
3612         2. Allows short-form designation?
3613         3. ASCII should be designated to G0 before control characters?
3614         4. ASCII should be designated to G0 at end of line?
3615         5. 7-bit environment or 8-bit environment?
3616         6. Use locking-shift?
3617         7. Use Single-shift?
3618    And the following two are only for Japanese:
3619         8. Use ASCII in place of JIS0201-1976-Roman?
3620         9. Use JISX0208-1983 in place of JISX0208-1978?
3621    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3622    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3623    details.
3624 */
3625
/* Emit the escape sequence that designates CHARSET to graphic
   register REG, and record the designation in CODING.  The bytes
   produced are

     [ESC '&' <revision>] ESC ['$'] [<intermediate>] <final-char>

   where '$' is present for charsets of dimension greater than one and
   <intermediate> is taken from "()*+" (94-char sets) or ",-./"
   (96-char sets) according to REG.  The intermediate byte is omitted
   (short form) only for a multi-dimensional 94-char set designated to
   register 0 whose <final-char> is '@', 'A', or 'B', and only when
   CODING does not have CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char iso_final = CHARSET_ISO_FINAL (charset);              \
    int is_96 = CHARSET_ISO_CHARS_96 (charset) ? 1 : 0;                 \
    const char *intermediates = is_96 ? ",-./" : "()*+";                \
    int iso_revision                                                    \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (iso_revision >= 0)                                              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_revision);                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) == 1)                               \
      {                                                                 \
        int intermediate = intermediates[reg];                          \
                                                                        \
        EMIT_ONE_ASCII_BYTE (intermediate);                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int short_form                                                  \
          = (! is_96                                                    \
             && ! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_LONG_FORM)                         \
             && (reg) == 0                                              \
             && iso_final >= '@' && iso_final <= 'B');                  \
                                                                        \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (! short_form)                                               \
          {                                                             \
            EMIT_ONE_ASCII_BYTE (intermediates[reg]);                   \
          }                                                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (iso_final);                                    \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3673
3674
3675 /* The following two macros produce codes (control character or escape
3676    sequence) for ISO2022 single-shift functions (single-shift-2 and
3677    single-shift-3).  */
3678
/* Emit single-shift-2: the single byte ISO_CODE_SS2, or ESC 'N' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set.  Then mark CODING as
   being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3687
3688
/* Emit single-shift-3: the single byte ISO_CODE_SS3, or ESC 'O' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set.  Then mark CODING as
   being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3697
3698
3699 /* The following four macros produce codes (control character or
3700    escape sequence) for ISO2022 locking-shift functions (shift-in,
3701    shift-out, locking-shift-2, and locking-shift-3).  */
3702
/* Shift-in: emit ISO_CODE_SI and record 0 as the invocation for
   graphic plane 0 of `coding'.  */
#define ENCODE_SHIFT_IN                                 \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                \
      CODING_ISO_INVOCATION (coding, 0) = 0;            \
    }                                                   \
  while (0)
3708
3709
/* Shift-out: emit ISO_CODE_SO and record 1 as the invocation for
   graphic plane 0 of `coding'.  */
#define ENCODE_SHIFT_OUT                                \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                \
      CODING_ISO_INVOCATION (coding, 0) = 1;            \
    }                                                   \
  while (0)
3715
3716
/* Locking-shift-2: emit ESC 'n' and record 2 as the invocation for
   graphic plane 0 of `coding'.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
3722
3723
/* Locking-shift-3: emit ESC 'o' and record 3 as the invocation for
   graphic plane 0 of `coding'.  The final byte must be 'o', which is
   what the decoder in this file accepts for locking-shift-3; ESC 'n'
   is locking-shift-2.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3729
3730
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when the coding system has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the JISX0201-Roman charset.  C1 may be any integer expression;
   it is parenthesized at every use.  The macro uses the variables
   `coding', `dst' and `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift function was just emitted; it covers only    \
           this one character, so leave that state afterwards.  */     \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3773
3774
/* Emit the two bytes of a DIMENSION2 character that belongs to
   CHARSET and has the position-codes C1 and C2, preceded by whatever
   designation and invocation codes are still needed.

   CHARSET must be a modifiable lvalue: with CODING_ISO_FLAG_USE_OLDJIS
   set, JISX0208 is replaced by JISX0208-1978.  The macro uses the
   variables `coding', `dst' and `produced_chars' of the expansion
   site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* The preceding single-shift applies to this character        \
           only.  */                                                    \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Designate  \
       and/or invoke it, then go round the loop again to emit the      \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3817
3818
/* Encode character C as a member of CHARSET.  The code point of C in
   CHARSET is obtained with ENCODE_CHAR.  For a one-dimensional
   charset it is emitted as a single position-code; otherwise its
   high and low bytes (bits 8 and up, and the low 8 bits) are emitted
   as two position-codes.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset), (c));                               \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
  } while (0)
3828
3829
3830 /* Produce designation and invocation codes at a place pointed by DST
3831    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3832    Return new DST.  */
3833
3834 unsigned char *
3835 encode_invocation_designation (charset, coding, dst, p_nchars)
3836      struct charset *charset;
3837      struct coding_system *coding;
3838      unsigned char *dst;
3839      int *p_nchars;
3840 {
3841   int multibytep = coding->dst_multibyte;
3842   int produced_chars = *p_nchars;
3843   int reg;                      /* graphic register number */
3844   int id = CHARSET_ID (charset);
3845
3846   /* At first, check designations.  */
3847   for (reg = 0; reg < 4; reg++)
3848     if (id == CODING_ISO_DESIGNATION (coding, reg))
3849       break;
3850
3851   if (reg >= 4)
3852     {
3853       /* CHARSET is not yet designated to any graphic registers.  */
3854       /* At first check the requested designation.  */
3855       reg = CODING_ISO_REQUEST (coding, id);
3856       if (reg < 0)
3857         /* Since CHARSET requests no special designation, designate it
3858            to graphic register 0.  */
3859         reg = 0;
3860
3861       ENCODE_DESIGNATION (charset, reg, coding);
3862     }
3863
3864   if (CODING_ISO_INVOCATION (coding, 0) != reg
3865       && CODING_ISO_INVOCATION (coding, 1) != reg)
3866     {
3867       /* Since the graphic register REG is not invoked to any graphic
3868          planes, invoke it to graphic plane 0.  */
3869       switch (reg)
3870         {
3871         case 0:                 /* graphic register 0 */
3872           ENCODE_SHIFT_IN;
3873           break;
3874
3875         case 1:                 /* graphic register 1 */
3876           ENCODE_SHIFT_OUT;
3877           break;
3878
3879         case 2:                 /* graphic register 2 */
3880           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3881             ENCODE_SINGLE_SHIFT_2;
3882           else
3883             ENCODE_LOCKING_SHIFT_2;
3884           break;
3885
3886         case 3:                 /* graphic register 3 */
3887           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3888             ENCODE_SINGLE_SHIFT_3;
3889           else
3890             ENCODE_LOCKING_SHIFT_3;
3891           break;
3892         }
3893     }
3894
3895   *p_nchars = produced_chars;
3896   return dst;
3897 }
3898
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: ESC '[' when the coding system
   is restricted to seven bits, the single byte ISO_CODE_CSI
   otherwise.  CODING_ISO_FLAGS is a set of flag bits, so the
   seven-bit flag is tested with `&' as the single-shift macros do;
   comparing the whole word with `==' would miss it whenever any
   other flag is set too.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3908
3909
/* Emit the control sequence that switches the text direction to
   right-to-left: a control sequence introducer followed by "2]".
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is invoked without an argument list; a trailing `(dst)' would be
   left behind its `do ... while (0)' expansion and not compile.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3915
3916
/* Emit the control sequence that switches the text direction to
   left-to-right: a control sequence introducer followed by "0]".
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is invoked without an argument list; a trailing `(dst)' would be
   left behind its `do ... while (0)' expansion and not compile.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3922
3923
/* Bring the graphic planes and registers back to their initial
   state: emit a shift-in if graphic plane 0 does not have register 0
   invoked, then re-designate every register that has an initial
   charset (a non-negative CODING_ISO_INITIAL) but currently holds a
   different one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
    struct charset *charset;                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial = CODING_ISO_INITIAL (coding, reg);                 \
                                                                        \
        if (initial < 0                                                 \
            || CODING_ISO_DESIGNATION (coding, reg) == initial)         \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial);                            \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
3942
3943
3944 /* Produce designation sequences of charsets in the line started from
3945    SRC to a place pointed by DST, and return updated DST.
3946
3947    If the current block ends before any end-of-line, we may fail to
3948    find all the necessary designations.  */
3949
3950 static unsigned char *
3951 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3952      struct coding_system *coding;
3953      int *charbuf, *charbuf_end;
3954      unsigned char *dst;
3955 {
3956   struct charset *charset;
3957   /* Table of charsets to be designated to each graphic register.  */
3958   int r[4];
3959   int c, found = 0, reg;
3960   int produced_chars = 0;
3961   int multibytep = coding->dst_multibyte;
3962   Lisp_Object attrs;
3963   Lisp_Object charset_list;
3964
3965   attrs = CODING_ID_ATTRS (coding->id);
3966   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3967   if (EQ (charset_list, Qiso_2022))
3968     charset_list = Viso_2022_charset_list;
3969
3970   for (reg = 0; reg < 4; reg++)
3971     r[reg] = -1;
3972
3973   while (found < 4)
3974     {
3975       int id;
3976
3977       c = *charbuf++;
3978       if (c == '\n')
3979         break;
3980       charset = char_charset (c, charset_list, NULL);
3981       id = CHARSET_ID (charset);
3982       reg = CODING_ISO_REQUEST (coding, id);
3983       if (reg >= 0 && r[reg] < 0)
3984         {
3985           found++;
3986           r[reg] = id;
3987         }
3988     }
3989
3990   if (found)
3991     {
3992       for (reg = 0; reg < 4; reg++)
3993         if (r[reg] >= 0
3994             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3995           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3996     }
3997
3998   return dst;
3999 }
4000
4001 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4002
4003 static int
4004 encode_coding_iso_2022 (coding)
4005      struct coding_system *coding;
4006 {
4007   int multibytep = coding->dst_multibyte;
4008   int *charbuf = coding->charbuf;
4009   int *charbuf_end = charbuf + coding->charbuf_used;
4010   unsigned char *dst = coding->destination + coding->produced;
4011   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4012   int safe_room = 16;
4013   int bol_designation
4014     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4015        && CODING_ISO_BOL (coding));
4016   int produced_chars = 0;
4017   Lisp_Object attrs, eol_type, charset_list;
4018   int ascii_compatible;
4019   int c;
4020   int preferred_charset_id = -1;
4021
4022   CODING_GET_INFO (coding, attrs, charset_list);
4023   eol_type = CODING_ID_EOL_TYPE (coding->id);
4024   if (VECTORP (eol_type))
4025     eol_type = Qunix;
4026
4027   setup_iso_safe_charsets (attrs);
4028   /* Charset list may have been changed.  */
4029   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4030   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
4031
4032   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4033
4034   while (charbuf < charbuf_end)
4035     {
4036       ASSURE_DESTINATION (safe_room);
4037
4038       if (bol_designation)
4039         {
4040           unsigned char *dst_prev = dst;
4041
4042           /* We have to produce designation sequences if any now.  */
4043           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4044           bol_designation = 0;
4045           /* We are sure that designation sequences are all ASCII bytes.  */
4046           produced_chars += dst - dst_prev;
4047         }
4048
4049       c = *charbuf++;
4050
4051       if (c < 0)
4052         {
4053           /* Handle an annotation.  */
4054           switch (*charbuf)
4055             {
4056             case CODING_ANNOTATE_COMPOSITION_MASK:
4057               /* Not yet implemented.  */
4058               break;
4059             case CODING_ANNOTATE_CHARSET_MASK:
4060               preferred_charset_id = charbuf[2];
4061               if (preferred_charset_id >= 0
4062                   && NILP (Fmemq (make_number (preferred_charset_id),
4063                                   charset_list)))
4064                 preferred_charset_id = -1;
4065               break;
4066             default:
4067               abort ();
4068             }
4069           charbuf += -c - 1;
4070           continue;
4071         }
4072
4073       /* Now encode the character C.  */
4074       if (c < 0x20 || c == 0x7F)
4075         {
4076           if (c == '\n'
4077               || (c == '\r' && EQ (eol_type, Qmac)))
4078             {
4079               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4080                 ENCODE_RESET_PLANE_AND_REGISTER ();
4081               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4082                 {
4083                   int i;
4084
4085                   for (i = 0; i < 4; i++)
4086                     CODING_ISO_DESIGNATION (coding, i)
4087                       = CODING_ISO_INITIAL (coding, i);
4088                 }
4089               bol_designation
4090                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4091             }
4092           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4093             ENCODE_RESET_PLANE_AND_REGISTER ();
4094           EMIT_ONE_ASCII_BYTE (c);
4095         }
4096       else if (ASCII_CHAR_P (c))
4097         {
4098           if (ascii_compatible)
4099             EMIT_ONE_ASCII_BYTE (c);
4100           else
4101             {
4102               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4103               ENCODE_ISO_CHARACTER (charset, c);
4104             }
4105         }
4106       else if (CHAR_BYTE8_P (c))
4107         {
4108           c = CHAR_TO_BYTE8 (c);
4109           EMIT_ONE_BYTE (c);
4110         }
4111       else
4112         {
4113           struct charset *charset;
4114
4115           if (preferred_charset_id >= 0)
4116             {
4117               charset = CHARSET_FROM_ID (preferred_charset_id);
4118               if (! CHAR_CHARSET_P (c, charset))
4119                 charset = char_charset (c, charset_list, NULL);
4120             }
4121           else
4122             charset = char_charset (c, charset_list, NULL);
4123           if (!charset)
4124             {
4125               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4126                 {
4127                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4128                   charset = CHARSET_FROM_ID (charset_ascii);
4129                 }
4130               else
4131                 {
4132                   c = coding->default_char;
4133                   charset = char_charset (c, charset_list, NULL);
4134                 }
4135             }
4136           ENCODE_ISO_CHARACTER (charset, c);
4137         }
4138     }
4139
4140   if (coding->mode & CODING_MODE_LAST_BLOCK
4141       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4142     {
4143       ASSURE_DESTINATION (safe_room);
4144       ENCODE_RESET_PLANE_AND_REGISTER ();
4145     }
4146   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4147   CODING_ISO_BOL (coding) = bol_designation;
4148   coding->produced_char += produced_chars;
4149   coding->produced = dst - coding->destination;
4150   return 0;
4151 }
4152
4153 \f
/*** 8. Shift-JIS and BIG5 handlers ***/
4155
4156 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4157    quite widely.  So, for the moment, Emacs supports them in the bare
4158    C code.  But, in the future, they may be supported only by CCL.  */
4159
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.

   --- CODE RANGE of SJIS ---
   (character set)      (range)
   ASCII                0x00 .. 0x7F
   KATAKANA-JISX0201    0xA1 .. 0xDF
   JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
            (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
   -------------------------------

*/
4176
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.
4180
4181    --- CODE RANGE of BIG5 ---
4182    (character set)      (range)
4183    ASCII                0x00 .. 0x7F
4184    Big5 (1st byte)      0xA1 .. 0xFE
4185         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4186    --------------------------
4187
4188   */
4189
4190 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4191    Check if a text is encoded in SJIS.  If it is, return
4192    CATEGORY_MASK_SJIS, else return 0.  */
4193
4194 static int
4195 detect_coding_sjis (coding, detect_info)
4196      struct coding_system *coding;
4197      struct coding_detection_info *detect_info;
4198 {
4199   const unsigned char *src = coding->source, *src_base;
4200   const unsigned char *src_end = coding->source + coding->src_bytes;
4201   int multibytep = coding->src_multibyte;
4202   int consumed_chars = 0;
4203   int found = 0;
4204   int c;
4205
4206   detect_info->checked |= CATEGORY_MASK_SJIS;
4207   /* A coding system of this category is always ASCII compatible.  */
4208   src += coding->head_ascii;
4209
4210   while (1)
4211     {
4212       src_base = src;
4213       ONE_MORE_BYTE (c);
4214       if (c < 0x80)
4215         continue;
4216       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
4217         {
4218           ONE_MORE_BYTE (c);
4219           if (c < 0x40 || c == 0x7F || c > 0xFC)
4220             break;
4221           found = CATEGORY_MASK_SJIS;
4222         }
4223       else if (c >= 0xA0 && c < 0xE0)
4224         found = CATEGORY_MASK_SJIS;
4225       else
4226         break;
4227     }
4228   detect_info->rejected |= CATEGORY_MASK_SJIS;
4229   return 0;
4230
4231  no_more_source:
4232   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4233     {
4234       detect_info->rejected |= CATEGORY_MASK_SJIS;
4235       return 0;
4236     }
4237   detect_info->found |= found;
4238   return 1;
4239 }
4240
4241 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4242    Check if a text is encoded in BIG5.  If it is, return
4243    CATEGORY_MASK_BIG5, else return 0.  */
4244
4245 static int
4246 detect_coding_big5 (coding, detect_info)
4247      struct coding_system *coding;
4248      struct coding_detection_info *detect_info;
4249 {
4250   const unsigned char *src = coding->source, *src_base;
4251   const unsigned char *src_end = coding->source + coding->src_bytes;
4252   int multibytep = coding->src_multibyte;
4253   int consumed_chars = 0;
4254   int found = 0;
4255   int c;
4256
4257   detect_info->checked |= CATEGORY_MASK_BIG5;
4258   /* A coding system of this category is always ASCII compatible.  */
4259   src += coding->head_ascii;
4260
4261   while (1)
4262     {
4263       src_base = src;
4264       ONE_MORE_BYTE (c);
4265       if (c < 0x80)
4266         continue;
4267       if (c >= 0xA1)
4268         {
4269           ONE_MORE_BYTE (c);
4270           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4271             return 0;
4272           found = CATEGORY_MASK_BIG5;
4273         }
4274       else
4275         break;
4276     }
4277   detect_info->rejected |= CATEGORY_MASK_BIG5;
4278   return 0;
4279
4280  no_more_source:
4281   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4282     {
4283       detect_info->rejected |= CATEGORY_MASK_BIG5;
4284       return 0;
4285     }
4286   detect_info->found |= found;
4287   return 1;
4288 }
4289
4290 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4291    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4292
4293 static void
4294 decode_coding_sjis (coding)
4295      struct coding_system *coding;
4296 {
4297   const unsigned char *src = coding->source + coding->consumed;
4298   const unsigned char *src_end = coding->source + coding->src_bytes;
4299   const unsigned char *src_base;
4300   int *charbuf = coding->charbuf + coding->charbuf_used;
4301   int *charbuf_end
4302     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4303   int consumed_chars = 0, consumed_chars_base;
4304   int multibytep = coding->src_multibyte;
4305   struct charset *charset_roman, *charset_kanji, *charset_kana;
4306   struct charset *charset_kanji2;
4307   Lisp_Object attrs, charset_list, val;
4308   int char_offset = coding->produced_char;
4309   int last_offset = char_offset;
4310   int last_id = charset_ascii;
4311   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4312   int byte_after_cr = -1;
4313
4314   CODING_GET_INFO (coding, attrs, charset_list);
4315
4316   val = charset_list;
4317   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4318   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4319   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4320   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4321
4322   while (1)
4323     {
4324       int c, c1;
4325       struct charset *charset;
4326
4327       src_base = src;
4328       consumed_chars_base = consumed_chars;
4329
4330       if (charbuf >= charbuf_end)
4331         break;
4332
4333       if (byte_after_cr >= 0)
4334         c = byte_after_cr, byte_after_cr = -1;
4335       else
4336         ONE_MORE_BYTE (c);
4337       if (c < 0)
4338         goto invalid_code;
4339       if (c < 0x80)
4340         {
4341           if (eol_crlf && c == '\r')
4342             ONE_MORE_BYTE (byte_after_cr);
4343           charset = charset_roman;
4344         }
4345       else if (c == 0x80 || c == 0xA0)
4346         goto invalid_code;
4347       else if (c >= 0xA1 && c <= 0xDF)
4348         {
4349           /* SJIS -> JISX0201-Kana */
4350           c &= 0x7F;
4351           charset = charset_kana;
4352         }
4353       else if (c <= 0xEF)
4354         {
4355           /* SJIS -> JISX0208 */
4356           ONE_MORE_BYTE (c1);
4357           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4358             goto invalid_code;
4359           c = (c << 8) | c1;
4360           SJIS_TO_JIS (c);
4361           charset = charset_kanji;
4362         }
4363       else if (c <= 0xFC && charset_kanji2)
4364         {
4365           /* SJIS -> JISX0213-2 */
4366           ONE_MORE_BYTE (c1);
4367           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4368             goto invalid_code;
4369           c = (c << 8) | c1;
4370           SJIS_TO_JIS2 (c);
4371           charset = charset_kanji2;
4372         }
4373       else
4374         goto invalid_code;
4375       if (charset->id != charset_ascii
4376           && last_id != charset->id)
4377         {
4378           if (last_id != charset_ascii)
4379             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4380           last_id = charset->id;
4381           last_offset = char_offset;
4382         }
4383       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4384       *charbuf++ = c;
4385       char_offset++;
4386       continue;
4387
4388     invalid_code:
4389       src = src_base;
4390       consumed_chars = consumed_chars_base;
4391       ONE_MORE_BYTE (c);
4392       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4393       char_offset++;
4394       coding->errors++;
4395     }
4396
4397  no_more_source:
4398   if (last_id != charset_ascii)
4399     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4400   coding->consumed_char += consumed_chars_base;
4401   coding->consumed = src_base - coding->source;
4402   coding->charbuf_used = charbuf - coding->charbuf;
4403 }
4404
4405 static void
4406 decode_coding_big5 (coding)
4407      struct coding_system *coding;
4408 {
4409   const unsigned char *src = coding->source + coding->consumed;
4410   const unsigned char *src_end = coding->source + coding->src_bytes;
4411   const unsigned char *src_base;
4412   int *charbuf = coding->charbuf + coding->charbuf_used;
4413   int *charbuf_end
4414     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4415   int consumed_chars = 0, consumed_chars_base;
4416   int multibytep = coding->src_multibyte;
4417   struct charset *charset_roman, *charset_big5;
4418   Lisp_Object attrs, charset_list, val;
4419   int char_offset = coding->produced_char;
4420   int last_offset = char_offset;
4421   int last_id = charset_ascii;
4422   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4423   int byte_after_cr = -1;
4424
4425   CODING_GET_INFO (coding, attrs, charset_list);
4426   val = charset_list;
4427   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4428   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4429
4430   while (1)
4431     {
4432       int c, c1;
4433       struct charset *charset;
4434
4435       src_base = src;
4436       consumed_chars_base = consumed_chars;
4437
4438       if (charbuf >= charbuf_end)
4439         break;
4440
4441       if (byte_after_cr >= 0)
4442         c = byte_after_cr, byte_after_cr = -1;
4443       else
4444         ONE_MORE_BYTE (c);
4445
4446       if (c < 0)
4447         goto invalid_code;
4448       if (c < 0x80)
4449         {
4450           if (eol_crlf && c == '\r')
4451             ONE_MORE_BYTE (byte_after_cr);
4452           charset = charset_roman;
4453         }
4454       else
4455         {
4456           /* BIG5 -> Big5 */
4457           if (c < 0xA1 || c > 0xFE)
4458             goto invalid_code;
4459           ONE_MORE_BYTE (c1);
4460           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4461             goto invalid_code;
4462           c = c << 8 | c1;
4463           charset = charset_big5;
4464         }
4465       if (charset->id != charset_ascii
4466           && last_id != charset->id)
4467         {
4468           if (last_id != charset_ascii)
4469             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4470           last_id = charset->id;
4471           last_offset = char_offset;
4472         }
4473       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4474       *charbuf++ = c;
4475       char_offset++;
4476       continue;
4477
4478     invalid_code:
4479       src = src_base;
4480       consumed_chars = consumed_chars_base;
4481       ONE_MORE_BYTE (c);
4482       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4483       char_offset++;
4484       coding->errors++;
4485     }
4486
4487  no_more_source:
4488   if (last_id != charset_ascii)
4489     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4490   coding->consumed_char += consumed_chars_base;
4491   coding->consumed = src_base - coding->source;
4492   coding->charbuf_used = charbuf - coding->charbuf;
4493 }
4494
4495 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4496    This function can encode charsets `ascii', `katakana-jisx0201',
4497    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4498    are sure that all these charsets are registered as official charset
4499    (i.e. do not have extended leading-codes).  Characters of other
4500    charsets are produced without any encoding.  If SJIS_P is 1, encode
4501    SJIS text, else encode BIG5 text.  */
4502
4503 static int
4504 encode_coding_sjis (coding)
4505      struct coding_system *coding;
4506 {
4507   int multibytep = coding->dst_multibyte;
4508   int *charbuf = coding->charbuf;
4509   int *charbuf_end = charbuf + coding->charbuf_used;
4510   unsigned char *dst = coding->destination + coding->produced;
4511   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4512   int safe_room = 4;
4513   int produced_chars = 0;
4514   Lisp_Object attrs, charset_list, val;
4515   int ascii_compatible;
4516   struct charset *charset_roman, *charset_kanji, *charset_kana;
4517   struct charset *charset_kanji2;
4518   int c;
4519
4520   CODING_GET_INFO (coding, attrs, charset_list);
4521   val = charset_list;
4522   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4523   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4524   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4525   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4526
4527   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4528
4529   while (charbuf < charbuf_end)
4530     {
4531       ASSURE_DESTINATION (safe_room);
4532       c = *charbuf++;
4533       /* Now encode the character C.  */
4534       if (ASCII_CHAR_P (c) && ascii_compatible)
4535         EMIT_ONE_ASCII_BYTE (c);
4536       else if (CHAR_BYTE8_P (c))
4537         {
4538           c = CHAR_TO_BYTE8 (c);
4539           EMIT_ONE_BYTE (c);
4540         }
4541       else
4542         {
4543           unsigned code;
4544           struct charset *charset = char_charset (c, charset_list, &code);
4545
4546           if (!charset)
4547             {
4548               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4549                 {
4550                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4551                   charset = CHARSET_FROM_ID (charset_ascii);
4552                 }
4553               else
4554                 {
4555                   c = coding->default_char;
4556                   charset = char_charset (c, charset_list, &code);
4557                 }
4558             }
4559           if (code == CHARSET_INVALID_CODE (charset))
4560             abort ();
4561           if (charset == charset_kanji)
4562             {
4563               int c1, c2;
4564               JIS_TO_SJIS (code);
4565               c1 = code >> 8, c2 = code & 0xFF;
4566               EMIT_TWO_BYTES (c1, c2);
4567             }
4568           else if (charset == charset_kana)
4569             EMIT_ONE_BYTE (code | 0x80);
4570           else if (charset_kanji2 && charset == charset_kanji2)
4571             {
4572               int c1, c2;
4573
4574               c1 = code >> 8;
4575               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4576                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4577                 {
4578                   JIS_TO_SJIS2 (code);
4579                   c1 = code >> 8, c2 = code & 0xFF;
4580                   EMIT_TWO_BYTES (c1, c2);
4581                 }
4582               else
4583                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4584             }
4585           else
4586             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4587         }
4588     }
4589   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4590   coding->produced_char += produced_chars;
4591   coding->produced = dst - coding->destination;
4592   return 0;
4593 }
4594
4595 static int
4596 encode_coding_big5 (coding)
4597      struct coding_system *coding;
4598 {
4599   int multibytep = coding->dst_multibyte;
4600   int *charbuf = coding->charbuf;
4601   int *charbuf_end = charbuf + coding->charbuf_used;
4602   unsigned char *dst = coding->destination + coding->produced;
4603   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4604   int safe_room = 4;
4605   int produced_chars = 0;
4606   Lisp_Object attrs, charset_list, val;
4607   int ascii_compatible;
4608   struct charset *charset_roman, *charset_big5;
4609   int c;
4610
4611   CODING_GET_INFO (coding, attrs, charset_list);
4612   val = charset_list;
4613   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4614   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4615   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4616
4617   while (charbuf < charbuf_end)
4618     {
4619       ASSURE_DESTINATION (safe_room);
4620       c = *charbuf++;
4621       /* Now encode the character C.  */
4622       if (ASCII_CHAR_P (c) && ascii_compatible)
4623         EMIT_ONE_ASCII_BYTE (c);
4624       else if (CHAR_BYTE8_P (c))
4625         {
4626           c = CHAR_TO_BYTE8 (c);
4627           EMIT_ONE_BYTE (c);
4628         }
4629       else
4630         {
4631           unsigned code;
4632           struct charset *charset = char_charset (c, charset_list, &code);
4633
4634           if (! charset)
4635             {
4636               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4637                 {
4638                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4639                   charset = CHARSET_FROM_ID (charset_ascii);
4640                 }
4641               else
4642                 {
4643                   c = coding->default_char;
4644                   charset = char_charset (c, charset_list, &code);
4645                 }
4646             }
4647           if (code == CHARSET_INVALID_CODE (charset))
4648             abort ();
4649           if (charset == charset_big5)
4650             {
4651               int c1, c2;
4652
4653               c1 = code >> 8, c2 = code & 0xFF;
4654               EMIT_TWO_BYTES (c1, c2);
4655             }
4656           else
4657             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4658         }
4659     }
4660   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4661   coding->produced_char += produced_chars;
4662   coding->produced = dst - coding->destination;
4663   return 0;
4664 }
4665
4666 \f
/*** 9. CCL handlers ***/
4668
4669 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4670    Check if a text is encoded in a coding system of which
4671    encoder/decoder are written in CCL program.  If it is, return
4672    CATEGORY_MASK_CCL, else return 0.  */
4673
4674 static int
4675 detect_coding_ccl (coding, detect_info)
4676      struct coding_system *coding;
4677      struct coding_detection_info *detect_info;
4678 {
4679   const unsigned char *src = coding->source, *src_base;
4680   const unsigned char *src_end = coding->source + coding->src_bytes;
4681   int multibytep = coding->src_multibyte;
4682   int consumed_chars = 0;
4683   int found = 0;
4684   unsigned char *valids;
4685   int head_ascii = coding->head_ascii;
4686   Lisp_Object attrs;
4687
4688   detect_info->checked |= CATEGORY_MASK_CCL;
4689
4690   coding = &coding_categories[coding_category_ccl];
4691   valids = CODING_CCL_VALIDS (coding);
4692   attrs = CODING_ID_ATTRS (coding->id);
4693   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4694     src += head_ascii;
4695
4696   while (1)
4697     {
4698       int c;
4699
4700       src_base = src;
4701       ONE_MORE_BYTE (c);
4702       if (c < 0 || ! valids[c])
4703         break;
4704       if ((valids[c] > 1))
4705         found = CATEGORY_MASK_CCL;
4706     }
4707   detect_info->rejected |= CATEGORY_MASK_CCL;
4708   return 0;
4709
4710  no_more_source:
4711   detect_info->found |= found;
4712   return 1;
4713 }
4714
4715 static void
4716 decode_coding_ccl (coding)
4717      struct coding_system *coding;
4718 {
4719   const unsigned char *src = coding->source + coding->consumed;
4720   const unsigned char *src_end = coding->source + coding->src_bytes;
4721   int *charbuf = coding->charbuf + coding->charbuf_used;
4722   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4723   int consumed_chars = 0;
4724   int multibytep = coding->src_multibyte;
4725   struct ccl_program ccl;
4726   int source_charbuf[1024];
4727   int source_byteidx[1024];
4728   Lisp_Object attrs, charset_list;
4729
4730   CODING_GET_INFO (coding, attrs, charset_list);
4731   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4732
4733   while (src < src_end)
4734     {
4735       const unsigned char *p = src;
4736       int *source, *source_end;
4737       int i = 0;
4738
4739       if (multibytep)
4740         while (i < 1024 && p < src_end)
4741           {
4742             source_byteidx[i] = p - src;
4743             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4744           }
4745       else
4746         while (i < 1024 && p < src_end)
4747           source_charbuf[i++] = *p++;
4748
4749       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4750         ccl.last_block = 1;
4751
4752       source = source_charbuf;
4753       source_end = source + i;
4754       while (source < source_end)
4755         {
4756           ccl_driver (&ccl, source, charbuf,
4757                       source_end - source, charbuf_end - charbuf,
4758                       charset_list);
4759           source += ccl.consumed;
4760           charbuf += ccl.produced;
4761           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4762             break;
4763         }
4764       if (source < source_end)
4765         src += source_byteidx[source - source_charbuf];
4766       else
4767         src = p;
4768       consumed_chars += source - source_charbuf;
4769
4770       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4771           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4772         break;
4773     }
4774
4775   switch (ccl.status)
4776     {
4777     case CCL_STAT_SUSPEND_BY_SRC:
4778       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4779       break;
4780     case CCL_STAT_SUSPEND_BY_DST:
4781       break;
4782     case CCL_STAT_QUIT:
4783     case CCL_STAT_INVALID_CMD:
4784       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4785       break;
4786     default:
4787       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4788       break;
4789     }
4790   coding->consumed_char += consumed_chars;
4791   coding->consumed = src - coding->source;
4792   coding->charbuf_used = charbuf - coding->charbuf;
4793 }
4794
4795 static int
4796 encode_coding_ccl (coding)
4797      struct coding_system *coding;
4798 {
4799   struct ccl_program ccl;
4800   int multibytep = coding->dst_multibyte;
4801   int *charbuf = coding->charbuf;
4802   int *charbuf_end = charbuf + coding->charbuf_used;
4803   unsigned char *dst = coding->destination + coding->produced;
4804   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4805   int destination_charbuf[1024];
4806   int i, produced_chars = 0;
4807   Lisp_Object attrs, charset_list;
4808
4809   CODING_GET_INFO (coding, attrs, charset_list);
4810   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4811
4812   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4813   ccl.dst_multibyte = coding->dst_multibyte;
4814
4815   while (charbuf < charbuf_end)
4816     {
4817       ccl_driver (&ccl, charbuf, destination_charbuf,
4818                   charbuf_end - charbuf, 1024, charset_list);
4819       if (multibytep)
4820         {
4821           ASSURE_DESTINATION (ccl.produced * 2);
4822           for (i = 0; i < ccl.produced; i++)
4823             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4824         }
4825       else
4826         {
4827           ASSURE_DESTINATION (ccl.produced);
4828           for (i = 0; i < ccl.produced; i++)
4829             *dst++ = destination_charbuf[i] & 0xFF;
4830           produced_chars += ccl.produced;
4831         }
4832       charbuf += ccl.consumed;
4833       if (ccl.status == CCL_STAT_QUIT
4834           || ccl.status == CCL_STAT_INVALID_CMD)
4835         break;
4836     }
4837
4838   switch (ccl.status)
4839     {
4840     case CCL_STAT_SUSPEND_BY_SRC:
4841       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4842       break;
4843     case CCL_STAT_SUSPEND_BY_DST:
4844       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4845       break;
4846     case CCL_STAT_QUIT:
4847     case CCL_STAT_INVALID_CMD:
4848       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4849       break;
4850     default:
4851       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4852       break;
4853     }
4854
4855   coding->produced_char += produced_chars;
4856   coding->produced = dst - coding->destination;
4857   return 0;
4858 }
4859
4860
4861 \f
4862 /*** 10, 11. no-conversion handlers ***/
4863
4864 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4865
4866 static void
4867 decode_coding_raw_text (coding)
4868      struct coding_system *coding;
4869 {
4870   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4871
4872   coding->chars_at_source = 1;
4873   coding->consumed_char = coding->src_chars;
4874   coding->consumed = coding->src_bytes;
4875   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
4876     {
4877       coding->consumed_char--;
4878       coding->consumed--;
4879       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4880     }
4881   else
4882     record_conversion_result (coding, CODING_RESULT_SUCCESS);
4883 }
4884
4885 static int
4886 encode_coding_raw_text (coding)
4887      struct coding_system *coding;
4888 {
4889   int multibytep = coding->dst_multibyte;
4890   int *charbuf = coding->charbuf;
4891   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4892   unsigned char *dst = coding->destination + coding->produced;
4893   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4894   int produced_chars = 0;
4895   int c;
4896
4897   if (multibytep)
4898     {
4899       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4900
4901       if (coding->src_multibyte)
4902         while (charbuf < charbuf_end)
4903           {
4904             ASSURE_DESTINATION (safe_room);
4905             c = *charbuf++;
4906             if (ASCII_CHAR_P (c))
4907               EMIT_ONE_ASCII_BYTE (c);
4908             else if (CHAR_BYTE8_P (c))
4909               {
4910                 c = CHAR_TO_BYTE8 (c);
4911                 EMIT_ONE_BYTE (c);
4912               }
4913             else
4914               {
4915                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4916
4917                 CHAR_STRING_ADVANCE (c, p1);
4918                 while (p0 < p1)
4919                   {
4920                     EMIT_ONE_BYTE (*p0);
4921                     p0++;
4922                   }
4923               }
4924           }
4925       else
4926         while (charbuf < charbuf_end)
4927           {
4928             ASSURE_DESTINATION (safe_room);
4929             c = *charbuf++;
4930             EMIT_ONE_BYTE (c);
4931           }
4932     }
4933   else
4934     {
4935       if (coding->src_multibyte)
4936         {
4937           int safe_room = MAX_MULTIBYTE_LENGTH;
4938
4939           while (charbuf < charbuf_end)
4940             {
4941               ASSURE_DESTINATION (safe_room);
4942               c = *charbuf++;
4943               if (ASCII_CHAR_P (c))
4944                 *dst++ = c;
4945               else if (CHAR_BYTE8_P (c))
4946                 *dst++ = CHAR_TO_BYTE8 (c);
4947               else
4948                 CHAR_STRING_ADVANCE (c, dst);
4949             }
4950         }
4951       else
4952         {
4953           ASSURE_DESTINATION (charbuf_end - charbuf);
4954           while (charbuf < charbuf_end && dst < dst_end)
4955             *dst++ = *charbuf++;
4956         }
4957       produced_chars = dst - (coding->destination + coding->produced);
4958     }
4959   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4960   coding->produced_char += produced_chars;
4961   coding->produced = dst - coding->destination;
4962   return 0;
4963 }
4964
4965 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4966    Check if a text is encoded in a charset-based coding system.  If it
4967    is, return 1, else return 0.  */
4968
4969 static int
4970 detect_coding_charset (coding, detect_info)
4971      struct coding_system *coding;
4972      struct coding_detection_info *detect_info;
4973 {
4974   const unsigned char *src = coding->source, *src_base;
4975   const unsigned char *src_end = coding->source + coding->src_bytes;
4976   int multibytep = coding->src_multibyte;
4977   int consumed_chars = 0;
4978   Lisp_Object attrs, valids, name;
4979   int found = 0;
4980   int head_ascii = coding->head_ascii;
4981   int check_latin_extra = 0;
4982
4983   detect_info->checked |= CATEGORY_MASK_CHARSET;
4984
4985   coding = &coding_categories[coding_category_charset];
4986   attrs = CODING_ID_ATTRS (coding->id);
4987   valids = AREF (attrs, coding_attr_charset_valids);
4988   name = CODING_ID_NAME (coding->id);
4989   if (VECTORP (Vlatin_extra_code_table)
4990       && strcmp ((char *) SDATA (SYMBOL_NAME (name)), "iso-8859-"))
4991     check_latin_extra = 1;
4992   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4993     src += head_ascii;
4994
4995   while (1)
4996     {
4997       int c;
4998       Lisp_Object val;
4999       struct charset *charset;
5000       int dim, idx;
5001
5002       src_base = src;
5003       ONE_MORE_BYTE (c);
5004       if (c < 0)
5005         continue;
5006       val = AREF (valids, c);
5007       if (NILP (val))
5008         break;
5009       if (c >= 0x80)
5010         {
5011           if (c < 0xA0
5012               && check_latin_extra
5013               && NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
5014             break;
5015           found = CATEGORY_MASK_CHARSET;
5016         }
5017       if (INTEGERP (val))
5018         {
5019           charset = CHARSET_FROM_ID (XFASTINT (val));
5020           dim = CHARSET_DIMENSION (charset);
5021           for (idx = 1; idx < dim; idx++)
5022             {
5023               if (src == src_end)
5024                 goto too_short;
5025               ONE_MORE_BYTE (c);
5026               if (c < charset->code_space[(dim - 1 - idx) * 2]
5027                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5028                 break;
5029             }
5030           if (idx < dim)
5031             break;
5032         }
5033       else
5034         {
5035           idx = 1;
5036           for (; CONSP (val); val = XCDR (val))
5037             {
5038               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5039               dim = CHARSET_DIMENSION (charset);
5040               while (idx < dim)
5041                 {
5042                   if (src == src_end)
5043                     goto too_short;
5044                   ONE_MORE_BYTE (c);
5045                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5046                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5047                     break;
5048                   idx++;
5049                 }
5050               if (idx == dim)
5051                 {
5052                   val = Qnil;
5053                   break;
5054                 }
5055             }
5056           if (CONSP (val))
5057             break;
5058         }
5059     }
5060  too_short:
5061   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5062   return 0;
5063
5064  no_more_source:
5065   detect_info->found |= found;
5066   return 1;
5067 }
5068
5069 static void
5070 decode_coding_charset (coding)
5071      struct coding_system *coding;
5072 {
5073   const unsigned char *src = coding->source + coding->consumed;
5074   const unsigned char *src_end = coding->source + coding->src_bytes;
5075   const unsigned char *src_base;
5076   int *charbuf = coding->charbuf + coding->charbuf_used;
5077   int *charbuf_end
5078     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
5079   int consumed_chars = 0, consumed_chars_base;
5080   int multibytep = coding->src_multibyte;
5081   Lisp_Object attrs, charset_list, valids;
5082   int char_offset = coding->produced_char;
5083   int last_offset = char_offset;
5084   int last_id = charset_ascii;
5085   int eol_crlf = EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5086   int byte_after_cr = -1;
5087
5088   CODING_GET_INFO (coding, attrs, charset_list);
5089   valids = AREF (attrs, coding_attr_charset_valids);
5090
5091   while (1)
5092     {
5093       int c;
5094       Lisp_Object val;
5095       struct charset *charset;
5096       int dim;
5097       int len = 1;
5098       unsigned code;
5099
5100       src_base = src;
5101       consumed_chars_base = consumed_chars;
5102
5103       if (charbuf >= charbuf_end)
5104         break;
5105
5106       if (byte_after_cr >= 0)
5107         {
5108           c = byte_after_cr;
5109           byte_after_cr = -1;
5110         }
5111       else
5112         {
5113           ONE_MORE_BYTE (c);
5114           if (eol_crlf && c == '\r')
5115             ONE_MORE_BYTE (byte_after_cr);
5116         }
5117       if (c < 0)
5118         goto invalid_code;
5119       code = c;
5120
5121       val = AREF (valids, c);
5122       if (NILP (val))
5123         goto invalid_code;
5124       if (INTEGERP (val))
5125         {
5126           charset = CHARSET_FROM_ID (XFASTINT (val));
5127           dim = CHARSET_DIMENSION (charset);
5128           while (len < dim)
5129             {
5130               ONE_MORE_BYTE (c);
5131               code = (code << 8) | c;
5132               len++;
5133             }
5134           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5135                               charset, code, c);
5136         }
5137       else
5138         {
5139           /* VAL is a list of charset IDs.  It is assured that the
5140              list is sorted by charset dimensions (smaller one
5141              comes first).  */
5142           while (CONSP (val))
5143             {
5144               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5145               dim = CHARSET_DIMENSION (charset);
5146               while (len < dim)
5147                 {
5148                   ONE_MORE_BYTE (c);
5149                   code = (code << 8) | c;
5150                   len++;
5151                 }
5152               CODING_DECODE_CHAR (coding, src, src_base,
5153                                   src_end, charset, code, c);
5154               if (c >= 0)
5155                 break;
5156               val = XCDR (val);
5157             }
5158         }
5159       if (c < 0)
5160         goto invalid_code;
5161       if (charset->id != charset_ascii
5162           && last_id != charset->id)
5163         {
5164           if (last_id != charset_ascii)
5165             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5166           last_id = charset->id;
5167           last_offset = char_offset;
5168         }
5169
5170       *charbuf++ = c;
5171       char_offset++;
5172       continue;
5173
5174     invalid_code:
5175       src = src_base;
5176       consumed_chars = consumed_chars_base;
5177       ONE_MORE_BYTE (c);
5178       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5179       char_offset++;
5180       coding->errors++;
5181     }
5182
5183  no_more_source:
5184   if (last_id != charset_ascii)
5185     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5186   coding->consumed_char += consumed_chars_base;
5187   coding->consumed = src_base - coding->source;
5188   coding->charbuf_used = charbuf - coding->charbuf;
5189 }
5190
5191 static int
5192 encode_coding_charset (coding)
5193      struct coding_system *coding;
5194 {
5195   int multibytep = coding->dst_multibyte;
5196   int *charbuf = coding->charbuf;
5197   int *charbuf_end = charbuf + coding->charbuf_used;
5198   unsigned char *dst = coding->destination + coding->produced;
5199   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5200   int safe_room = MAX_MULTIBYTE_LENGTH;
5201   int produced_chars = 0;
5202   Lisp_Object attrs, charset_list;
5203   int ascii_compatible;
5204   int c;
5205
5206   CODING_GET_INFO (coding, attrs, charset_list);
5207   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5208
5209   while (charbuf < charbuf_end)
5210     {
5211       struct charset *charset;
5212       unsigned code;
5213
5214       ASSURE_DESTINATION (safe_room);
5215       c = *charbuf++;
5216       if (ascii_compatible && ASCII_CHAR_P (c))
5217         EMIT_ONE_ASCII_BYTE (c);
5218       else if (CHAR_BYTE8_P (c))
5219         {
5220           c = CHAR_TO_BYTE8 (c);
5221           EMIT_ONE_BYTE (c);
5222         }
5223       else
5224         {
5225           charset = char_charset (c, charset_list, &code);
5226           if (charset)
5227             {
5228               if (CHARSET_DIMENSION (charset) == 1)
5229                 EMIT_ONE_BYTE (code);
5230               else if (CHARSET_DIMENSION (charset) == 2)
5231                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5232               else if (CHARSET_DIMENSION (charset) == 3)
5233                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5234               else
5235                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5236                                  (code >> 8) & 0xFF, code & 0xFF);
5237             }
5238           else
5239             {
5240               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5241                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5242               else
5243                 c = coding->default_char;
5244               EMIT_ONE_BYTE (c);
5245             }
5246         }
5247     }
5248
5249   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5250   coding->produced_char += produced_chars;
5251   coding->produced = dst - coding->destination;
5252   return 0;
5253 }
5254
5255 \f
/*** 10. C library functions ***/
5257
5258 /* Setup coding context CODING from information about CODING_SYSTEM.
5259    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5260    CODING_SYSTEM is invalid, signal an error.  */
5261
5262 void
5263 setup_coding_system (coding_system, coding)
5264      Lisp_Object coding_system;
5265      struct coding_system *coding;
5266 {
5267   Lisp_Object attrs;
5268   Lisp_Object eol_type;
5269   Lisp_Object coding_type;
5270   Lisp_Object val;
5271
5272   if (NILP (coding_system))
5273     coding_system = Qundecided;
5274
5275   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5276
5277   attrs = CODING_ID_ATTRS (coding->id);
5278   eol_type = CODING_ID_EOL_TYPE (coding->id);
5279
5280   coding->mode = 0;
5281   coding->head_ascii = -1;
5282   if (VECTORP (eol_type))
5283     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5284                             | CODING_REQUIRE_DETECTION_MASK);
5285   else if (! EQ (eol_type, Qunix))
5286     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5287                             | CODING_REQUIRE_ENCODING_MASK);
5288   else
5289     coding->common_flags = 0;
5290   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5291     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5292   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5293     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5294   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5295     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5296
5297   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5298   coding->max_charset_id = SCHARS (val) - 1;
5299   coding->safe_charsets = (char *) SDATA (val);
5300   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5301
5302   coding_type = CODING_ATTR_TYPE (attrs);
5303   if (EQ (coding_type, Qundecided))
5304     {
5305       coding->detector = NULL;
5306       coding->decoder = decode_coding_raw_text;
5307       coding->encoder = encode_coding_raw_text;
5308       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5309     }
5310   else if (EQ (coding_type, Qiso_2022))
5311     {
5312       int i;
5313       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5314
5315       /* Invoke graphic register 0 to plane 0.  */
5316       CODING_ISO_INVOCATION (coding, 0) = 0;
5317       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5318       CODING_ISO_INVOCATION (coding, 1)
5319         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5320       /* Setup the initial status of designation.  */
5321       for (i = 0; i < 4; i++)
5322         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5323       /* Not single shifting initially.  */
5324       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5325       /* Beginning of buffer should also be regarded as bol. */
5326       CODING_ISO_BOL (coding) = 1;
5327       coding->detector = detect_coding_iso_2022;
5328       coding->decoder = decode_coding_iso_2022;
5329       coding->encoder = encode_coding_iso_2022;
5330       if (flags & CODING_ISO_FLAG_SAFE)
5331         coding->mode |= CODING_MODE_SAFE_ENCODING;
5332       coding->common_flags
5333         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5334             | CODING_REQUIRE_FLUSHING_MASK);
5335       if (flags & CODING_ISO_FLAG_COMPOSITION)
5336         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5337       if (flags & CODING_ISO_FLAG_DESIGNATION)
5338         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5339       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5340         {
5341           setup_iso_safe_charsets (attrs);
5342           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5343           coding->max_charset_id = SCHARS (val) - 1;
5344           coding->safe_charsets = (char *) SDATA (val);
5345         }
5346       CODING_ISO_FLAGS (coding) = flags;
5347     }
5348   else if (EQ (coding_type, Qcharset))
5349     {
5350       coding->detector = detect_coding_charset;
5351       coding->decoder = decode_coding_charset;
5352       coding->encoder = encode_coding_charset;
5353       coding->common_flags
5354         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5355     }
5356   else if (EQ (coding_type, Qutf_8))
5357     {
5358       val = AREF (attrs, coding_attr_utf_bom);
5359       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5360                                    : EQ (val, Qt) ? utf_with_bom
5361                                    : utf_without_bom);
5362       coding->detector = detect_coding_utf_8;
5363       coding->decoder = decode_coding_utf_8;
5364       coding->encoder = encode_coding_utf_8;
5365       coding->common_flags
5366         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5367       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5368         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5369     }
5370   else if (EQ (coding_type, Qutf_16))
5371     {
5372       val = AREF (attrs, coding_attr_utf_bom);
5373       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5374                                     : EQ (val, Qt) ? utf_with_bom
5375                                     : utf_without_bom);
5376       val = AREF (attrs, coding_attr_utf_16_endian);
5377       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5378                                        : utf_16_little_endian);
5379       CODING_UTF_16_SURROGATE (coding) = 0;
5380       coding->detector = detect_coding_utf_16;
5381       coding->decoder = decode_coding_utf_16;
5382       coding->encoder = encode_coding_utf_16;
5383       coding->common_flags
5384         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5385       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5386         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5387     }
5388   else if (EQ (coding_type, Qccl))
5389     {
5390       coding->detector = detect_coding_ccl;
5391       coding->decoder = decode_coding_ccl;
5392       coding->encoder = encode_coding_ccl;
5393       coding->common_flags
5394         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5395             | CODING_REQUIRE_FLUSHING_MASK);
5396     }
5397   else if (EQ (coding_type, Qemacs_mule))
5398     {
5399       coding->detector = detect_coding_emacs_mule;
5400       coding->decoder = decode_coding_emacs_mule;
5401       coding->encoder = encode_coding_emacs_mule;
5402       coding->common_flags
5403         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5404       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5405           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5406         {
5407           Lisp_Object tail, safe_charsets;
5408           int max_charset_id = 0;
5409
5410           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5411                tail = XCDR (tail))
5412             if (max_charset_id < XFASTINT (XCAR (tail)))
5413               max_charset_id = XFASTINT (XCAR (tail));
5414           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5415                                         make_number (255));
5416           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5417                tail = XCDR (tail))
5418             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5419           coding->max_charset_id = max_charset_id;
5420           coding->safe_charsets = (char *) SDATA (safe_charsets);
5421         }
5422     }
5423   else if (EQ (coding_type, Qshift_jis))
5424     {
5425       coding->detector = detect_coding_sjis;
5426       coding->decoder = decode_coding_sjis;
5427       coding->encoder = encode_coding_sjis;
5428       coding->common_flags
5429         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5430     }
5431   else if (EQ (coding_type, Qbig5))
5432     {
5433       coding->detector = detect_coding_big5;
5434       coding->decoder = decode_coding_big5;
5435       coding->encoder = encode_coding_big5;
5436       coding->common_flags
5437         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5438     }
5439   else                          /* EQ (coding_type, Qraw_text) */
5440     {
5441       coding->detector = NULL;
5442       coding->decoder = decode_coding_raw_text;
5443       coding->encoder = encode_coding_raw_text;
5444       if (! EQ (eol_type, Qunix))
5445         {
5446           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5447           if (! VECTORP (eol_type))
5448             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5449         }
5450
5451     }
5452
5453   return;
5454 }
5455
5456 /* Return a list of charsets supported by CODING.  */
5457
5458 Lisp_Object
5459 coding_charset_list (coding)
5460      struct coding_system *coding;
5461 {
5462   Lisp_Object attrs, charset_list;
5463
5464   CODING_GET_INFO (coding, attrs, charset_list);
5465   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5466     {
5467       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5468
5469       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5470         charset_list = Viso_2022_charset_list;
5471     }
5472   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5473     {
5474       charset_list = Vemacs_mule_charset_list;
5475     }
5476   return charset_list;
5477 }
5478
5479
5480 /* Return raw-text or one of its subsidiaries that has the same
5481    eol_type as CODING-SYSTEM.  */
5482
5483 Lisp_Object
5484 raw_text_coding_system (coding_system)
5485      Lisp_Object coding_system;
5486 {
5487   Lisp_Object spec, attrs;
5488   Lisp_Object eol_type, raw_text_eol_type;
5489
5490   if (NILP (coding_system))
5491     return Qraw_text;
5492   spec = CODING_SYSTEM_SPEC (coding_system);
5493   attrs = AREF (spec, 0);
5494
5495   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5496     return coding_system;
5497
5498   eol_type = AREF (spec, 2);
5499   if (VECTORP (eol_type))
5500     return Qraw_text;
5501   spec = CODING_SYSTEM_SPEC (Qraw_text);
5502   raw_text_eol_type = AREF (spec, 2);
5503   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5504           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5505           : AREF (raw_text_eol_type, 2));
5506 }
5507
5508
5509 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5510    does, return one of the subsidiary that has the same eol-spec as
5511    PARENT.  Otherwise, return CODING_SYSTEM.  If PARENT is nil,
5512    inherit end-of-line format from the system's setting
5513    (system_eol_type).  */
5514
5515 Lisp_Object
5516 coding_inherit_eol_type (coding_system, parent)
5517      Lisp_Object coding_system, parent;
5518 {
5519   Lisp_Object spec, eol_type;
5520
5521   if (NILP (coding_system))
5522     coding_system = Qraw_text;
5523   spec = CODING_SYSTEM_SPEC (coding_system);
5524   eol_type = AREF (spec, 2);
5525   if (VECTORP (eol_type))
5526     {
5527       Lisp_Object parent_eol_type;
5528
5529       if (! NILP (parent))
5530         {
5531           Lisp_Object parent_spec;
5532
5533           parent_spec = CODING_SYSTEM_SPEC (parent);
5534           parent_eol_type = AREF (parent_spec, 2);
5535         }
5536       else
5537         parent_eol_type = system_eol_type;
5538       if (EQ (parent_eol_type, Qunix))
5539         coding_system = AREF (eol_type, 0);
5540       else if (EQ (parent_eol_type, Qdos))
5541         coding_system = AREF (eol_type, 1);
5542       else if (EQ (parent_eol_type, Qmac))
5543         coding_system = AREF (eol_type, 2);
5544     }
5545   return coding_system;
5546 }
5547
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
5553
5554    o coding-category-emacs-mule
5555
5556         The category for a coding system which has the same code range
5557         as Emacs' internal format.  Assigned the coding-system (Lisp
5558         symbol) `emacs-mule' by default.
5559
5560    o coding-category-sjis
5561
5562         The category for a coding system which has the same code range
5563         as SJIS.  Assigned the coding-system (Lisp
5564         symbol) `japanese-shift-jis' by default.
5565
5566    o coding-category-iso-7
5567
5568         The category for a coding system which has the same code range
5569         as ISO2022 of 7-bit environment.  This doesn't use any locking
5570         shift and single shift functions.  This can encode/decode all
5571         charsets.  Assigned the coding-system (Lisp symbol)
5572         `iso-2022-7bit' by default.
5573
5574    o coding-category-iso-7-tight
5575
5576         Same as coding-category-iso-7 except that this can
5577         encode/decode only the specified charsets.
5578
5579    o coding-category-iso-8-1
5580
5581         The category for a coding system which has the same code range
5582         as ISO2022 of 8-bit environment and graphic plane 1 used only
5583         for DIMENSION1 charset.  This doesn't use any locking shift
5584         and single shift functions.  Assigned the coding-system (Lisp
5585         symbol) `iso-latin-1' by default.
5586
5587    o coding-category-iso-8-2
5588
5589         The category for a coding system which has the same code range
5590         as ISO2022 of 8-bit environment and graphic plane 1 used only
5591         for DIMENSION2 charset.  This doesn't use any locking shift
5592         and single shift functions.  Assigned the coding-system (Lisp
5593         symbol) `japanese-iso-8bit' by default.
5594
5595    o coding-category-iso-7-else
5596
5597         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
5599         single shift functions.  Assigned the coding-system (Lisp
5600         symbol) `iso-2022-7bit-lock' by default.
5601
5602    o coding-category-iso-8-else
5603
5604         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
5606         single shift functions.  Assigned the coding-system (Lisp
5607         symbol) `iso-2022-8bit-ss2' by default.
5608
5609    o coding-category-big5
5610
5611         The category for a coding system which has the same code range
5612         as BIG5.  Assigned the coding-system (Lisp symbol)
5613         `cn-big5' by default.
5614
5615    o coding-category-utf-8
5616
5617         The category for a coding system which has the same code range
5618         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5619         symbol) `utf-8' by default.
5620
5621    o coding-category-utf-16-be
5622
5623         The category for a coding system in which a text has an
5624         Unicode signature (cf. Unicode Standard) in the order of BIG
5625         endian at the head.  Assigned the coding-system (Lisp symbol)
5626         `utf-16-be' by default.
5627
5628    o coding-category-utf-16-le
5629
5630         The category for a coding system in which a text has an
5631         Unicode signature (cf. Unicode Standard) in the order of
5632         LITTLE endian at the head.  Assigned the coding-system (Lisp
5633         symbol) `utf-16-le' by default.
5634
5635    o coding-category-ccl
5636
5637         The category for a coding system of which encoder/decoder is
5638         written in CCL programs.  The default value is nil, i.e., no
5639         coding system is assigned.
5640
5641    o coding-category-binary
5642
5643         The category for a coding system not categorized in any of the
5644         above.  Assigned the coding-system (Lisp symbol)
5645         `no-conversion' by default.
5646
5647    Each of them is a Lisp symbol and the value is an actual
5648    `coding-system's (this is also a Lisp symbol) assigned by a user.
5649    What Emacs does actually is to detect a category of coding system.
5650    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5651    decide only one possible category, it selects a category of the
5652    highest priority.  Priorities of categories are also specified by a
5653    user in a Lisp variable `coding-category-list'.
5654
5655 */
5656
/* Which end-of-line conventions have been seen in a text.  The
   non-zero values are distinct bits, so more than one of them can be
   OR-ed together into a single mask.  */
#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     1
#define EOL_SEEN_CR     2
#define EOL_SEEN_CRLF   4
5661
5662 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5663    SOURCE is encoded.  If CATEGORY is one of
5664    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5665    two-byte, else they are encoded by one-byte.
5666
5667    Return one of EOL_SEEN_XXX.  */
5668
5669 #define MAX_EOL_CHECK_COUNT 3
5670
5671 static int
5672 detect_eol (source, src_bytes, category)
5673      const unsigned char *source;
5674      EMACS_INT src_bytes;
5675      enum coding_category category;
5676 {
5677   const unsigned char *src = source, *src_end = src + src_bytes;
5678   unsigned char c;
5679   int total  = 0;
5680   int eol_seen = EOL_SEEN_NONE;
5681
5682   if ((1 << category) & CATEGORY_MASK_UTF_16)
5683     {
5684       int msb, lsb;
5685
5686       msb = category == (coding_category_utf_16_le
5687                          | coding_category_utf_16_le_nosig);
5688       lsb = 1 - msb;
5689
5690       while (src + 1 < src_end)
5691         {
5692           c = src[lsb];
5693           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5694             {
5695               int this_eol;
5696
5697               if (c == '\n')
5698                 this_eol = EOL_SEEN_LF;
5699               else if (src + 3 >= src_end
5700                        || src[msb + 2] != 0
5701                        || src[lsb + 2] != '\n')
5702                 this_eol = EOL_SEEN_CR;
5703               else
5704                 this_eol = EOL_SEEN_CRLF;
5705
5706               if (eol_seen == EOL_SEEN_NONE)
5707                 /* This is the first end-of-line.  */
5708                 eol_seen = this_eol;
5709               else if (eol_seen != this_eol)
5710                 {
5711                   /* The found type is different from what found before.  */
5712                   eol_seen = EOL_SEEN_LF;
5713                   break;
5714                 }
5715               if (++total == MAX_EOL_CHECK_COUNT)
5716                 break;
5717             }
5718           src += 2;
5719         }
5720     }
5721   else
5722     {
5723       while (src < src_end)
5724         {
5725           c = *src++;
5726           if (c == '\n' || c == '\r')
5727             {
5728               int this_eol;
5729
5730               if (c == '\n')
5731                 this_eol = EOL_SEEN_LF;
5732               else if (src >= src_end || *src != '\n')
5733                 this_eol = EOL_SEEN_CR;
5734               else
5735                 this_eol = EOL_SEEN_CRLF, src++;
5736
5737               if (eol_seen == EOL_SEEN_NONE)
5738                 /* This is the first end-of-line.  */
5739                 eol_seen = this_eol;
5740               else if (eol_seen != this_eol)
5741                 {
5742                   /* The found type is different from what found before.  */
5743                   eol_seen = EOL_SEEN_LF;
5744                   break;
5745                 }
5746               if (++total == MAX_EOL_CHECK_COUNT)
5747                 break;
5748             }
5749         }
5750     }
5751   return eol_seen;
5752 }
5753
5754
5755 static Lisp_Object
5756 adjust_coding_eol_type (coding, eol_seen)
5757      struct coding_system *coding;
5758      int eol_seen;
5759 {
5760   Lisp_Object eol_type;
5761
5762   eol_type = CODING_ID_EOL_TYPE (coding->id);
5763   if (eol_seen & EOL_SEEN_LF)
5764     {
5765       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5766       eol_type = Qunix;
5767     }
5768   else if (eol_seen & EOL_SEEN_CRLF)
5769     {
5770       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5771       eol_type = Qdos;
5772     }
5773   else if (eol_seen & EOL_SEEN_CR)
5774     {
5775       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5776       eol_type = Qmac;
5777     }
5778   return eol_type;
5779 }
5780
/* Detect how a text specified in CODING is encoded.  If a coding
   system is detected, update fields of CODING by the detected coding
   system.

   The consumed/produced counters and head_ascii of CODING are reset
   first.  Then one of three cases applies, depending on the coding
   system currently set in CODING:

   o Its type is `undecided': the source bytes are pre-scanned and the
     categories are tried in the order of coding_priorities.

   o Its category is coding_category_utf_8_auto: the UTF-8 detector
     chooses between the car and the cdr of the coding_attr_utf_bom
     attribute.

   o Its category is coding_category_utf_16_auto: the UTF-16 detector
     chooses between the car and the cdr of the same attribute.

   In any other case CODING is left as it is.  */

void
detect_coding (coding)
     struct coding_system *coding;
{
  const unsigned char *src, *src_end;

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding_set_source (coding);

  src_end = coding->source + coding->src_bytes;
  coding->head_ascii = 0;

  /* If we have not yet decided the text encoding type, detect it
     now.  */
  if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
    {
      int c, i;
      struct coding_detection_info detect_info;
      int null_byte_found = 0, eight_bit_found = 0;

      detect_info.checked = detect_info.found = detect_info.rejected = 0;
      /* Pre-scan the source.  head_ascii counts the bytes seen before
         the first 8-bit byte.  The scan stops early as soon as both
         a null byte and an 8-bit byte have been seen, or when the
         ISO-2022 detector reports success.  */
      for (src = coding->source; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is run at most once (guarded by
                 detect_info.checked), on the first ESC, SI or SO.  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conform to
                             ISO-2022.  */
                          src = src_end;
                          coding->head_ascii = src - coding->source;
                        }
                      /* Reject every category outside
                         CATEGORY_MASK_ISO_ESCAPE.  */
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding->head_ascii++;
            }
          else if (! eight_bit_found)
            coding->head_ascii++;
        }

      /* When none of these conditions holds, nothing beyond plain
         7-bit bytes was seen and no category was found, so CODING is
         left unchanged.  */
      if (null_byte_found || eight_bit_found
          || coding->head_ascii < coding->src_bytes
          || detect_info.found)
        {
          enum coding_category category;
          struct coding_system *this;

          if (coding->head_ascii == coding->src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* Mark every category outside CATEGORY_MASK_UTF_16
                     as already checked and rejected, so that only
                     the UTF-16 detector can still run below.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Try the categories in priority order and stop at the
                 first one that is found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;
                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      /* Already examined by an earlier detector call;
                         just use the recorded result.  */
                      if (detect_info.found & (1 << category))
                        break;
                    }
                  else if ((*(this->detector)) (coding, &detect_info)
                           && detect_info.found & (1 << category))
                    {
                      /* NOTE(review): CATEGORY is narrowed here but
                         THIS is not updated, so the setup below still
                         uses the coding system of the utf-16-auto
                         category -- confirm that this is intended.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }

          /* I is below coding_category_raw_text exactly when one of
             the loops above stopped at a found category.  Otherwise
             fall back, in this order, to no-conversion (a null byte
             was seen), raw-text (every category was rejected), or the
             highest-priority category that was not rejected.  */
          if (i < coding_category_raw_text)
            setup_coding_system (CODING_ID_NAME (this->id), coding);
          else if (null_byte_found)
            setup_coding_system (Qno_conversion, coding);
          else if ((detect_info.rejected & CATEGORY_MASK_ANY)
                   == CATEGORY_MASK_ANY)
            setup_coding_system (Qraw_text, coding);
          else if (detect_info.rejected)
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  this = coding_categories + coding_priorities[i];
                  setup_coding_system (CODING_ID_NAME (this->id), coding);
                  break;
                }
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_8_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      /* NOTE(review): unlike the `undecided' case above,
         detect_info.checked is not initialized here -- confirm that
         the detector does not depend on its initial value.  */
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      /* CODING_SYSTEMS is used only when it is a cons: the car is
         selected when the detector reports CATEGORY_MASK_UTF_8_SIG,
         the cdr otherwise.  */
      if (CONSP (coding_systems)
          && detect_coding_utf_8 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            setup_coding_system (XCAR (coding_systems), coding);
          else
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
  else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_16_auto)
    {
      Lisp_Object coding_systems;
      struct coding_detection_info detect_info;

      coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
      /* NOTE(review): detect_info.checked is not initialized here
         either -- confirm that the detector does not depend on it.  */
      detect_info.found = detect_info.rejected = 0;
      coding->head_ascii = 0;
      /* The car is selected when CATEGORY_MASK_UTF_16_LE is found,
         the cdr when CATEGORY_MASK_UTF_16_BE is found; if neither is
         found CODING is left unchanged.  */
      if (CONSP (coding_systems)
          && detect_coding_utf_16 (coding, &detect_info))
        {
          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            setup_coding_system (XCAR (coding_systems), coding);
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            setup_coding_system (XCDR (coding_systems), coding);
        }
    }
}
5961
5962
/* Convert the end-of-line format of the text just produced for CODING
   to LF, in place.

   The text is the CODING->produced bytes starting at
   CODING->destination when CODING->dst_object is nil, or at byte
   position CODING->dst_pos_byte of the buffer otherwise.  Nothing is
   done when the eol type of CODING is unix.  When the eol type is
   still a vector (not yet decided), the produced bytes are scanned to
   decide it and CODING is updated through adjust_coding_eol_type.
   For the dos format, CODING->produced and CODING->produced_char are
   decreased by the number of CRs removed.  */

static void
decode_eol (coding)
     struct coding_system *coding;
{
  Lisp_Object eol_type;
  unsigned char *p, *pbeg, *pend;

  eol_type = CODING_ID_EOL_TYPE (coding->id);
  if (EQ (eol_type, Qunix))
    return;

  if (NILP (coding->dst_object))
    pbeg = coding->destination;
  else
    pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
  pend = pbeg + coding->produced;

  if (VECTORP (eol_type))
    {
      int eol_seen = EOL_SEEN_NONE;

      /* Collect, as a bit mask, every eol convention that occurs in
         the produced bytes.  */
      for (p = pbeg; p < pend; p++)
        {
          if (*p == '\n')
            eol_seen |= EOL_SEEN_LF;
          else if (*p == '\r')
            {
              if (p + 1 < pend && *(p + 1) == '\n')
                {
                  eol_seen |= EOL_SEEN_CRLF;
                  p++;
                }
              else
                eol_seen |= EOL_SEEN_CR;
            }
        }
      /* More than one convention was seen: treat the text as LF.  */
      if (eol_seen != EOL_SEEN_NONE
          && eol_seen != EOL_SEEN_LF
          && eol_seen != EOL_SEEN_CRLF
          && eol_seen != EOL_SEEN_CR)
        eol_seen = EOL_SEEN_LF;
      if (eol_seen != EOL_SEEN_NONE)
        eol_type = adjust_coding_eol_type (coding, eol_seen);
    }

  if (EQ (eol_type, Qmac))
    {
      /* Same length before and after: overwrite each CR with LF.  */
      for (p = pbeg; p < pend; p++)
        if (*p == '\r')
          *p = '\n';
    }
  else if (EQ (eol_type, Qdos))
    {
      /* Number of CRs removed.  */
      int n = 0;

      if (NILP (coding->dst_object))
        {
          /* Start deleting '\r' from the tail to minimize the memory
             movement.  */
          /* NOTE(review): every '\r' from PEND - 2 downwards is
             removed here, without checking that a '\n' follows it,
             unlike the buffer case below -- confirm intended.  */
          for (p = pend - 2; p >= pbeg; p--)
            if (*p == '\r')
              {
                safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
                n++;
              }
        }
      else
        {
          int pos_byte = coding->dst_pos_byte;
          int pos = coding->dst_pos;
          int pos_end = pos + coding->produced_char - 1;

          /* Walk the produced characters in the buffer and delete
             each CR that is immediately followed by LF.  */
          while (pos < pos_end)
            {
              p = BYTE_POS_ADDR (pos_byte);
              if (*p == '\r' && p[1] == '\n')
                {
                  del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
                  n++;
                  pos_end--;
                }
              pos++;
              if (coding->dst_multibyte)
                pos_byte += BYTES_BY_CHAR_HEAD (*p);
              else
                pos_byte++;
            }
        }
      coding->produced -= n;
      coding->produced_char -= n;
    }
}
6055
6056
6057 /* Return a translation table (or list of them) from coding system
6058    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6059    decoding (ENCODEP is zero). */
6060
6061 static Lisp_Object
6062 get_translation_table (attrs, encodep, max_lookup)
6063      Lisp_Object attrs;
6064      int encodep, *max_lookup;
6065 {
6066   Lisp_Object standard, translation_table;
6067   Lisp_Object val;
6068
6069   if (encodep)
6070     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6071       standard = Vstandard_translation_table_for_encode;
6072   else
6073     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6074       standard = Vstandard_translation_table_for_decode;
6075   if (NILP (translation_table))
6076     translation_table = standard;
6077   else
6078     {
6079       if (SYMBOLP (translation_table))
6080         translation_table = Fget (translation_table, Qtranslation_table);
6081       else if (CONSP (translation_table))
6082         {
6083           translation_table = Fcopy_sequence (translation_table);
6084           for (val = translation_table; CONSP (val); val = XCDR (val))
6085             if (SYMBOLP (XCAR (val)))
6086               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6087         }
6088       if (CHAR_TABLE_P (standard))
6089         {
6090           if (CONSP (translation_table))
6091             translation_table = nconc2 (translation_table,
6092                                         Fcons (standard, Qnil));
6093           else
6094             translation_table = Fcons (translation_table,
6095                                        Fcons (standard, Qnil));
6096         }
6097     }
6098
6099   if (max_lookup)
6100     {
6101       *max_lookup = 1;
6102       if (CHAR_TABLE_P (translation_table)
6103           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6104         {
6105           val = XCHAR_TABLE (translation_table)->extras[1];
6106           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6107             *max_lookup = XFASTINT (val);
6108         }
6109       else if (CONSP (translation_table))
6110         {
6111           Lisp_Object tail, val;
6112
6113           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6114             if (CHAR_TABLE_P (XCAR (tail))
6115                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6116               {
6117                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6118                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6119                   *max_lookup = XFASTINT (val);
6120               }
6121         }
6122     }
6123   return translation_table;
6124 }
6125
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and set C and TRANS accordingly.  C and TRANS must be
   lvalues; both are assigned.

   If a table maps C to a character, C is replaced by that character
   and TRANS is set to nil.  With a list, the following tables are
   then consulted with the new C, so translations are chained.  If a
   table maps C to a non-nil value that is not a character, TRANS is
   left set to that value; with a list, the lookup stops at that
   table.  If TABLE is neither a char-table nor a list, TRANS is nil
   and C is unchanged.

   With a list, the expansion declares a local variable named `tail',
   and non-char-table elements are skipped.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6150
6151
6152 static Lisp_Object
6153 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
6154      Lisp_Object val;
6155      int *buf, *buf_end;
6156      int last_block;
6157      int *from_nchars, *to_nchars;
6158 {
6159   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
6160      [TO-CHAR ...].  */
6161   if (CONSP (val))
6162     {
6163       Lisp_Object from, tail;
6164       int i, len;
6165
6166       for (tail = val; CONSP (tail); tail = XCDR (tail))
6167         {
6168           val = XCAR (tail);
6169           from = XCAR (val);
6170           len = ASIZE (from);
6171           for (i = 0; i < len; i++)
6172             {
6173               if (buf + i == buf_end)
6174                 {
6175                   if (! last_block)
6176                     return Qt;
6177                   break;
6178                 }
6179               if (XINT (AREF (from, i)) != buf[i])
6180                 break;
6181             }
6182           if (i == len)
6183             {
6184               val = XCDR (val);
6185               *from_nchars = len;
6186               break;
6187             }
6188         }
6189       if (! CONSP (tail))
6190         return Qnil;
6191     }
6192   if (VECTORP (val))
6193     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
6194   else
6195     *buf = XINT (val);
6196   return val;
6197 }
6198
6199
/* Write converted text to CODING->destination, starting at offset
   CODING->produced, and update CODING->produced and
   CODING->produced_char accordingly.

   If CODING->chars_at_source is zero, the input is the array of
   character codes in CODING->charbuf.  Each non-negative element is
   looked up in TRANSLATION_TABLE and written to the destination; a
   negative element -N starts an annotation datum of N elements, which
   is skipped.  Processing stops early when get_translation returns
   Qt; LAST_BLOCK is passed through to get_translation.  The return
   value is the number of trailing CODING->charbuf elements that were
   not processed.

   Otherwise the input is the first CODING->consumed bytes at
   CODING->source, which are copied to the destination (changing
   between unibyte and multibyte form when CODING->src_multibyte and
   CODING->dst_multibyte differ), and the return value is 0.

   If CODING->dst_object is a buffer and at least one character was
   produced, the new text is handed to insert_from_gap.  */
6200 static int
6201 produce_chars (coding, translation_table, last_block)
6202      struct coding_system *coding;
6203      Lisp_Object translation_table;
6204      int last_block;
6205 {
6206   unsigned char *dst = coding->destination + coding->produced;
6207   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6208   EMACS_INT produced;
6209   EMACS_INT produced_chars = 0;
6210   int carryover = 0;
6211
6212   if (! coding->chars_at_source)
6213     {
6214       /* Source characters are in coding->charbuf.  */
6215       int *buf = coding->charbuf;
6216       int *buf_end = buf + coding->charbuf_used;
6217
           /* Source and destination are the same object: the destination
              limit is the end of the source bytes consumed so far.  */
6218       if (EQ (coding->src_object, coding->dst_object))
6219         {
6220           coding_set_source (coding);
6221           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6222         }
6223
6224       while (buf < buf_end)
6225         {
6226           int c = *buf, i;
6227
               /* A non-negative element is a character code.  */
6228           if (c >= 0)
6229             {
6230               int from_nchars = 1, to_nchars = 1;
6231               Lisp_Object trans = Qnil;
6232
6233               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6234               if (! NILP (trans))
6235                 {
6236                   trans = get_translation (trans, buf, buf_end, last_block,
6237                                            &from_nchars, &to_nchars);
                       /* Qt from get_translation ends this call; the
                          elements from BUF on are returned as carryover.  */
6238                   if (EQ (trans, Qt))
6239                     break;
                       /* Re-read the element: from here on *BUF is used as
                          the first output character (presumably
                          get_translation stored it there -- TODO confirm).  */
6240                   c = *buf;
6241                 }
6242
                   /* Make sure TO_NCHARS characters of maximal multibyte
                      length fit before writing them.  */
6243               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6244                 {
6245                   dst = alloc_destination (coding,
6246                                            buf_end - buf
6247                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6248                                            dst);
6249                   if (EQ (coding->src_object, coding->dst_object))
6250                     {
6251                       coding_set_source (coding);
6252                       dst_end = ((unsigned char *) coding->source) + coding->consumed;
6253                     }
6254                   else
6255                     dst_end = coding->destination + coding->dst_bytes;
6256                 }
6257
                   /* The first output character is C; any further ones
                      are fetched from TRANS with AREF.  */
6258               for (i = 0; i < to_nchars; i++)
6259                 {
6260                   if (i > 0)
6261                     c = XINT (AREF (trans, i));
                       /* A character for which CHAR_BYTE8_P holds, going
                          to a non-multibyte destination, is written as
                          the single byte CHAR_TO_BYTE8 gives; everything
                          else is written in multibyte form.  */
6262                   if (coding->dst_multibyte
6263                       || ! CHAR_BYTE8_P (c))
6264                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6265                   else
6266                     *dst++ = CHAR_TO_BYTE8 (c);
6267                 }
6268               produced_chars += to_nchars;
                   /* Overwrite the consumed elements with the number of
                      characters produced for them (TO_NCHARS, then
                      zeros); produce_annotation adds up the non-negative
                      elements of charbuf to track text positions.  */
6269               *buf++ = to_nchars;
6270               while (--from_nchars > 0)
6271                 *buf++ = 0;
6272             }
6273           else
6274             /* This is an annotation datum.  (-C) is the length.  */
6275             buf += -c;
6276         }
           /* Number of elements left unprocessed (nonzero if the loop
              above stopped early).  */
6277       carryover = buf_end - buf;
6278     }
6279   else
6280     {
6281       /* Source characters are at coding->source.  */
6282       const unsigned char *src = coding->source;
6283       const unsigned char *src_end = src + coding->consumed;
6284
           /* Source and destination are the same object: the destination
              limit is the current read position.  */
6285       if (EQ (coding->dst_object, coding->src_object))
6286         dst_end = (unsigned char *) src;
6287       if (coding->src_multibyte != coding->dst_multibyte)
6288         {
6289           if (coding->src_multibyte)
6290             {
                   /* MULTIBYTEP and CONSUMED_CHARS are not referenced
                      directly below; presumably the ONE_MORE_BYTE macro
                      uses them -- TODO confirm.  */
6291               int multibytep = 1;
6292               EMACS_INT consumed_chars;
6293
6294               while (1)
6295                 {
6296                   const unsigned char *src_base = src;
6297                   int c;
6298
                       /* Presumably jumps to no_more_source when the
                          source is exhausted, which is the only visible
                          way out of this loop -- TODO confirm.  */
6299                   ONE_MORE_BYTE (c);
                       /* No room left.  When converting in place, first
                          raise the limit to the current read position;
                          if that does not help, enlarge the destination
                          and recompute SRC, SRC_END and DST_END from the
                          values held in CODING.  */
6300                   if (dst == dst_end)
6301                     {
6302                       if (EQ (coding->src_object, coding->dst_object))
6303                         dst_end = (unsigned char *) src;
6304                       if (dst == dst_end)
6305                         {
6306                           EMACS_INT offset = src - coding->source;
6307
6308                           dst = alloc_destination (coding, src_end - src + 1,
6309                                                    dst);
6310                           dst_end = coding->destination + coding->dst_bytes;
6311                           coding_set_source (coding);
6312                           src = coding->source + offset;
6313                           src_end = coding->source + coding->src_bytes;
6314                           if (EQ (coding->src_object, coding->dst_object))
6315                             dst_end = (unsigned char *) src;
6316                         }
6317                     }
6318                   *dst++ = c;
6319                   produced_chars++;
6320                 }
6321             no_more_source:
6322               ;
6323             }
6324           else
6325             while (src < src_end)
6326               {
6327                 int multibytep = 1;
6328                 int c = *src++;
6329
                     /* Keep room for two bytes; presumably EMIT_ONE_BYTE
                        may write a two-byte sequence -- TODO confirm.  */
6330                 if (dst >= dst_end - 1)
6331                   {
6332                     if (EQ (coding->src_object, coding->dst_object))
6333                       dst_end = (unsigned char *) src;
6334                     if (dst >= dst_end - 1)
6335                       {
6336                         EMACS_INT offset = src - coding->source;
6337                         EMACS_INT more_bytes;
6338
6339                         if (EQ (coding->src_object, coding->dst_object))
6340                           more_bytes = ((src_end - src) / 2) + 2;
6341                         else
6342                           more_bytes = src_end - src + 2;
6343                         dst = alloc_destination (coding, more_bytes, dst);
6344                         dst_end = coding->destination + coding->dst_bytes;
6345                         coding_set_source (coding);
6346                         src = coding->source + offset;
6347                         src_end = coding->source + coding->src_bytes;
6348                         if (EQ (coding->src_object, coding->dst_object))
6349                           dst_end = (unsigned char *) src;
6350                       }
6351                   }
6352                 EMIT_ONE_BYTE (c);
6353               }
6354         }
6355       else
6356         {
               /* Same representation on both sides: the bytes are copied
                  verbatim.  With distinct objects, the destination is
                  first enlarged if CODING->src_bytes exceeds
                  CODING->dst_bytes.  */
6357           if (!EQ (coding->src_object, coding->dst_object))
6358             {
6359               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6360
6361               if (require > 0)
6362                 {
6363                   EMACS_INT offset = src - coding->source;
6364
6365                   dst = alloc_destination (coding, require, dst);
6366                   coding_set_source (coding);
6367                   src = coding->source + offset;
6368                   src_end = coding->source + coding->src_bytes;
6369                 }
6370             }
               /* The character count is taken over from the source.  */
6371           produced_chars = coding->consumed_char;
6372           while (src < src_end)
6373             *dst++ = *src++;
6374         }
6375     }
6376
       /* Number of bytes written by this call.  */
6377   produced = dst - (coding->destination + coding->produced);
6378   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6379     insert_from_gap (produced_chars, produced);
6380   coding->produced += produced;
6381   coding->produced_char += produced_chars;
6382   return carryover;
6383 }
6384
6385 /* Compose text in CODING->dst_object according to the annotation data
6386    at CHARBUF.  As read by this function, CHARBUF is an array:
6387      [ -LENGTH ANNOTATION_MASK NCHARS METHOD [ COMPONENTS... ] ]
        where LENGTH counts every element including the four leading
        ones, and the text to compose is the NCHARS characters starting
        at POS.

        COMPONENTS are used only when METHOD is in the range
        COMPOSITION_WITH_RULE ... COMPOSITION_WITH_RULE_ALTCHARS; they
        are passed to compose_text as a string for
        COMPOSITION_WITH_ALTCHARS and as a vector otherwise.

        Nothing is composed if NCHARS is not positive, if METHOD is not
        a known method, if any component is negative, or if the number
        of components is negative or larger than the local argument
        array can hold.
6388  */
6389
6390 static INLINE void
6391 produce_composition (coding, charbuf, pos)
6392      struct coding_system *coding;
6393      int *charbuf;
6394      EMACS_INT pos;
6395 {
6396   int len;
6397   EMACS_INT to;
6398   enum composition_method method;
6399   Lisp_Object components;
6400
6401   len = -charbuf[0];
6402   to = pos + charbuf[2];
6403   if (to <= pos)
6404     return;
6405   method = (enum composition_method) (charbuf[3]);
6406
6407   if (method == COMPOSITION_RELATIVE)
6408     components = Qnil;
6409   else if (method >= COMPOSITION_WITH_RULE
6410            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6411     {
6412       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6413       int i;
6414
           /* Skip the four leading elements; what remains are the
              components.  */
6415       len -= 4;
6416       charbuf += 4;
           /* Refuse malformed annotation data instead of writing past
              the end of ARGS.  */
           if (len < 0 || len > (int) (sizeof args / sizeof args[0]))
             return;
6417       for (i = 0; i < len; i++)
6418         {
6419           args[i] = make_number (charbuf[i]);
6420           if (charbuf[i] < 0)
6421             return;
6422         }
6423       components = (method == COMPOSITION_WITH_ALTCHARS
6424                     ? Fstring (len, args) : Fvector (len, args));
6425     }
6426   else
6427     return;
6428   compose_text (pos, to, components, Qnil, coding->dst_object);
6429 }
6430
6431
6432 /* Put `charset' property on text in CODING->dst_object according to
6433    the annotation data at CHARBUF.  CHARBUF is an array:
6434      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
        The property is put on the NCHARS characters that end at POS
        (that is, on the range POS - NCHARS ... POS).  Its value is what
        CHARSET_NAME yields for the charset that CHARSET_FROM_ID finds
        for CHARSET-ID.
6435  */
6436
6437 static INLINE void
6438 produce_charset (coding, charbuf, pos)
6439      struct coding_system *coding;
6440      int *charbuf;
6441      EMACS_INT pos;
6442 {
       /* charbuf[2] is NCHARS and charbuf[3] is CHARSET-ID.  */
6443   EMACS_INT from = pos - charbuf[2];
6444   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6445
6446   Fput_text_property (make_number (from), make_number (pos),
6447                       Qcharset, CHARSET_NAME (charset),
6448                       coding->dst_object);
6449 }
6450
6451
/* Number of int elements first requested for CODING->charbuf by
   ALLOC_CONVERSION_WORK_AREA.  */
6452 #define CHARBUF_SIZE 0x4000
6453
/* Allocate CODING->charbuf with alloca and store its size, counted in
   int elements, in CODING->charbuf_size.  The request starts at
   CHARBUF_SIZE elements and is halved while the allocation yields a
   null pointer and the size is still above 1024.

   If no storage was obtained, CODING_RESULT_INSUFFICIENT_MEM is
   recorded and the macro executes `return CODING->result', i.e. it
   returns from the function that uses it.  Use it only in functions
   whose return type accepts that value.

   NOTE(review): alloca normally does not signal failure by returning
   a null pointer, so the retry loop and the failure path look
   unreachable in practice -- confirm.  */
6454 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6455   do {                                                                  \
6456     int size = CHARBUF_SIZE;                                            \
6457                                                                         \
6458     (coding)->charbuf = NULL;                                           \
6459     while (size > 1024)                                                 \
6460       {                                                                 \
6461         (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
6462         if ((coding)->charbuf)                                          \
6463           break;                                                        \
6464         size >>= 1;                                                     \
6465       }                                                                 \
6466     if (! (coding)->charbuf)                                            \
6467       {                                                                 \
6468         record_conversion_result ((coding), CODING_RESULT_INSUFFICIENT_MEM); \
6469         return (coding)->result;                                        \
6470       }                                                                 \
6471     (coding)->charbuf_size = size;                                      \
6472   } while (0)
6473
6474
/* Apply the annotation data found in CODING->charbuf to
   CODING->dst_object.  POS is the position in the destination that
   corresponds to the first element of CODING->charbuf.

   The first CODING->charbuf_used elements are scanned.  A non-negative
   element is added to POS (produce_chars stores there the number of
   characters it produced).  A negative element -LEN starts an
   annotation datum of LEN elements; its second element selects
   produce_composition or produce_charset, and any other value aborts.

   Nothing is done when CODING->dst_object is nil.  */
6475 static void
6476 produce_annotation (coding, pos)
6477      struct coding_system *coding;
6478      EMACS_INT pos;
6479 {
6480   int *charbuf = coding->charbuf;
6481   int *charbuf_end = charbuf + coding->charbuf_used;
6482
6483   if (NILP (coding->dst_object))
6484     return;
6485
6486   while (charbuf < charbuf_end)
6487     {
6488       if (*charbuf >= 0)
6489         pos += *charbuf++;
6490       else
6491         {
6492           int len = -*charbuf;
               /* charbuf[1] holds the annotation mask that tells which
                  kind of datum this is.  */
6493           switch (charbuf[1])
6494             {
6495             case CODING_ANNOTATE_COMPOSITION_MASK:
6496               produce_composition (coding, charbuf, pos);
6497               break;
6498             case CODING_ANNOTATE_CHARSET_MASK:
6499               produce_charset (coding, charbuf, pos);
6500               break;
6501             default:
6502               abort ();
6503             }
               /* Skip the whole datum, whatever the handler did.  */
6504           charbuf += len;
6505         }
6506     }
6507 }
6508
6509 /* Decode the data at CODING->src_object into CODING->dst_object.
6510    CODING->src_object is a buffer, a string, or nil.
6511    CODING->dst_object is a buffer.
6512
6513    If CODING->src_object is a buffer, it must be the current buffer.
6514    In this case, if CODING->src_pos is positive, it is a position of
6515    the source text in the buffer, otherwise, the source text is in the
6516    gap area of the buffer, and CODING->src_pos specifies the offset of
6517    the text from GPT (which must be the same as PT).  If this is the
6518    same buffer as CODING->dst_object, CODING->src_pos must be
6519    negative.
6520
6521    If CODING->src_object is a string, CODING->src_pos is an index to
6522    that string.
6523
6524    If CODING->src_object is nil, CODING->source must already point to
6525    the non-relocatable memory area.  In this case, CODING->src_pos is
6526    an offset from CODING->source.
6527
6528    The decoded data is inserted at the current point of the buffer
6529    CODING->dst_object.

        The consumed/produced counters, the result and the error count
        of CODING are reset on entry.  Source bytes left undecoded at
        the end are either emitted as raw bytes (when CODING->mode has
        CODING_MODE_LAST_BLOCK) or saved in CODING->carryover.  The
        return value is CODING->result.
6530 */
6531
6532 static int
6533 decode_coding (coding)
6534      struct coding_system *coding;
6535 {
6536   Lisp_Object attrs;
6537   Lisp_Object undo_list;
6538   Lisp_Object translation_table;
6539   int carryover;
6540   int i;
6541
       /* If the source region of the buffer straddles the gap, move the
          gap to the start of the region.  */
6542   if (BUFFERP (coding->src_object)
6543       && coding->src_pos > 0
6544       && coding->src_pos < GPT
6545       && coding->src_pos + coding->src_chars > GPT)
6546     move_gap_both (coding->src_pos, coding->src_pos_byte);
6547
       /* For a buffer destination: make it current, put the gap at point,
          and set its undo list to Qt while text is produced.  The saved
          list is restored at the end, where the whole insertion is
          recorded with one record_insert call.  */
6548   undo_list = Qt;
6549   if (BUFFERP (coding->dst_object))
6550     {
6551       if (current_buffer != XBUFFER (coding->dst_object))
6552         set_buffer_internal (XBUFFER (coding->dst_object));
6553       if (GPT != PT)
6554         move_gap_both (PT, PT_BYTE);
6555       undo_list = current_buffer->undo_list;
6556       current_buffer->undo_list = Qt;
6557     }
6558
6559   coding->consumed = coding->consumed_char = 0;
6560   coding->produced = coding->produced_char = 0;
6561   coding->chars_at_source = 0;
6562   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6563   coding->errors = 0;
6564
       /* NOTE(review): this macro contains a `return' statement for its
          failure path; taking it would leave the undo list saved above
          unrestored.  */
6565   ALLOC_CONVERSION_WORK_AREA (coding);
6566
6567   attrs = CODING_ID_ATTRS (coding->id);
6568   translation_table = get_translation_table (attrs, 0, NULL);
6569
       /* Decode one charbuf-full at a time for as long as source bytes
          remain and the result is SUCCESS or INVALID_SRC.  */
6570   carryover = 0;
6571   do
6572     {
           /* Destination position matching the start of this round's
              charbuf; used for the annotations.  */
6573       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6574
6575       coding_set_source (coding);
6576       coding->annotated = 0;
           /* The elements carried over from the previous round are
              already at the head of charbuf.  */
6577       coding->charbuf_used = carryover;
6578       (*(coding->decoder)) (coding);
6579       coding_set_destination (coding);
6580       carryover = produce_chars (coding, translation_table, 0);
6581       if (coding->annotated)
6582         produce_annotation (coding, pos);
           /* Move the elements produce_chars left unprocessed to the
              head of charbuf for the next round.  */
6583       for (i = 0; i < carryover; i++)
6584         coding->charbuf[i]
6585           = coding->charbuf[coding->charbuf_used - carryover + i];
6586     }
6587   while (coding->consumed < coding->src_bytes
6588          && (coding->result == CODING_RESULT_SUCCESS
6589              || coding->result == CODING_RESULT_INVALID_SRC));
6590
       /* Produce what is still carried over, this time with LAST_BLOCK
          set to 1.  */
6591   if (carryover > 0)
6592     {
6593       coding_set_destination (coding);
6594       coding->charbuf_used = carryover;
6595       produce_chars (coding, translation_table, 1);
6596     }
6597
       /* Deal with source bytes the decoder did not consume.  */
6598   coding->carryover_bytes = 0;
6599   if (coding->consumed < coding->src_bytes)
6600     {
6601       int nbytes = coding->src_bytes - coding->consumed;
6602       const unsigned char *src;
6603
6604       coding_set_source (coding);
6605       coding_set_destination (coding);
6606       src = coding->source + coding->consumed;
6607
6608       if (coding->mode & CODING_MODE_LAST_BLOCK)
6609         {
6610           /* Flush out unprocessed data as binary chars.  We are sure
6611              that the number of data is less than the size of
6612              coding->charbuf.  */
6613           coding->charbuf_used = 0;
6614           while (nbytes-- > 0)
6615             {
6616               int c = *src++;
6617
                   /* Bytes with the high bit set go through
                      BYTE8_TO_CHAR; the others are used as is.  */
6618               if (c & 0x80)
6619                 c = BYTE8_TO_CHAR (c);
6620               coding->charbuf[coding->charbuf_used++] = c;
6621             }
               /* No translation table is applied to these.  */
6622           produce_chars (coding, Qnil, 1);
6623         }
6624       else
6625         {
6626           /* Record unprocessed bytes in coding->carryover.  We are
6627              sure that the number of data is less than the size of
6628              coding->carryover.  */
6629           unsigned char *p = coding->carryover;
6630
6631           coding->carryover_bytes = nbytes;
6632           while (nbytes-- > 0)
6633             *p++ = *src++;
6634         }
           /* Either way, all the source bytes now count as consumed.  */
6635       coding->consumed = coding->src_bytes;
6636     }
6637
6638   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6639     decode_eol (coding);
6640   if (BUFFERP (coding->dst_object))
6641     {
6642       current_buffer->undo_list = undo_list;
6643       record_insert (coding->dst_pos, coding->produced_char);
6644     }
6645   return coding->result;
6646 }
6647
6648
6649 /* Extract an annotation datum from a composition starting at POS and
6650    ending before LIMIT of CODING->src_object (buffer or string), store
6651    the data in BUF, set *STOP to a starting position of the next
6652    composition (if any) or to LIMIT, and return the address of the
6653    next element of BUF.
6654
6655    If such an annotation is not found, set *STOP to a starting
6656    position of a composition after POS (if any) or to LIMIT, and
6657    return BUF.  */
6658
6659 static INLINE int *
6660 handle_composition_annotation (pos, limit, coding, buf, stop)
6661      EMACS_INT pos, limit;
6662      struct coding_system *coding;
6663      int *buf;
6664      EMACS_INT *stop;
6665 {
6666   EMACS_INT start, end;
6667   Lisp_Object prop;
6668
       /* No composition in POS..LIMIT, or one that runs beyond LIMIT:
          nothing more to look at before LIMIT.  */
6669   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6670       || end > limit)
6671     *stop = limit;
       /* The composition found starts later: come back when we get
          there.  */
6672   else if (start > pos)
6673     *stop = start;
6674   else
6675     {
6676       if (start == pos)
6677         {
6678           /* We found a composition.  Store the corresponding
6679              annotation data in BUF.  */
6680           int *head = buf;
6681           enum composition_method method = COMPOSITION_METHOD (prop);
6682           int nchars = COMPOSITION_LENGTH (prop);
6683
6684           ADD_COMPOSITION_DATA (buf, nchars, method);
6685           if (method != COMPOSITION_RELATIVE)
6686             {
6687               Lisp_Object components;
6688               int len, i, i_byte;
6689
                   /* Append the components, which may be given as a
                      vector, a string, a single integer, or a list;
                      LEN counts them.  */
6690               components = COMPOSITION_COMPONENTS (prop);
6691               if (VECTORP (components))
6692                 {
6693                   len = XVECTOR (components)->size;
6694                   for (i = 0; i < len; i++)
6695                     *buf++ = XINT (AREF (components, i));
6696                 }
6697               else if (STRINGP (components))
6698                 {
6699                   len = SCHARS (components);
6700                   i = i_byte = 0;
6701                   while (i < len)
6702                     {
6703                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6704                       buf++;
6705                     }
6706                 }
6707               else if (INTEGERP (components))
6708                 {
6709                   len = 1;
6710                   *buf++ = XINT (components);
6711                 }
6712               else if (CONSP (components))
6713                 {
6714                   for (len = 0; CONSP (components);
6715                        len++, components = XCDR (components))
6716                     *buf++ = XINT (XCAR (components));
6717                 }
6718               else
6719                 abort ();
                   /* HEAD is where BUF pointed before the datum was
                      added, presumably its negated-length element
                      (TODO confirm against ADD_COMPOSITION_DATA);
                      extend that length by the components just
                      stored.  */
6720               *head -= len;
6721             }
6722         }
6723
           /* Look for the next composition, starting from the end of
              the one found above.  */
6724       if (find_composition (end, limit, &start, &end, &prop,
6725                             coding->src_object)
6726           && end <= limit)
6727         *stop = start;
6728       else
6729         *stop = limit;
6730     }
6731   return buf;
6732 }
6733
6734
6735 /* Extract an annotation datum from a text property `charset' at POS of
6736    CODING->src_object (buffer or string), store the data in BUF, set
6737    *STOP to the position where the value of `charset' property changes
6738    (limiting by LIMIT), and return the address of the next element of
6739    BUF.
6740
6741    If the property value is nil, set *STOP to the position where the
6742    property value is non-nil (limiting by LIMIT), and return BUF.

        NOTE(review): the code below invokes ADD_CHARSET_DATA
        unconditionally, with a charset id of -1 when the property is
        nil or is not a charset, so the second paragraph may not
        describe the current behavior -- confirm against the macro.  */
6743
6744 static INLINE int *
6745 handle_charset_annotation (pos, limit, coding, buf, stop)
6746      EMACS_INT pos, limit;
6747      struct coding_system *coding;
6748      int *buf;
6749      EMACS_INT *stop;
6750 {
6751   Lisp_Object val, next;
6752   int id;
6753
6754   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
       /* Use -1 as the id when there is no usable charset.  */
6755   if (! NILP (val) && CHARSETP (val))
6756     id = XINT (CHARSET_SYMBOL_ID (val));
6757   else
6758     id = -1;
6759   ADD_CHARSET_DATA (buf, 0, id);
       /* The next stop is where the `charset' property changes, but not
          beyond LIMIT.  */
6760   next = Fnext_single_property_change (make_number (pos), Qcharset,
6761                                        coding->src_object,
6762                                        make_number (limit));
6763   *stop = XINT (next);
6764   return buf;
6765 }
6766
6767
/* Fill CODING->charbuf with characters read from CODING->source,
   starting CODING->consumed bytes into it, ready for the encoder that
   encode_coding calls next.

   Each character goes through end-of-line conversion according to the
   eol type of CODING and is then looked up in TRANSLATION_TABLE
   (nothing is looked up when that is nil).  MAX_LOOKUP is the number
   of characters, the current one included, that are collected for
   get_translation.  Charset annotation data is added to the buffer at
   the positions where handle_charset_annotation is due.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated, and CODING->chars_at_source is
   set to 0.  */
6768 static void
6769 consume_chars (coding, translation_table, max_lookup)
6770      struct coding_system *coding;
6771      Lisp_Object translation_table;
6772      int max_lookup;
6773 {
6774   int *buf = coding->charbuf;
6775   int *buf_end = coding->charbuf + coding->charbuf_size;
6776   const unsigned char *src = coding->source + coding->consumed;
6777   const unsigned char *src_end = coding->source + coding->src_bytes;
6778   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6779   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6780   int multibytep = coding->src_multibyte;
6781   Lisp_Object eol_type;
6782   int c;
6783   EMACS_INT stop, stop_composition, stop_charset;
6784   int *lookup_buf = NULL;
6785
       /* The look-ahead buffer is needed only when translating.  */
6786   if (! NILP (translation_table))
6787     lookup_buf = alloca (sizeof (int) * max_lookup);
6788
       /* An eol type that is a vector is handled like Qunix here.  */
6789   eol_type = CODING_ID_EOL_TYPE (coding->id);
6790   if (VECTORP (eol_type))
6791     eol_type = Qunix;
6792
6793   /* Note: composition handling is not yet implemented.  */
6794   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6795
       /* STOP is the next position at which annotations must be looked
          at; with no annotation to handle it is END_POS.  */
6796   if (NILP (coding->src_object))
6797     stop = stop_composition = stop_charset = end_pos;
6798   else
6799     {
6800       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6801         stop = stop_composition = pos;
6802       else
6803         stop = stop_composition = end_pos;
6804       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6805         stop = stop_charset = pos;
6806       else
6807         stop_charset = end_pos;
6808     }
6809
6810   /* Compensate for CRLF and conversion.  */
6811   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6812   while (buf < buf_end)
6813     {
6814       Lisp_Object trans;
6815
6816       if (pos == stop)
6817         {
6818           if (pos == end_pos)
6819             break;
6820           if (pos == stop_composition)
6821             buf = handle_composition_annotation (pos, end_pos, coding,
6822                                                  buf, &stop_composition);
6823           if (pos == stop_charset)
6824             buf = handle_charset_annotation (pos, end_pos, coding,
6825                                              buf, &stop_charset);
               /* Stop again at whichever comes first.  */
6826           stop = (stop_composition < stop_charset
6827                   ? stop_composition : stop_charset);
6828         }
6829
           /* Fetch the next character C and advance SRC and POS.  For a
              source that is not multibyte, POS advances by the number
              of bytes read.  */
6830       if (! multibytep)
6831         {
6832           EMACS_INT bytes;
6833
6834           if (coding->encoder == encode_coding_raw_text)
6835             c = *src++, pos++;
6836           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6837             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
6838           else
6839             c = BYTE8_TO_CHAR (*src), src++, pos++;
6840         }
6841       else
6842         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
6843       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6844         c = '\n';
           /* End-of-line conversion: for Qdos a '\r' is stored in front
              of the '\n'; for any other non-unix type the '\n' itself
              becomes '\r'.  */
6845       if (! EQ (eol_type, Qunix))
6846         {
6847           if (c == '\n')
6848             {
6849               if (EQ (eol_type, Qdos))
6850                 *buf++ = '\r';
6851               else
6852                 c = '\r';
6853             }
6854         }
6855
6856       trans = Qnil;
6857       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6858       if (NILP (trans))
6859         *buf++ = c;
6860       else
6861         {
6862           int from_nchars = 1, to_nchars = 1;
6863           int *lookup_buf_end;
6864           const unsigned char *p = src;
6865           int i;
6866
               /* Collect C and up to MAX_LOOKUP - 1 following characters
                  for get_translation, reading ahead with P so that SRC
                  is not advanced yet.  */
6867           lookup_buf[0] = c;
6868           for (i = 1; i < max_lookup && p < src_end; i++)
6869             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6870           lookup_buf_end = lookup_buf + i;
6871           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6872                                    &from_nchars, &to_nchars);
               /* Stop filling when get_translation returns Qt or when
                  the translated characters do not fit.  */
6873           if (EQ (trans, Qt)
6874               || buf + to_nchars > buf_end)
6875             break;
               /* The first translated character is read back from
                  LOOKUP_BUF (presumably stored there by get_translation
                  -- TODO confirm); the others come from TRANS.  */
6876           *buf++ = *lookup_buf;
6877           for (i = 1; i < to_nchars; i++)
6878             *buf++ = XINT (AREF (trans, i));
               /* Skip the further source characters that the translation
                  used up.  */
6879           for (i = 1; i < from_nchars; i++, pos++)
6880             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6881         }
6882     }
6883
6884   coding->consumed = src - coding->source;
6885   coding->consumed_char = pos - coding->src_pos;
6886   coding->charbuf_used = buf - coding->charbuf;
6887   coding->chars_at_source = 0;
6888 }
6889
6890
6891 /* Encode the text at CODING->src_object into CODING->dst_object.
6892    CODING->src_object is a buffer or a string.
6893    CODING->dst_object is a buffer or nil.
6894
6895    If CODING->src_object is a buffer, it must be the current buffer.
6896    In this case, if CODING->src_pos is positive, it is a position of
6897    the source text in the buffer, otherwise, the source text is in the
6898    gap area of the buffer, and coding->src_pos specifies the offset of
6899    the text from GPT (which must be the same as PT).  If this is the
6900    same buffer as CODING->dst_object, CODING->src_pos must be
6901    negative and CODING should not have `pre-write-conversion'.
6902
6903    If CODING->src_object is a string, CODING should not have
6904    `pre-write-conversion'.
6905
6906    If CODING->dst_object is a buffer, the encoded data is inserted at
6907    the current point of that buffer.
6908
6909    If CODING->dst_object is nil, the encoded data is placed at the
6910    memory area specified by CODING->destination.

        The return value is CODING->result.  */
6911
6912 static int
6913 encode_coding (coding)
6914      struct coding_system *coding;
6915 {
6916   Lisp_Object attrs;
6917   Lisp_Object translation_table;
6918   int max_lookup;
6919
       /* No translation table is used with encode_coding_raw_text.  */
6920   attrs = CODING_ID_ATTRS (coding->id);
6921   if (coding->encoder == encode_coding_raw_text)
6922     translation_table = Qnil, max_lookup = 0;
6923   else
6924     translation_table = get_translation_table (attrs, 1, &max_lookup);
6925
       /* A buffer destination is made current, and its multibyteness
          decides CODING->dst_multibyte.  */
6926   if (BUFFERP (coding->dst_object))
6927     {
6928       set_buffer_internal (XBUFFER (coding->dst_object));
6929       coding->dst_multibyte
6930         = ! NILP (current_buffer->enable_multibyte_characters);
6931     }
6932
6933   coding->consumed = coding->consumed_char = 0;
6934   coding->produced = coding->produced_char = 0;
6935   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6936   coding->errors = 0;
6937
       /* This macro contains a `return' statement for its failure
          path.  */
6938   ALLOC_CONVERSION_WORK_AREA (coding);
6939
       /* Alternate between filling charbuf from the source and letting
          the encoder process it, until every source character has been
          consumed.  */
6940   do {
6941     coding_set_source (coding);
6942     consume_chars (coding, translation_table, max_lookup);
6943     coding_set_destination (coding);
6944     (*(coding->encoder)) (coding);
6945   } while (coding->consumed_char < coding->src_chars);
6946
6947   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
6948     insert_from_gap (coding->produced_char, coding->produced);
6949
6950   return (coding->result);
6951 }
6952
6953
6954 /* Name (or base name) of work buffer for code conversion.  */
6955 static Lisp_Object Vcode_conversion_workbuf_name;
6956
6957 /* A working buffer used by the top level conversion.  Once it is
6958    created, it is never destroyed.  It has the name
6959    Vcode_conversion_workbuf_name.  The other working buffers are
6960    destroyed after the use is finished, and their names are modified
6961    versions of Vcode_conversion_workbuf_name.  */
6962 static Lisp_Object Vcode_conversion_reused_workbuf;
6963
6964 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
        make_conversion_work_buffer increments it on every call, and
        code_conversion_restore resets it to 0 when it releases the
        reused buffer.  */
6965 static int reused_workbuf_in_use;
6966
6967
6968 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
6969    multibyteness of returning buffer.

        If no work buffer was in use, the result is
        Vcode_conversion_reused_workbuf, which is created first if it is
        not a live buffer.  Otherwise a new buffer is created under a
        fresh name derived from Vcode_conversion_workbuf_name.

        The returned buffer is erased and its undo list is set to Qt.
        The current buffer is the same on return as on entry.  */
6970
6971 static Lisp_Object
6972 make_conversion_work_buffer (multibyte)
6973      int multibyte;
6974 {
6975   Lisp_Object name, workbuf;
6976   struct buffer *current;
6977
       /* The counter is incremented in either case; a nonzero old value
          means the reused buffer is taken.  */
6978   if (reused_workbuf_in_use++)
6979     {
6980       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6981       workbuf = Fget_buffer_create (name);
6982     }
6983   else
6984     {
6985       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
6986         Vcode_conversion_reused_workbuf
6987           = Fget_buffer_create (Vcode_conversion_workbuf_name);
6988       workbuf = Vcode_conversion_reused_workbuf;
6989     }
       /* Prepare the work buffer while it is current, then switch
          back.  */
6990   current = current_buffer;
6991   set_buffer_internal (XBUFFER (workbuf));
6992   Ferase_buffer ();
6993   current_buffer->undo_list = Qt;
6994   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6995   set_buffer_internal (current);
6996   return workbuf;
6997 }
6998
6999
/* Unwind handler registered by code_conversion_save.  ARG is a cons
   (CURRENT . WORKBUF) where CURRENT is the buffer that was current
   when the conversion started and WORKBUF is the work buffer made for
   it, or nil.

   Release WORKBUF: the reused work buffer is marked as free again,
   any other live work buffer is killed.  Then make CURRENT the
   current buffer again, unless it is no longer live.  Return Qnil.  */
7000 static Lisp_Object
7001 code_conversion_restore (arg)
7002      Lisp_Object arg;
7003 {
7004   Lisp_Object current, workbuf;
7005   struct gcpro gcpro1;
7006
7007   GCPRO1 (arg);
7008   current = XCAR (arg);
7009   workbuf = XCDR (arg);
7010   if (! NILP (workbuf))
7011     {
7012       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7013         reused_workbuf_in_use = 0;
7014       else if (! NILP (Fbuffer_live_p (workbuf)))
7015         Fkill_buffer (workbuf);
7016     }
       /* CURRENT may have been killed while the conversion was running;
          never make a dead buffer current.  */
       if (! NILP (Fbuffer_live_p (current)))
7017     set_buffer_internal (XBUFFER (current));
7018   UNGCPRO;
7019   return Qnil;
7020 }
7021
/* Arrange for the state of a code conversion to be undone when the
   stack is unwound: register code_conversion_restore with a cons of
   the current buffer and the work buffer.

   If WITH_WORK_BUF is nonzero, a work buffer is first obtained from
   make_conversion_work_buffer, with MULTIBYTE as its argument.
   Return that work buffer, or Qnil if none was requested.  */
7022 Lisp_Object
7023 code_conversion_save (with_work_buf, multibyte)
7024      int with_work_buf, multibyte;
7025 {
7026   Lisp_Object workbuf = Qnil;
7027
7028   if (with_work_buf)
7029     workbuf = make_conversion_work_buffer (multibyte);
7030   record_unwind_protect (code_conversion_restore,
7031                          Fcons (Fcurrent_buffer (), workbuf));
7032   return workbuf;
7033 }
7034
/* Decode CHARS characters (BYTES bytes) of the current buffer with
   CODING, the current buffer being both source and destination.

   The source position is set to -CHARS (byte position -BYTES), which
   decode_coding documents as meaning that the source text is in the
   gap area; the destination position is point.  The source counts as
   multibyte iff CHARS is less than BYTES.  Coding detection is done
   first if CODING requires it, and CODING_MODE_LAST_BLOCK is always
   set.

   If the coding system has a post-read function, it is called with
   the number of characters produced, with point temporarily at the
   start of the decoded text, and CODING->produced_char and
   CODING->produced are adjusted by the change in buffer size.

   Return CODING->result.  */
7035 int
7036 decode_coding_gap (coding, chars, bytes)
7037      struct coding_system *coding;
7038      EMACS_INT chars, bytes;
7039 {
7040   int count = specpdl_ptr - specpdl;
7041   Lisp_Object attrs;
7042
       /* No work buffer is needed; this only registers the unwind
          handler that the unbind_to call below runs.  */
7043   code_conversion_save (0, 0);
7044
7045   coding->src_object = Fcurrent_buffer ();
7046   coding->src_chars = chars;
7047   coding->src_bytes = bytes;
7048   coding->src_pos = -chars;
7049   coding->src_pos_byte = -bytes;
7050   coding->src_multibyte = chars < bytes;
7051   coding->dst_object = coding->src_object;
7052   coding->dst_pos = PT;
7053   coding->dst_pos_byte = PT_BYTE;
7054   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7055
7056   if (CODING_REQUIRE_DETECTION (coding))
7057     detect_coding (coding);
7058
7059   coding->mode |= CODING_MODE_LAST_BLOCK;
       /* presumably keeps the buffer text from being shrunk while the
          source still sits in the gap -- TODO confirm.  */
7060   current_buffer->text->inhibit_shrinking = 1;
7061   decode_coding (coding);
7062   current_buffer->text->inhibit_shrinking = 0;
7063
7064   attrs = CODING_ID_ATTRS (coding->id);
7065   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7066     {
7067       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7068       Lisp_Object val;
7069
7070       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7071       val = call1 (CODING_ATTR_POST_READ (attrs),
7072                    make_number (coding->produced_char));
           /* VAL is only validated; the size change is measured from Z
              and Z_BYTE instead.  */
7073       CHECK_NATNUM (val);
7074       coding->produced_char += Z - prev_Z;
7075       coding->produced += Z_BYTE - prev_Z_BYTE;
7076     }
7077
7078   unbind_to (count, Qnil);
7079   return coding->result;
7080 }
7081
/* Encode CHARS characters (BYTES bytes) of the current buffer with
   CODING, the current buffer being both source and destination.

   The source position is set to -CHARS (byte position -BYTES), which
   encode_coding documents as meaning that the source text is in the
   gap area; the destination position is point.  The source counts as
   multibyte iff CHARS is less than BYTES.  CODING->dst_multibyte is
   not set here; encode_coding sets it from the destination buffer.

   Return CODING->result.  */
7082 int
7083 encode_coding_gap (coding, chars, bytes)
7084      struct coding_system *coding;
7085      EMACS_INT chars, bytes;
7086 {
7087   int count = specpdl_ptr - specpdl;
7088
       /* No work buffer is needed; this only registers the unwind
          handler that the unbind_to call below runs.  */
7089   code_conversion_save (0, 0);
7090
7091   coding->src_object = Fcurrent_buffer ();
7092   coding->src_chars = chars;
7093   coding->src_bytes = bytes;
7094   coding->src_pos = -chars;
7095   coding->src_pos_byte = -bytes;
7096   coding->src_multibyte = chars < bytes;
7097   coding->dst_object = coding->src_object;
7098   coding->dst_pos = PT;
7099   coding->dst_pos_byte = PT_BYTE;
7100
7101   encode_coding (coding);
7102
7103   unbind_to (count, Qnil);
7104   return coding->result;
7105 }
7106
7107
7108 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7109    SRC_OBJECT into DST_OBJECT by coding context CODING.
7110
7111    SRC_OBJECT is a buffer, a string, or Qnil.
7112
7113    If it is a buffer, the text is at point of the buffer.  FROM and TO
7114    are positions in the buffer.
7115
7116    If it is a string, the text is at the beginning of the string.
7117    FROM and TO are indices to the string.
7118
7119    If it is nil, the text is at coding->source.  FROM and TO are
7120    indices to coding->source.
7121
7122    DST_OBJECT is a buffer, Qt, or Qnil.
7123
7124    If it is a buffer, the decoded text is inserted at point of the
7125    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7126    is deleted.
7127
7128    If it is Qt, a string is made from the decoded text, and
7129    set in CODING->dst_object.
7130
7131    If it is Qnil, the decoded text is stored at CODING->destination.
7132    The caller must allocate CODING->dst_bytes bytes at
7133    CODING->destination by xmalloc.  If the decoded text is longer than
7134    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7135  */
7136
7137 void
7138 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7139                       dst_object)
7140      struct coding_system *coding;
7141      Lisp_Object src_object;
7142      EMACS_INT from, from_byte, to, to_byte;
7143      Lisp_Object dst_object;
7144 {
7145   int count = SPECPDL_INDEX ();
       /* DESTINATION and DST_BYTES are set and used only when DST_OBJECT
          is nil.  */
7146   unsigned char *destination;
7147   EMACS_INT dst_bytes;
7148   EMACS_INT chars = to - from;
7149   EMACS_INT bytes = to_byte - from_byte;
7150   Lisp_Object attrs;
       /* SAVED_PT stays negative unless we decode a buffer into itself.
          It holds PT, so it must be as wide as a buffer position.  */
7151   EMACS_INT saved_pt = -1, saved_pt_byte;
7152   int need_marker_adjustment = 0;
7153   Lisp_Object old_deactivate_mark;
7154
       /* Saved here and restored just before returning.  */
7155   old_deactivate_mark = Vdeactivate_mark;
7156
7157   if (NILP (dst_object))
7158     {
7159       destination = coding->destination;
7160       dst_bytes = coding->dst_bytes;
7161     }
7162
7163   coding->src_object = src_object;
7164   coding->src_chars = chars;
7165   coding->src_bytes = bytes;
7166   coding->src_multibyte = chars < bytes;
7167
7168   if (STRINGP (src_object))
7169     {
7170       coding->src_pos = from;
7171       coding->src_pos_byte = from_byte;
7172     }
7173   else if (BUFFERP (src_object))
7174     {
7175       set_buffer_internal (XBUFFER (src_object));
7176       if (from != GPT)
7177         move_gap_both (from, from_byte);
7178       if (EQ (src_object, dst_object))
7179         {
7180           struct Lisp_Marker *tail;
7181
               /* Flag the markers sitting exactly at FROM (insertion-type
                  markers) or at TO (the others); they are repositioned
                  after the replacement, at the end of this function.  */
7182           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7183             {
7184               tail->need_adjustment
7185                 = tail->charpos == (tail->insertion_type ? from : to);
7186               need_marker_adjustment |= tail->need_adjustment;
7187             }
7188           saved_pt = PT, saved_pt_byte = PT_BYTE;
7189           TEMP_SET_PT_BOTH (from, from_byte);
               /* Delete the source text with shrinking inhibited, then
                  refer to it by negative offsets (presumably the bytes
                  remain readable in the gap -- confirm in decode_coding).  */
7190           current_buffer->text->inhibit_shrinking = 1;
7191           del_range_both (from, from_byte, to, to_byte, 1);
7192           coding->src_pos = -chars;
7193           coding->src_pos_byte = -bytes;
7194         }
7195       else
7196         {
7197           coding->src_pos = from;
7198           coding->src_pos_byte = from_byte;
7199         }
7200     }
7201
7202   if (CODING_REQUIRE_DETECTION (coding))
7203     detect_coding (coding);
7204   attrs = CODING_ID_ATTRS (coding->id);
7205
       /* Decode into the object returned by code_conversion_save (1, ...)
          (used as a buffer below) when a string result is requested, or
          when a post-read function must run and no destination buffer
          was given.  */
7206   if (EQ (dst_object, Qt)
7207       || (! NILP (CODING_ATTR_POST_READ (attrs))
7208           && NILP (dst_object)))
7209     {
7210       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7211       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7212       coding->dst_pos = BEG;
7213       coding->dst_pos_byte = BEG_BYTE;
7214     }
7215   else if (BUFFERP (dst_object))
7216     {
7217       code_conversion_save (0, 0);
7218       coding->dst_object = dst_object;
7219       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7220       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7221       coding->dst_multibyte
7222         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7223     }
7224   else
7225     {
7226       code_conversion_save (0, 0);
7227       coding->dst_object = Qnil;
7228       /* Most callers presume this will return a multibyte result, and they
7229          won't use `binary' or `raw-text' anyway, so let's not worry about
7230          CODING_FOR_UNIBYTE.  */
7231       coding->dst_multibyte = 1;
7232     }
7233
7234   decode_coding (coding);
7235
7236   if (BUFFERP (coding->dst_object))
7237     set_buffer_internal (XBUFFER (coding->dst_object));
7238
7239   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7240     {
7241       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7242       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7243       Lisp_Object val;
7244
           /* Call the post-read function with point at the start of the
              decoded text and the number of decoded characters as its
              argument.  Whatever it adds to or removes from the buffer
              is folded into the produced counts.  */
7245       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7246       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7247               old_deactivate_mark);
7248       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7249                         make_number (coding->produced_char));
7250       UNGCPRO;
7251       CHECK_NATNUM (val);
7252       coding->produced_char += Z - prev_Z;
7253       coding->produced += Z_BYTE - prev_Z_BYTE;
7254     }
7255
7256   if (EQ (dst_object, Qt))
7257     {
7258       coding->dst_object = Fbuffer_string ();
7259     }
7260   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7261     {
7262       set_buffer_internal (XBUFFER (coding->dst_object));
           /* NOTE(review): the decoded text is copied out of the buffer
              only when the caller's area was too small; when it is large
              enough nothing is copied here -- confirm this is intended.  */
7263       if (dst_bytes < coding->produced)
7264         {
7265           destination = xrealloc (destination, coding->produced);
7266           if (! destination)
7267             {
7268               record_conversion_result (coding,
7269                                         CODING_RESULT_INSUFFICIENT_DST);
7270               unbind_to (count, Qnil);
7271               return;
7272             }
7273           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7274             move_gap_both (BEGV, BEGV_BYTE);
7275           bcopy (BEGV_ADDR, destination, coding->produced);
7276           coding->destination = destination;
7277         }
7278     }
7279
7280   if (saved_pt >= 0)
7281     {
7282       /* This is the case of:
7283          (BUFFERP (src_object) && EQ (src_object, dst_object))
7284          As we have moved PT while replacing the original buffer
7285          contents, we must recover it now.  */
7286       set_buffer_internal (XBUFFER (src_object));
7287       current_buffer->text->inhibit_shrinking = 0;
7288       if (saved_pt < from)
7289         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7290       else if (saved_pt < from + chars)
7291         TEMP_SET_PT_BOTH (from, from_byte);
7292       else if (! NILP (current_buffer->enable_multibyte_characters))
7293         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7294                           saved_pt_byte + (coding->produced - bytes));
7295       else
7296         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7297                           saved_pt_byte + (coding->produced - bytes));
7298
7299       if (need_marker_adjustment)
7300         {
7301           struct Lisp_Marker *tail;
7302
               /* Put the flagged markers back at the edges of the new
                  text: insertion-type markers at its start, the others
                  at its end.  */
7303           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7304             if (tail->need_adjustment)
7305               {
7306                 tail->need_adjustment = 0;
7307                 if (tail->insertion_type)
7308                   {
7309                     tail->bytepos = from_byte;
7310                     tail->charpos = from;
7311                   }
7312                 else
7313                   {
7314                     tail->bytepos = from_byte + coding->produced;
7315                     tail->charpos
7316                       = (NILP (current_buffer->enable_multibyte_characters)
7317                          ? tail->bytepos : from + coding->produced_char);
7318                   }
7319               }
7320         }
7321     }
7322
7323   Vdeactivate_mark = old_deactivate_mark;
7324   unbind_to (count, coding->dst_object);
7325 }
7326
7327 /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE of
        SRC_OBJECT by coding context CODING.

        SRC_OBJECT is a string, a buffer, or something else; in the last
        case the pre-write path below reads the text from CODING->source.

        If DST_OBJECT is a buffer, the destination position is point of
        that buffer, or FROM when it is the same buffer as SRC_OBJECT (the
        source text is then deleted first).  If it is Qt, a unibyte string
        made from the result is stored in CODING->dst_object.  Otherwise
        CODING->dst_object is set to nil (presumably the caller has set up
        CODING->destination -- confirm against the callers).

        If CODING has a pre-write function (CODING_ATTR_PRE_WRITE), the
        source text is first copied into a temporary buffer and that
        function is called there with BEG and Z.  */
7328 void
7329 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7330                       dst_object)
7331      struct coding_system *coding;
7332      Lisp_Object src_object;
7333      EMACS_INT from, from_byte, to, to_byte;
7334      Lisp_Object dst_object;
7335 {
7336   int count = SPECPDL_INDEX ();
7337   EMACS_INT chars = to - from;
7338   EMACS_INT bytes = to_byte - from_byte;
7339   Lisp_Object attrs;
       /* SAVED_PT stays negative unless we encode a buffer into itself.
          It holds PT, so it must be as wide as a buffer position.  */
7340   EMACS_INT saved_pt = -1, saved_pt_byte;
7341   int need_marker_adjustment = 0;
7342   int kill_src_buffer = 0;
7343   Lisp_Object old_deactivate_mark;
7344
       /* Saved here and restored just before returning.  */
7345   old_deactivate_mark = Vdeactivate_mark;
7346
7347   coding->src_object = src_object;
7348   coding->src_chars = chars;
7349   coding->src_bytes = bytes;
7350   coding->src_multibyte = chars < bytes;
7351
7352   attrs = CODING_ID_ATTRS (coding->id);
7353
       /* Only a buffer encoded into itself needs marker bookkeeping.
          EQ alone is also true when both objects are nil, in which case
          markers must not be flagged: the flags are cleared only on the
          SAVED_PT >= 0 path at the end of this function.  */
7354   if (BUFFERP (src_object) && EQ (src_object, dst_object))
7355     {
7356       struct Lisp_Marker *tail;
7357
7358       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7359         {
7360           tail->need_adjustment
7361             = tail->charpos == (tail->insertion_type ? from : to);
7362           need_marker_adjustment |= tail->need_adjustment;
7363         }
7364     }
7365
7366   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7367     {
           /* Copy the source text into the temporary buffer returned by
              code_conversion_save so that the pre-write function can
              edit it freely.  */
7368       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7369       set_buffer_internal (XBUFFER (coding->src_object));
7370       if (STRINGP (src_object))
7371         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7372       else if (BUFFERP (src_object))
7373         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7374       else
7375         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7376
           /* XBUFFER below is valid only for a real buffer, hence the
              BUFFERP check in addition to EQ.  */
7377       if (BUFFERP (src_object) && EQ (src_object, dst_object))
7378         {
7379           set_buffer_internal (XBUFFER (src_object));
7380           saved_pt = PT, saved_pt_byte = PT_BYTE;
7381           del_range_both (from, from_byte, to, to_byte, 1);
7382           set_buffer_internal (XBUFFER (coding->src_object));
7383         }
7384
7385       {
7386         Lisp_Object args[3];
7387         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7388
7389         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7390                 old_deactivate_mark);
7391         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7392         args[1] = make_number (BEG);
7393         args[2] = make_number (Z);
7394         safe_call (3, args);
7395         UNGCPRO;
7396       }
           /* The pre-write function may have left a different buffer
              current; encode from that one and kill it afterwards.  */
7397       if (XBUFFER (coding->src_object) != current_buffer)
7398         kill_src_buffer = 1;
7399       coding->src_object = Fcurrent_buffer ();
7400       if (BEG != GPT)
7401         move_gap_both (BEG, BEG_BYTE);
7402       coding->src_chars = Z - BEG;
7403       coding->src_bytes = Z_BYTE - BEG_BYTE;
7404       coding->src_pos = BEG;
7405       coding->src_pos_byte = BEG_BYTE;
7406       coding->src_multibyte = Z < Z_BYTE;
7407     }
7408   else if (STRINGP (src_object))
7409     {
7410       code_conversion_save (0, 0);
7411       coding->src_pos = from;
7412       coding->src_pos_byte = from_byte;
7413     }
7414   else if (BUFFERP (src_object))
7415     {
7416       code_conversion_save (0, 0);
7417       set_buffer_internal (XBUFFER (src_object));
7418       if (EQ (src_object, dst_object))
7419         {
               /* Encoding in place: the deleted text returned by
                  del_range_1 becomes the source.  */
7420           saved_pt = PT, saved_pt_byte = PT_BYTE;
7421           coding->src_object = del_range_1 (from, to, 1, 1);
7422           coding->src_pos = 0;
7423           coding->src_pos_byte = 0;
7424         }
7425       else
7426         {
7427           if (from < GPT && to >= GPT)
7428             move_gap_both (from, from_byte);
7429           coding->src_pos = from;
7430           coding->src_pos_byte = from_byte;
7431         }
7432     }
7433   else
7434     code_conversion_save (0, 0);
7435
7436   if (BUFFERP (dst_object))
7437     {
7438       coding->dst_object = dst_object;
7439       if (EQ (src_object, dst_object))
7440         {
7441           coding->dst_pos = from;
7442           coding->dst_pos_byte = from_byte;
7443         }
7444       else
7445         {
7446           struct buffer *current = current_buffer;
7447
7448           set_buffer_temp (XBUFFER (dst_object));
7449           coding->dst_pos = PT;
7450           coding->dst_pos_byte = PT_BYTE;
7451           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7452           set_buffer_temp (current);
7453         }
7454       coding->dst_multibyte
7455         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7456     }
7457   else if (EQ (dst_object, Qt))
7458     {
           /* Start with one byte per source character, and at least one
              byte; the area is freed below once the string is made.  */
7459       coding->dst_object = Qnil;
7460       coding->dst_bytes = coding->src_chars;
7461       if (coding->dst_bytes == 0)
7462         coding->dst_bytes = 1;
7463       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7464       coding->dst_multibyte = 0;
7465     }
7466   else
7467     {
7468       coding->dst_object = Qnil;
7469       coding->dst_multibyte = 0;
7470     }
7471
7472   encode_coding (coding);
7473
7474   if (EQ (dst_object, Qt))
7475     {
7476       if (BUFFERP (coding->dst_object))
7477         coding->dst_object = Fbuffer_string ();
7478       else
7479         {
7480           coding->dst_object
7481             = make_unibyte_string ((char *) coding->destination,
7482                                    coding->produced);
7483           xfree (coding->destination);
7484         }
7485     }
7486
7487   if (saved_pt >= 0)
7488     {
7489       /* This is the case of:
7490          (BUFFERP (src_object) && EQ (src_object, dst_object))
7491          As we have moved PT while replacing the original buffer
7492          contents, we must recover it now.  */
7493       set_buffer_internal (XBUFFER (src_object));
7494       if (saved_pt < from)
7495         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7496       else if (saved_pt < from + chars)
7497         TEMP_SET_PT_BOTH (from, from_byte);
7498       else if (! NILP (current_buffer->enable_multibyte_characters))
7499         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7500                           saved_pt_byte + (coding->produced - bytes));
7501       else
7502         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7503                           saved_pt_byte + (coding->produced - bytes));
7504
7505       if (need_marker_adjustment)
7506         {
7507           struct Lisp_Marker *tail;
7508
               /* Put the flagged markers back at the edges of the new
                  text: insertion-type markers at its start, the others
                  at its end.  */
7509           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7510             if (tail->need_adjustment)
7511               {
7512                 tail->need_adjustment = 0;
7513                 if (tail->insertion_type)
7514                   {
7515                     tail->bytepos = from_byte;
7516                     tail->charpos = from;
7517                   }
7518                 else
7519                   {
7520                     tail->bytepos = from_byte + coding->produced;
7521                     tail->charpos
7522                       = (NILP (current_buffer->enable_multibyte_characters)
7523                          ? tail->bytepos : from + coding->produced_char);
7524                   }
7525               }
7526         }
7527     }
7528
7529   if (kill_src_buffer)
7530     Fkill_buffer (coding->src_object);
7531
7532   Vdeactivate_mark = old_deactivate_mark;
7533   unbind_to (count, Qnil);
7534 }
7535
7536 /* Return the name of the coding system registered for the category
        stored first in coding_priorities (presumably the category of
        highest priority, since detection scans that array from index 0
        -- confirm where coding_priorities is maintained).  */
7537 Lisp_Object
7538 preferred_coding_system ()
7539 {
7540   int id = coding_categories[coding_priorities[0]].id;
7541
7542   return CODING_ID_NAME (id);
7543 }
7544
7545 \f
7546 #ifdef emacs
7547 /*** 11. Emacs Lisp library functions ***/
7548
7549 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7550        doc: /* Return t if OBJECT is nil or a coding-system.
7551 See the documentation of `define-coding-system' for information
7552 about coding-system objects.  */)
7553      (object)
7554      Lisp_Object object;
7555 {
       /* nil, and anything that already has a non-negative coding system
          id, qualifies immediately.  */
7556   if (NILP (object)
7557       || CODING_SYSTEM_ID (object) >= 0)
7558     return Qt;
       /* Otherwise accept only a symbol that carries a
          Qcoding_system_define_form property, i.e. one for which a
          definition form is still pending (Fcheck_coding_system
          evaluates that form).  */
7559   if (! SYMBOLP (object)
7560       || NILP (Fget (object, Qcoding_system_define_form)))
7561     return Qnil;
7562   return Qt;
7563 }
7564
7565 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7566        Sread_non_nil_coding_system, 1, 1, 0,
7567        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7568      (prompt)
7569      Lisp_Object prompt;
7570 {
7571   Lisp_Object val;
       /* Keep asking until the user enters a non-empty name.  Candidates
          come from Vcoding_system_alist and the input history is kept in
          Qcoding_system_history.  */
7572   do
7573     {
7574       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7575                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7576     }
7577   while (SCHARS (val) == 0);
       /* Return the symbol named by the input.  */
7578   return (Fintern (val, Qnil));
7579 }
7580
7581 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7582        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7583 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
7584 Ignores case when completing coding systems (all Emacs coding systems
7585 are lower-case).  */)
7586      (prompt, default_coding_system)
7587      Lisp_Object prompt, default_coding_system;
7588 {
7589   Lisp_Object val;
7590   int count = SPECPDL_INDEX ();
7591
       /* Fcompleting_read is handed the default as a string, so a symbol
          is replaced by its name.  */
7592   if (SYMBOLP (default_coding_system))
7593     default_coding_system = SYMBOL_NAME (default_coding_system);
       /* Bind completion-ignore-case to t only for the duration of the
          read; unbind_to below undoes the binding.  */
7594   specbind (Qcompletion_ignore_case, Qt);
7595   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7596                           Qt, Qnil, Qcoding_system_history,
7597                           default_coding_system, Qnil);
7598   unbind_to (count, Qnil);
       /* Empty input yields nil; anything else is interned as a symbol.  */
7599   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7600 }
7601
7602 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7603        1, 1, 0,
7604        doc: /* Check validity of CODING-SYSTEM.
7605 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7606 It is valid if it is nil or a symbol defined as a coding system by the
7607 function `define-coding-system'.  */)
7608   (coding_system)
7609      Lisp_Object coding_system;
7610 {
7611   Lisp_Object define_form;
7612
       /* If a definition form is stored on the symbol, clear the property
          first and then evaluate the form, so that the form is evaluated
          at most once.  */
7613   define_form = Fget (coding_system, Qcoding_system_define_form);
7614   if (! NILP (define_form))
7615     {
7616       Fput (coding_system, Qcoding_system_define_form, Qnil);
7617       safe_eval (define_form);
7618     }
7619   if (!NILP (Fcoding_system_p (coding_system)))
7620     return coding_system;
       /* Not a coding system: signal Qcoding_system_error with the
          offending object as data.  */
7621   xsignal1 (Qcoding_system_error, coding_system);
7622 }
7623
7624 \f
7625 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7626    HIGHEST is nonzero, return the coding system of the highest
7627    priority among the detected coding systems.  Otherwise return a
7628    list of detected coding systems sorted by their priorities.  If
7629    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7630    multibyte form but contains only ASCII and eight-bit chars.
7631    Otherwise, the bytes are raw bytes.
7632
7633    CODING-SYSTEM controls the detection as below:
7634
7635    If it is nil, detect both text-format and eol-format.  If the
7636    text-format part of CODING-SYSTEM is already specified
7637    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7638    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7639    detect only text-format.  */
7640
7641 Lisp_Object
7642 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7643                       coding_system)
7644      const unsigned char *src;
7645      EMACS_INT src_chars, src_bytes;
7646      int highest;
7647      int multibytep;
7648      Lisp_Object coding_system;
7649 {
7650   const unsigned char *src_end = src + src_bytes;
7651   Lisp_Object attrs, eol_type;
       /* Start from nil: several paths below (a failed utf-8/utf-16 auto
          detection, or no surviving category) assign nothing to VAL.  */
7652   Lisp_Object val = Qnil;
7653   struct coding_system coding;
7654   int id;
7655   struct coding_detection_info detect_info;
7656   enum coding_category base_category;
7657   int null_byte_found = 0, eight_bit_found = 0;
7658
7659   if (NILP (coding_system))
7660     coding_system = Qundecided;
7661   setup_coding_system (coding_system, &coding);
7662   attrs = CODING_ID_ATTRS (coding.id);
7663   eol_type = CODING_ID_EOL_TYPE (coding.id);
7664   coding_system = CODING_ATTR_BASE_NAME (attrs);
7665
7666   coding.source = src;
7667   coding.src_chars = src_chars;
7668   coding.src_bytes = src_bytes;
7669   coding.src_multibyte = multibytep;
7670   coding.consumed = 0;
7671   coding.mode |= CODING_MODE_LAST_BLOCK;
7672   coding.head_ascii = 0;
7673
7674   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7675
7676   /* At first, detect text-format if necessary.  */
7677   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7678   if (base_category == coding_category_undecided)
7679     {
7680       enum coding_category category;
7681       struct coding_system *this;
7682       int c, i;
7683
7684       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7685       for (; src < src_end; src++)
7686         {
7687           c = *src;
7688           if (c & 0x80)
7689             {
7690               eight_bit_found = 1;
7691               if (null_byte_found)
7692                 break;
7693             }
7694           else if (c < 0x20)
7695             {
7696               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7697                   && ! inhibit_iso_escape_detection
7698                   && ! detect_info.checked)
7699                 {
7700                   if (detect_coding_iso_2022 (&coding, &detect_info))
7701                     {
7702                       /* We have scanned the whole data.  */
7703                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7704                         {
7705                           /* We didn't find an 8-bit code.  We may
7706                              have found a null-byte, but it's very
7707                              rare that a binary file conform to
7708                              ISO-2022.  */
7709                           src = src_end;
7710                           coding.head_ascii = src - coding.source;
7711                         }
7712                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
7713                       break;
7714                     }
7715                 }
7716               else if (! c)
7717                 {
7718                   null_byte_found = 1;
7719                   if (eight_bit_found)
7720                     break;
7721                 }
7722               if (! eight_bit_found)
7723                 coding.head_ascii++;
7724             }
7725           else if (! eight_bit_found)
7726             coding.head_ascii++;
7727         }
7728
7729       if (null_byte_found || eight_bit_found
7730           || coding.head_ascii < coding.src_bytes
7731           || detect_info.found)
7732         {
7733           if (coding.head_ascii == coding.src_bytes)
7734             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7735             for (i = 0; i < coding_category_raw_text; i++)
7736               {
7737                 category = coding_priorities[i];
7738                 this = coding_categories + category;
7739                 if (detect_info.found & (1 << category))
7740                   break;
7741               }
7742           else
7743             {
                   /* With a null byte present, every category other than
                      the UTF-16 ones is marked as checked and rejected.  */
7744               if (null_byte_found)
7745                 {
7746                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
7747                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
7748                 }
7749               for (i = 0; i < coding_category_raw_text; i++)
7750                 {
7751                   category = coding_priorities[i];
7752                   this = coding_categories + category;
7753
7754                   if (this->id < 0)
7755                     {
7756                       /* No coding system of this category is defined.  */
7757                       detect_info.rejected |= (1 << category);
7758                     }
7759                   else if (category >= coding_category_raw_text)
7760                     continue;
7761                   else if (detect_info.checked & (1 << category))
7762                     {
7763                       if (highest
7764                           && (detect_info.found & (1 << category)))
7765                         break;
7766                     }
7767                   else if ((*(this->detector)) (&coding, &detect_info)
7768                            && highest
7769                            && (detect_info.found & (1 << category)))
7770                     {
7771                       if (category == coding_category_utf_16_auto)
7772                         {
7773                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7774                             category = coding_category_utf_16_le;
7775                           else
7776                             category = coding_category_utf_16_be;
7777                         }
7778                       break;
7779                     }
7780                 }
7781             }
7782         }
7783
           /* Build VAL, a list of coding system ids, from the detection
              result.  */
7784       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
7785         {
7786           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7787           id = coding_categories[coding_category_raw_text].id;
7788           val = Fcons (make_number (id), Qnil);
7789         }
7790       else if (! detect_info.rejected && ! detect_info.found)
7791         {
7792           detect_info.found = CATEGORY_MASK_ANY;
7793           id = coding_categories[coding_category_undecided].id;
7794           val = Fcons (make_number (id), Qnil);
7795         }
7796       else if (highest)
7797         {
7798           if (detect_info.found)
7799             {
7800               detect_info.found = 1 << category;
7801               val = Fcons (make_number (this->id), Qnil);
7802             }
7803           else
                 /* Nothing was positively found: take the first category
                    in priority order that was not rejected.  */
7804             for (i = 0; i < coding_category_raw_text; i++)
7805               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7806                 {
7807                   detect_info.found = 1 << coding_priorities[i];
7808                   id = coding_categories[coding_priorities[i]].id;
7809                   val = Fcons (make_number (id), Qnil);
7810                   break;
7811                 }
7812         }
7813       else
7814         {
7815           int mask = detect_info.rejected | detect_info.found;
7816           int found = 0;
7817           val = Qnil;
7818
               /* Both loops run from the lowest priority upward and cons
                  onto the front, so each group ends up in priority order.
                  The categories neither rejected nor found are consed
                  first; the found ones are then pushed in front of them.  */
7819           for (i = coding_category_raw_text - 1; i >= 0; i--)
7820             {
7821               category = coding_priorities[i];
7822               if (! (mask & (1 << category)))
7823                 {
7824                   found |= 1 << category;
7825                   id = coding_categories[category].id;
7826                   if (id >= 0)
7827                     val = Fcons (make_number (id), val);
7828                 }
7829             }
7830           for (i = coding_category_raw_text - 1; i >= 0; i--)
7831             {
7832               category = coding_priorities[i];
7833               if (detect_info.found & (1 << category))
7834                 {
7835                   id = coding_categories[category].id;
7836                   val = Fcons (make_number (id), val);
7837                 }
7838             }
7839           detect_info.found |= found;
7840         }
7841     }
7842   else if (base_category == coding_category_utf_8_auto)
7843     {
7844       if (detect_coding_utf_8 (&coding, &detect_info))
7845         {
7846           struct coding_system *this;
7847
7848           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
7849             this = coding_categories + coding_category_utf_8_sig;
7850           else
7851             this = coding_categories + coding_category_utf_8_nosig;
7852           val = Fcons (make_number (this->id), Qnil);
7853         }
7854     }
7855   else if (base_category == coding_category_utf_16_auto)
7856     {
7857       if (detect_coding_utf_16 (&coding, &detect_info))
7858         {
7859           struct coding_system *this;
7860
7861           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7862             this = coding_categories + coding_category_utf_16_le;
7863           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7864             this = coding_categories + coding_category_utf_16_be;
7865           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7866             this = coding_categories + coding_category_utf_16_be_nosig;
7867           else
7868             this = coding_categories + coding_category_utf_16_le_nosig;
7869           val = Fcons (make_number (this->id), Qnil);
7870         }
7871     }
7872   else
7873     {
7874       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7875       val = Fcons (make_number (coding.id), Qnil);
7876     }
7877
7878   /* Then, detect eol-format if necessary.  */
7879   {
         /* -1 means that the eol format was not examined for that group.
            All three must start defined, because the list may contain a
            UTF-16 candidate that was not "found" and whose eol format
            was therefore never detected below.  */
7880     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
7881     Lisp_Object tail;
7882
7883     if (VECTORP (eol_type))
7884       {
7885         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7886           {
7887             if (null_byte_found)
7888               normal_eol = EOL_SEEN_LF;
7889             else
7890               normal_eol = detect_eol (coding.source, src_bytes,
7891                                        coding_category_raw_text);
7892           }
7893         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7894                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7895           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7896                                       coding_category_utf_16_be);
7897         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7898                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7899           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7900                                       coding_category_utf_16_le);
7901       }
7902     else
7903       {
7904         if (EQ (eol_type, Qunix))
7905           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7906         else if (EQ (eol_type, Qdos))
7907           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7908         else
7909           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7910       }
7911
         /* Replace each id in VAL by a coding system name: the eol-specific
            variant when the eol format is known, the base name otherwise.  */
7912     for (tail = val; CONSP (tail); tail = XCDR (tail))
7913       {
7914         enum coding_category category;
7915         int this_eol;
7916
7917         id = XINT (XCAR (tail));
7918         attrs = CODING_ID_ATTRS (id);
7919         category = XINT (CODING_ATTR_CATEGORY (attrs));
7920         eol_type = CODING_ID_EOL_TYPE (id);
7921         if (VECTORP (eol_type))
7922           {
7923             if (category == coding_category_utf_16_be
7924                 || category == coding_category_utf_16_be_nosig)
7925               this_eol = utf_16_be_eol;
7926             else if (category == coding_category_utf_16_le
7927                      || category == coding_category_utf_16_le_nosig)
7928               this_eol = utf_16_le_eol;
7929             else
7930               this_eol = normal_eol;
7931
7932             if (this_eol == EOL_SEEN_LF)
7933               XSETCAR (tail, AREF (eol_type, 0));
7934             else if (this_eol == EOL_SEEN_CRLF)
7935               XSETCAR (tail, AREF (eol_type, 1));
7936             else if (this_eol == EOL_SEEN_CR)
7937               XSETCAR (tail, AREF (eol_type, 2));
7938             else
7939               XSETCAR (tail, CODING_ID_NAME (id));
7940           }
7941         else
7942           XSETCAR (tail, CODING_ID_NAME (id));
7943       }
7944   }
7945
       /* VAL may be nil when nothing was detected; never take XCAR of a
          non-cons.  */
7946   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
7947 }
7948
7949
7950 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7951        2, 3, 0,
7952        doc: /* Detect coding system of the text in the region between START and END.
7953 Return a list of possible coding systems ordered by priority.
7954
7955 If only ASCII characters are found (except for such ISO-2022 control
7956 characters as ESC), it returns a list of single element `undecided'
7957 or its subsidiary coding system according to a detected end-of-line
7958 format.
7959
7960 If optional argument HIGHEST is non-nil, return the coding system of
7961 highest priority.  */)
7962      (start, end, highest)
7963      Lisp_Object start, end, highest;
7964 {
       /* Buffer positions are EMACS_INT elsewhere in this file; plain int
          could truncate them where EMACS_INT is wider.  */
7965   EMACS_INT from, to;
7966   EMACS_INT from_byte, to_byte;
7967
7968   CHECK_NUMBER_COERCE_MARKER (start);
7969   CHECK_NUMBER_COERCE_MARKER (end);
7970
7971   validate_region (&start, &end);
7972   from = XINT (start), to = XINT (end);
7973   from_byte = CHAR_TO_BYTE (from);
7974   to_byte = CHAR_TO_BYTE (to);
7975
       /* If the gap lies within the region, move it to the end of the
          region (presumably so that the bytes handed to
          detect_coding_system are contiguous in memory).  */
7976   if (from < GPT && to >= GPT)
7977     move_gap_both (to, to_byte);
7978
7979   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7980                                to - from, to_byte - from_byte,
7981                                !NILP (highest),
7982                                !NILP (current_buffer
7983                                       ->enable_multibyte_characters),
7984                                Qnil);
7985 }
7986
7987 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7988        1, 2, 0,
7989        doc: /* Detect coding system of the text in STRING.
7990 Return a list of possible coding systems ordered by priority.
7991
7992 If only ASCII characters are found (except for such ISO-2022 control
7993 characters as ESC), it returns a list of single element `undecided'
7994 or its subsidiary coding system according to a detected end-of-line
7995 format.
7996
7997 If optional argument HIGHEST is non-nil, return the coding system of
7998 highest priority.  */)
7999      (string, highest)
8000      Lisp_Object string, highest;
8001 {
8002   CHECK_STRING (string);
8003
8004   return detect_coding_system (SDATA (string),
8005                                SCHARS (string), SBYTES (string),
8006                                !NILP (highest), STRING_MULTIBYTE (string),
8007                                Qnil);
8008 }
8009
8010
8011 static INLINE int
8012 char_encodable_p (c, attrs)
8013      int c;
8014      Lisp_Object attrs;
8015 {
8016   Lisp_Object tail;
8017   struct charset *charset;
8018   Lisp_Object translation_table;
8019
8020   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8021   if (! NILP (translation_table))
8022     c = translate_char (translation_table, c);
8023   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8024        CONSP (tail); tail = XCDR (tail))
8025     {
8026       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8027       if (CHAR_CHARSET_P (c, charset))
8028         break;
8029     }
8030   return (! NILP (tail));
8031 }
8032
8033
8034 /* Return a list of coding systems that safely encode the text between
8035    START and END.  If EXCLUDE is non-nil, it is a list of coding
8036    systems not to check.  The returned list doesn't contain any such
8037    coding systems.  In any case, if the text contains only ASCII or is
8038    unibyte, return t.  */
8039
8040 DEFUN ("find-coding-systems-region-internal",
8041        Ffind_coding_systems_region_internal,
8042        Sfind_coding_systems_region_internal, 2, 3, 0,
8043        doc: /* Internal use only.  */)
8044      (start, end, exclude)
8045      Lisp_Object start, end, exclude;
8046 {
8047   Lisp_Object coding_attrs_list, safe_codings;
8048   EMACS_INT start_byte, end_byte;
8049   const unsigned char *p, *pbeg, *pend;
8050   int c;
8051   Lisp_Object tail, elt;
8052
8053   if (STRINGP (start))
8054     {
8055       if (!STRING_MULTIBYTE (start)
8056           || SCHARS (start) == SBYTES (start))
8057         return Qt;
8058       start_byte = 0;
8059       end_byte = SBYTES (start);
8060     }
8061   else
8062     {
8063       CHECK_NUMBER_COERCE_MARKER (start);
8064       CHECK_NUMBER_COERCE_MARKER (end);
8065       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8066         args_out_of_range (start, end);
8067       if (NILP (current_buffer->enable_multibyte_characters))
8068         return Qt;
8069       start_byte = CHAR_TO_BYTE (XINT (start));
8070       end_byte = CHAR_TO_BYTE (XINT (end));
8071       if (XINT (end) - XINT (start) == end_byte - start_byte)
8072         return Qt;
8073
8074       if (XINT (start) < GPT && XINT (end) > GPT)
8075         {
8076           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8077             move_gap_both (XINT (start), start_byte);
8078           else
8079             move_gap_both (XINT (end), end_byte);
8080         }
8081     }
8082
8083   coding_attrs_list = Qnil;
8084   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8085     if (NILP (exclude)
8086         || NILP (Fmemq (XCAR (tail), exclude)))
8087       {
8088         Lisp_Object attrs;
8089
8090         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8091         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8092             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8093           {
8094             ASET (attrs, coding_attr_trans_tbl,
8095                   get_translation_table (attrs, 1, NULL));
8096             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8097           }
8098       }
8099
8100   if (STRINGP (start))
8101     p = pbeg = SDATA (start);
8102   else
8103     p = pbeg = BYTE_POS_ADDR (start_byte);
8104   pend = p + (end_byte - start_byte);
8105
8106   while (p < pend && ASCII_BYTE_P (*p)) p++;
8107   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8108
8109   while (p < pend)
8110     {
8111       if (ASCII_BYTE_P (*p))
8112         p++;
8113       else
8114         {
8115           c = STRING_CHAR_ADVANCE (p);
8116
8117           charset_map_loaded = 0;
8118           for (tail = coding_attrs_list; CONSP (tail);)
8119             {
8120               elt = XCAR (tail);
8121               if (NILP (elt))
8122                 tail = XCDR (tail);
8123               else if (char_encodable_p (c, elt))
8124                 tail = XCDR (tail);
8125               else if (CONSP (XCDR (tail)))
8126                 {
8127                   XSETCAR (tail, XCAR (XCDR (tail)));
8128                   XSETCDR (tail, XCDR (XCDR (tail)));
8129                 }
8130               else
8131                 {
8132                   XSETCAR (tail, Qnil);
8133                   tail = XCDR (tail);
8134                 }
8135             }
8136           if (charset_map_loaded)
8137             {
8138               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8139
8140               if (STRINGP (start))
8141                 pbeg = SDATA (start);
8142               else
8143                 pbeg = BYTE_POS_ADDR (start_byte);
8144               p = pbeg + p_offset;
8145               pend = pbeg + pend_offset;
8146             }
8147         }
8148     }
8149
8150   safe_codings = list2 (Qraw_text, Qno_conversion);
8151   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8152     if (! NILP (XCAR (tail)))
8153       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8154
8155   return safe_codings;
8156 }
8157
8158
8159 DEFUN ("unencodable-char-position", Funencodable_char_position,
8160        Sunencodable_char_position, 3, 5, 0,
8161        doc: /*
8162 Return position of first un-encodable character in a region.
8163 START and END specify the region and CODING-SYSTEM specifies the
8164 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8165
8166 If optional 4th argument COUNT is non-nil, it specifies at most how
8167 many un-encodable characters to search.  In this case, the value is a
8168 list of positions.
8169
8170 If optional 5th argument STRING is non-nil, it is a string to search
8171 for un-encodable characters.  In that case, START and END are indexes
8172 to the string.  */)
8173      (start, end, coding_system, count, string)
8174      Lisp_Object start, end, coding_system, count, string;
8175 {
8176   int n;
8177   struct coding_system coding;
8178   Lisp_Object attrs, charset_list, translation_table;
8179   Lisp_Object positions;
8180   int from, to;
8181   const unsigned char *p, *stop, *pend;
8182   int ascii_compatible;
8183
8184   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8185   attrs = CODING_ID_ATTRS (coding.id);
8186   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8187     return Qnil;
8188   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8189   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8190   translation_table = get_translation_table (attrs, 1, NULL);
8191
8192   if (NILP (string))
8193     {
8194       validate_region (&start, &end);
8195       from = XINT (start);
8196       to = XINT (end);
8197       if (NILP (current_buffer->enable_multibyte_characters)
8198           || (ascii_compatible
8199               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8200         return Qnil;
8201       p = CHAR_POS_ADDR (from);
8202       pend = CHAR_POS_ADDR (to);
8203       if (from < GPT && to >= GPT)
8204         stop = GPT_ADDR;
8205       else
8206         stop = pend;
8207     }
8208   else
8209     {
8210       CHECK_STRING (string);
8211       CHECK_NATNUM (start);
8212       CHECK_NATNUM (end);
8213       from = XINT (start);
8214       to = XINT (end);
8215       if (from > to
8216           || to > SCHARS (string))
8217         args_out_of_range_3 (string, start, end);
8218       if (! STRING_MULTIBYTE (string))
8219         return Qnil;
8220       p = SDATA (string) + string_char_to_byte (string, from);
8221       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8222       if (ascii_compatible && (to - from) == (pend - p))
8223         return Qnil;
8224     }
8225
8226   if (NILP (count))
8227     n = 1;
8228   else
8229     {
8230       CHECK_NATNUM (count);
8231       n = XINT (count);
8232     }
8233
8234   positions = Qnil;
8235   while (1)
8236     {
8237       int c;
8238
8239       if (ascii_compatible)
8240         while (p < stop && ASCII_BYTE_P (*p))
8241           p++, from++;
8242       if (p >= stop)
8243         {
8244           if (p >= pend)
8245             break;
8246           stop = pend;
8247           p = GAP_END_ADDR;
8248         }
8249
8250       c = STRING_CHAR_ADVANCE (p);
8251       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8252           && ! char_charset (translate_char (translation_table, c),
8253                              charset_list, NULL))
8254         {
8255           positions = Fcons (make_number (from), positions);
8256           n--;
8257           if (n == 0)
8258             break;
8259         }
8260
8261       from++;
8262     }
8263
8264   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8265 }
8266
8267
8268 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8269        Scheck_coding_systems_region, 3, 3, 0,
8270        doc: /* Check if the region is encodable by coding systems.
8271
8272 START and END are buffer positions specifying the region.
8273 CODING-SYSTEM-LIST is a list of coding systems to check.
8274
8275 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8276 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8277 whole region, POS0, POS1, ... are buffer positions where non-encodable
8278 characters are found.
8279
8280 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8281 value is nil.
8282
8283 START may be a string.  In that case, check if the string is
8284 encodable, and the value contains indices to the string instead of
8285 buffer positions.  END is ignored.  */)
8286      (start, end, coding_system_list)
8287      Lisp_Object start, end, coding_system_list;
8288 {
8289   Lisp_Object list;
8290   EMACS_INT start_byte, end_byte;
8291   int pos;
8292   const unsigned char *p, *pbeg, *pend;
8293   int c;
8294   Lisp_Object tail, elt, attrs;
8295
8296   if (STRINGP (start))
8297     {
8298       if (!STRING_MULTIBYTE (start)
8299           && SCHARS (start) != SBYTES (start))
8300         return Qnil;
8301       start_byte = 0;
8302       end_byte = SBYTES (start);
8303       pos = 0;
8304     }
8305   else
8306     {
8307       CHECK_NUMBER_COERCE_MARKER (start);
8308       CHECK_NUMBER_COERCE_MARKER (end);
8309       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8310         args_out_of_range (start, end);
8311       if (NILP (current_buffer->enable_multibyte_characters))
8312         return Qnil;
8313       start_byte = CHAR_TO_BYTE (XINT (start));
8314       end_byte = CHAR_TO_BYTE (XINT (end));
8315       if (XINT (end) - XINT (start) == end_byte - start_byte)
8316         return Qt;
8317
8318       if (XINT (start) < GPT && XINT (end) > GPT)
8319         {
8320           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8321             move_gap_both (XINT (start), start_byte);
8322           else
8323             move_gap_both (XINT (end), end_byte);
8324         }
8325       pos = XINT (start);
8326     }
8327
8328   list = Qnil;
8329   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8330     {
8331       elt = XCAR (tail);
8332       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8333       ASET (attrs, coding_attr_trans_tbl,
8334             get_translation_table (attrs, 1, NULL));
8335       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8336     }
8337
8338   if (STRINGP (start))
8339     p = pbeg = SDATA (start);
8340   else
8341     p = pbeg = BYTE_POS_ADDR (start_byte);
8342   pend = p + (end_byte - start_byte);
8343
8344   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8345   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8346
8347   while (p < pend)
8348     {
8349       if (ASCII_BYTE_P (*p))
8350         p++;
8351       else
8352         {
8353           c = STRING_CHAR_ADVANCE (p);
8354
8355           charset_map_loaded = 0;
8356           for (tail = list; CONSP (tail); tail = XCDR (tail))
8357             {
8358               elt = XCDR (XCAR (tail));
8359               if (! char_encodable_p (c, XCAR (elt)))
8360                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8361             }
8362           if (charset_map_loaded)
8363             {
8364               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8365
8366               if (STRINGP (start))
8367                 pbeg = SDATA (start);
8368               else
8369                 pbeg = BYTE_POS_ADDR (start_byte);
8370               p = pbeg + p_offset;
8371               pend = pbeg + pend_offset;
8372             }
8373         }
8374       pos++;
8375     }
8376
8377   tail = list;
8378   list = Qnil;
8379   for (; CONSP (tail); tail = XCDR (tail))
8380     {
8381       elt = XCAR (tail);
8382       if (CONSP (XCDR (XCDR (elt))))
8383         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8384                       list);
8385     }
8386
8387   return list;
8388 }
8389
8390
8391 Lisp_Object
8392 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
8393      Lisp_Object start, end, coding_system, dst_object;
8394      int encodep, norecord;
8395 {
8396   struct coding_system coding;
8397   EMACS_INT from, from_byte, to, to_byte;
8398   Lisp_Object src_object;
8399
8400   CHECK_NUMBER_COERCE_MARKER (start);
8401   CHECK_NUMBER_COERCE_MARKER (end);
8402   if (NILP (coding_system))
8403     coding_system = Qno_conversion;
8404   else
8405     CHECK_CODING_SYSTEM (coding_system);
8406   src_object = Fcurrent_buffer ();
8407   if (NILP (dst_object))
8408     dst_object = src_object;
8409   else if (! EQ (dst_object, Qt))
8410     CHECK_BUFFER (dst_object);
8411
8412   validate_region (&start, &end);
8413   from = XFASTINT (start);
8414   from_byte = CHAR_TO_BYTE (from);
8415   to = XFASTINT (end);
8416   to_byte = CHAR_TO_BYTE (to);
8417
8418   setup_coding_system (coding_system, &coding);
8419   coding.mode |= CODING_MODE_LAST_BLOCK;
8420
8421   if (encodep)
8422     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8423                           dst_object);
8424   else
8425     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8426                           dst_object);
8427   if (! norecord)
8428     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8429
8430   return (BUFFERP (dst_object)
8431           ? make_number (coding.produced_char)
8432           : coding.dst_object);
8433 }
8434
8435
8436 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8437        3, 4, "r\nzCoding system: ",
8438        doc: /* Decode the current region from the specified coding system.
8439 When called from a program, takes four arguments:
8440         START, END, CODING-SYSTEM, and DESTINATION.
8441 START and END are buffer positions.
8442
8443 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8444 If nil, the region between START and END is replaced by the decoded text.
8445 If buffer, the decoded text is inserted in that buffer after point (point
8446 does not move).
8447 In those cases, the length of the decoded text is returned.
8448 If DESTINATION is t, the decoded text is returned.
8449
8450 This function sets `last-coding-system-used' to the precise coding system
8451 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8452 not fully specified.)  */)
8453      (start, end, coding_system, destination)
8454      Lisp_Object start, end, coding_system, destination;
8455 {
8456   return code_convert_region (start, end, coding_system, destination, 0, 0);
8457 }
8458
8459 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8460        3, 4, "r\nzCoding system: ",
8461        doc: /* Encode the current region by specified coding system.
8462 When called from a program, takes four arguments:
8463         START, END, CODING-SYSTEM and DESTINATION.
8464 START and END are buffer positions.
8465
8466 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8467 If nil, the region between START and END is replace by the encoded text.
8468 If buffer, the encoded text is inserted in that buffer after point (point
8469 does not move).
8470 In those cases, the length of the encoded text is returned.
8471 If DESTINATION is t, the encoded text is returned.
8472
8473 This function sets `last-coding-system-used' to the precise coding system
8474 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8475 not fully specified.)  */)
8476   (start, end, coding_system, destination)
8477      Lisp_Object start, end, coding_system, destination;
8478 {
8479   return code_convert_region (start, end, coding_system, destination, 1, 0);
8480 }
8481
8482 Lisp_Object
8483 code_convert_string (string, coding_system, dst_object,
8484                      encodep, nocopy, norecord)
8485      Lisp_Object string, coding_system, dst_object;
8486      int encodep, nocopy, norecord;
8487 {
8488   struct coding_system coding;
8489   EMACS_INT chars, bytes;
8490
8491   CHECK_STRING (string);
8492   if (NILP (coding_system))
8493     {
8494       if (! norecord)
8495         Vlast_coding_system_used = Qno_conversion;
8496       if (NILP (dst_object))
8497         return (nocopy ? Fcopy_sequence (string) : string);
8498     }
8499
8500   if (NILP (coding_system))
8501     coding_system = Qno_conversion;
8502   else
8503     CHECK_CODING_SYSTEM (coding_system);
8504   if (NILP (dst_object))
8505     dst_object = Qt;
8506   else if (! EQ (dst_object, Qt))
8507     CHECK_BUFFER (dst_object);
8508
8509   setup_coding_system (coding_system, &coding);
8510   coding.mode |= CODING_MODE_LAST_BLOCK;
8511   chars = SCHARS (string);
8512   bytes = SBYTES (string);
8513   if (encodep)
8514     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8515   else
8516     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8517   if (! norecord)
8518     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8519
8520   return (BUFFERP (dst_object)
8521           ? make_number (coding.produced_char)
8522           : coding.dst_object);
8523 }
8524
8525
8526 /* Encode or decode STRING according to CODING_SYSTEM.
8527    Do not set Vlast_coding_system_used.
8528
8529    This function is called only from macros DECODE_FILE and
8530    ENCODE_FILE, thus we ignore character composition.  */
8531
8532 Lisp_Object
8533 code_convert_string_norecord (string, coding_system, encodep)
8534      Lisp_Object string, coding_system;
8535      int encodep;
8536 {
8537   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8538 }
8539
8540
8541 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8542        2, 4, 0,
8543        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8544
8545 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8546 if the decoding operation is trivial.
8547
8548 Optional fourth arg BUFFER non-nil means that the decoded text is
8549 inserted in that buffer after point (point does not move).  In this
8550 case, the return value is the length of the decoded text.
8551
8552 This function sets `last-coding-system-used' to the precise coding system
8553 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8554 not fully specified.)  */)
8555   (string, coding_system, nocopy, buffer)
8556      Lisp_Object string, coding_system, nocopy, buffer;
8557 {
8558   return code_convert_string (string, coding_system, buffer,
8559                               0, ! NILP (nocopy), 0);
8560 }
8561
8562 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8563        2, 4, 0,
8564        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8565
8566 Optional third arg NOCOPY non-nil means it is OK to return STRING
8567 itself if the encoding operation is trivial.
8568
8569 Optional fourth arg BUFFER non-nil means that the encoded text is
8570 inserted in that buffer after point (point does not move).  In this
8571 case, the return value is the length of the encoded text.
8572
8573 This function sets `last-coding-system-used' to the precise coding system
8574 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8575 not fully specified.)  */)
8576      (string, coding_system, nocopy, buffer)
8577      Lisp_Object string, coding_system, nocopy, buffer;
8578 {
8579   return code_convert_string (string, coding_system, buffer,
8580                               1, ! NILP (nocopy), 1);
8581 }
8582
8583 \f
8584 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8585        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8586 Return the corresponding character.  */)
8587      (code)
8588      Lisp_Object code;
8589 {
8590   Lisp_Object spec, attrs, val;
8591   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8592   int c;
8593
8594   CHECK_NATNUM (code);
8595   c = XFASTINT (code);
8596   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8597   attrs = AREF (spec, 0);
8598
8599   if (ASCII_BYTE_P (c)
8600       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8601     return code;
8602
8603   val = CODING_ATTR_CHARSET_LIST (attrs);
8604   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8605   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8606   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8607
8608   if (c <= 0x7F)
8609     charset = charset_roman;
8610   else if (c >= 0xA0 && c < 0xDF)
8611     {
8612       charset = charset_kana;
8613       c -= 0x80;
8614     }
8615   else
8616     {
8617       int s1 = c >> 8, s2 = c & 0xFF;
8618
8619       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8620           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8621         error ("Invalid code: %d", code);
8622       SJIS_TO_JIS (c);
8623       charset = charset_kanji;
8624     }
8625   c = DECODE_CHAR (charset, c);
8626   if (c < 0)
8627     error ("Invalid code: %d", code);
8628   return make_number (c);
8629 }
8630
8631
8632 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8633        doc: /* Encode a Japanese character CH to shift_jis encoding.
8634 Return the corresponding code in SJIS.  */)
8635      (ch)
8636     Lisp_Object ch;
8637 {
8638   Lisp_Object spec, attrs, charset_list;
8639   int c;
8640   struct charset *charset;
8641   unsigned code;
8642
8643   CHECK_CHARACTER (ch);
8644   c = XFASTINT (ch);
8645   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8646   attrs = AREF (spec, 0);
8647
8648   if (ASCII_CHAR_P (c)
8649       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8650     return ch;
8651
8652   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8653   charset = char_charset (c, charset_list, &code);
8654   if (code == CHARSET_INVALID_CODE (charset))
8655     error ("Can't encode by shift_jis encoding: %d", c);
8656   JIS_TO_SJIS (code);
8657
8658   return make_number (code);
8659 }
8660
8661 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8662        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8663 Return the corresponding character.  */)
8664      (code)
8665      Lisp_Object code;
8666 {
8667   Lisp_Object spec, attrs, val;
8668   struct charset *charset_roman, *charset_big5, *charset;
8669   int c;
8670
8671   CHECK_NATNUM (code);
8672   c = XFASTINT (code);
8673   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8674   attrs = AREF (spec, 0);
8675
8676   if (ASCII_BYTE_P (c)
8677       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8678     return code;
8679
8680   val = CODING_ATTR_CHARSET_LIST (attrs);
8681   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8682   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8683
8684   if (c <= 0x7F)
8685     charset = charset_roman;
8686   else
8687     {
8688       int b1 = c >> 8, b2 = c & 0x7F;
8689       if (b1 < 0xA1 || b1 > 0xFE
8690           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8691         error ("Invalid code: %d", code);
8692       charset = charset_big5;
8693     }
8694   c = DECODE_CHAR (charset, (unsigned )c);
8695   if (c < 0)
8696     error ("Invalid code: %d", code);
8697   return make_number (c);
8698 }
8699
8700 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8701        doc: /* Encode the Big5 character CH to BIG5 coding system.
8702 Return the corresponding character code in Big5.  */)
8703      (ch)
8704      Lisp_Object ch;
8705 {
8706   Lisp_Object spec, attrs, charset_list;
8707   struct charset *charset;
8708   int c;
8709   unsigned code;
8710
8711   CHECK_CHARACTER (ch);
8712   c = XFASTINT (ch);
8713   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8714   attrs = AREF (spec, 0);
8715   if (ASCII_CHAR_P (c)
8716       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8717     return ch;
8718
8719   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8720   charset = char_charset (c, charset_list, &code);
8721   if (code == CHARSET_INVALID_CODE (charset))
8722     error ("Can't encode by Big5 encoding: %d", c);
8723
8724   return make_number (code);
8725 }
8726
8727 \f
8728 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
8729        Sset_terminal_coding_system_internal, 1, 2, 0,
8730        doc: /* Internal use only.  */)
8731      (coding_system, terminal)
8732      Lisp_Object coding_system;
8733      Lisp_Object terminal;
8734 {
8735   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8736   CHECK_SYMBOL (coding_system);
8737   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
8738   /* We had better not send unsafe characters to terminal.  */
8739   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
8740   /* Characer composition should be disabled.  */
8741   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8742   terminal_coding->src_multibyte = 1;
8743   terminal_coding->dst_multibyte = 0;
8744   return Qnil;
8745 }
8746
8747 DEFUN ("set-safe-terminal-coding-system-internal",
8748        Fset_safe_terminal_coding_system_internal,
8749        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8750        doc: /* Internal use only.  */)
8751      (coding_system)
8752      Lisp_Object coding_system;
8753 {
8754   CHECK_SYMBOL (coding_system);
8755   setup_coding_system (Fcheck_coding_system (coding_system),
8756                        &safe_terminal_coding);
8757   /* Characer composition should be disabled.  */
8758   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8759   safe_terminal_coding.src_multibyte = 1;
8760   safe_terminal_coding.dst_multibyte = 0;
8761   return Qnil;
8762 }
8763
8764 DEFUN ("terminal-coding-system", Fterminal_coding_system,
8765        Sterminal_coding_system, 0, 1, 0,
8766        doc: /* Return coding system specified for terminal output on the given terminal.
8767 TERMINAL may be a terminal id, a frame, or nil for the selected
8768 frame's terminal device.  */)
8769      (terminal)
8770      Lisp_Object terminal;
8771 {
8772   struct coding_system *terminal_coding
8773     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
8774   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
8775
8776   /* For backward compatibility, return nil if it is `undecided'. */
8777   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8778 }
8779
8780 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
8781        Sset_keyboard_coding_system_internal, 1, 2, 0,
8782        doc: /* Internal use only.  */)
8783      (coding_system, terminal)
8784      Lisp_Object coding_system;
8785      Lisp_Object terminal;
8786 {
8787   struct terminal *t = get_terminal (terminal, 1);
8788   CHECK_SYMBOL (coding_system);
8789   setup_coding_system (Fcheck_coding_system (coding_system),
8790                        TERMINAL_KEYBOARD_CODING (t));
8791   /* Characer composition should be disabled.  */
8792   TERMINAL_KEYBOARD_CODING (t)->common_flags
8793     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8794   return Qnil;
8795 }
8796
8797 DEFUN ("keyboard-coding-system",
8798        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
8799        doc: /* Return coding system specified for decoding keyboard input.  */)
8800      (terminal)
8801      Lisp_Object terminal;
8802 {
8803   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
8804                          (get_terminal (terminal, 1))->id);
8805 }
8806
8807 \f
8808 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8809        Sfind_operation_coding_system,  1, MANY, 0,
8810        doc: /* Choose a coding system for an operation based on the target name.
8811 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8812 DECODING-SYSTEM is the coding system to use for decoding
8813 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8814 for encoding (in case OPERATION does encoding).
8815
8816 The first argument OPERATION specifies an I/O primitive:
8817   For file I/O, `insert-file-contents' or `write-region'.
8818   For process I/O, `call-process', `call-process-region', or `start-process'.
8819   For network I/O, `open-network-stream'.
8820
8821 The remaining arguments should be the same arguments that were passed
8822 to the primitive.  Depending on which primitive, one of those arguments
8823 is selected as the TARGET.  For example, if OPERATION does file I/O,
8824 whichever argument specifies the file name is TARGET.
8825
8826 TARGET has a meaning which depends on OPERATION:
8827   For file I/O, TARGET is a file name (except for the special case below).
8828   For process I/O, TARGET is a process name.
8829   For network I/O, TARGET is a service name or a port number.
8830
8831 This function looks up what is specified for TARGET in
8832 `file-coding-system-alist', `process-coding-system-alist',
8833 or `network-coding-system-alist' depending on OPERATION.
8834 They may specify a coding system, a cons of coding systems,
8835 or a function symbol to call.
8836 In the last case, we call the function with one argument,
8837 which is a list of all the arguments given to this function.
8838 If the function can't decide a coding system, it can return
8839 `undecided' so that the normal code-detection is performed.
8840
8841 If OPERATION is `insert-file-contents', the argument corresponding to
8842 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
8843 file name to look up, and BUFFER is a buffer that contains the file's
8844 contents (not yet decoded).  If `file-coding-system-alist' specifies a
8845 function to call for FILENAME, that function should examine the
8846 contents of BUFFER instead of reading the file.
8847
8848 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
8849      (nargs, args)
8850      int nargs;
8851      Lisp_Object *args;
8852 {
8853   Lisp_Object operation, target_idx, target, val;
8854   register Lisp_Object chain;
8855
8856   if (nargs < 2)
8857     error ("Too few arguments");
8858   operation = args[0];
8859   if (!SYMBOLP (operation)
8860       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8861     error ("Invalid first argument");
8862   if (nargs < 1 + XINT (target_idx))
8863     error ("Too few arguments for operation: %s",
8864            SDATA (SYMBOL_NAME (operation)));
8865   target = args[XINT (target_idx) + 1];
8866   if (!(STRINGP (target)
8867         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
8868             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
8869         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8870     error ("Invalid %dth argument", XINT (target_idx) + 1);
8871   if (CONSP (target))
8872     target = XCAR (target);
8873
8874   chain = ((EQ (operation, Qinsert_file_contents)
8875             || EQ (operation, Qwrite_region))
8876            ? Vfile_coding_system_alist
8877            : (EQ (operation, Qopen_network_stream)
8878               ? Vnetwork_coding_system_alist
8879               : Vprocess_coding_system_alist));
8880   if (NILP (chain))
8881     return Qnil;
8882
8883   for (; CONSP (chain); chain = XCDR (chain))
8884     {
8885       Lisp_Object elt;
8886
8887       elt = XCAR (chain);
8888       if (CONSP (elt)
8889           && ((STRINGP (target)
8890                && STRINGP (XCAR (elt))
8891                && fast_string_match (XCAR (elt), target) >= 0)
8892               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8893         {
8894           val = XCDR (elt);
8895           /* Here, if VAL is both a valid coding system and a valid
8896              function symbol, we return VAL as a coding system.  */
8897           if (CONSP (val))
8898             return val;
8899           if (! SYMBOLP (val))
8900             return Qnil;
8901           if (! NILP (Fcoding_system_p (val)))
8902             return Fcons (val, val);
8903           if (! NILP (Ffboundp (val)))
8904             {
8905               /* We use call1 rather than safe_call1
8906                  so as to get bug reports about functions called here
8907                  which don't handle the current interface.  */
8908               val = call1 (val, Flist (nargs, args));
8909               if (CONSP (val))
8910                 return val;
8911               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8912                 return Fcons (val, val);
8913             }
8914           return Qnil;
8915         }
8916     }
8917   return Qnil;
8918 }
8919
8920 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8921        Sset_coding_system_priority, 0, MANY, 0,
8922        doc: /* Assign higher priority to the coding systems given as arguments.
8923 If multiple coding systems belong to the same category,
8924 all but the first one are ignored.
8925
8926 usage: (set-coding-system-priority &rest coding-systems)  */)
8927      (nargs, args)
8928      int nargs;
8929      Lisp_Object *args;
8930 {
8931   int i, j;
8932   int changed[coding_category_max];
8933   enum coding_category priorities[coding_category_max];
8934
8935   bzero (changed, sizeof changed);
8936
8937   for (i = j = 0; i < nargs; i++)
8938     {
8939       enum coding_category category;
8940       Lisp_Object spec, attrs;
8941
8942       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8943       attrs = AREF (spec, 0);
8944       category = XINT (CODING_ATTR_CATEGORY (attrs));
8945       if (changed[category])
8946         /* Ignore this coding system because a coding system of the
8947            same category already had a higher priority.  */
8948         continue;
8949       changed[category] = 1;
8950       priorities[j++] = category;
8951       if (coding_categories[category].id >= 0
8952           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8953         setup_coding_system (args[i], &coding_categories[category]);
8954       Fset (AREF (Vcoding_category_table, category), args[i]);
8955     }
8956
8957   /* Now we have decided top J priorities.  Reflect the order of the
8958      original priorities to the remaining priorities.  */
8959
8960   for (i = j, j = 0; i < coding_category_max; i++, j++)
8961     {
8962       while (j < coding_category_max
8963              && changed[coding_priorities[j]])
8964         j++;
8965       if (j == coding_category_max)
8966         abort ();
8967       priorities[i] = coding_priorities[j];
8968     }
8969
8970   bcopy (priorities, coding_priorities, sizeof priorities);
8971
8972   /* Update `coding-category-list'.  */
8973   Vcoding_category_list = Qnil;
8974   for (i = coding_category_max - 1; i >= 0; i--)
8975     Vcoding_category_list
8976       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8977                Vcoding_category_list);
8978
8979   return Qnil;
8980 }
8981
8982 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8983        Scoding_system_priority_list, 0, 1, 0,
8984        doc: /* Return a list of coding systems ordered by their priorities.
8985 HIGHESTP non-nil means just return the highest priority one.  */)
8986      (highestp)
8987      Lisp_Object highestp;
8988 {
8989   int i;
8990   Lisp_Object val;
8991
8992   for (i = 0, val = Qnil; i < coding_category_max; i++)
8993     {
8994       enum coding_category category = coding_priorities[i];
8995       int id = coding_categories[category].id;
8996       Lisp_Object attrs;
8997
8998       if (id < 0)
8999         continue;
9000       attrs = CODING_ID_ATTRS (id);
9001       if (! NILP (highestp))
9002         return CODING_ATTR_BASE_NAME (attrs);
9003       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9004     }
9005   return Fnreverse (val);
9006 }
9007
9008 static char *suffixes[] = { "-unix", "-dos", "-mac" };
9009
9010 static Lisp_Object
9011 make_subsidiaries (base)
9012      Lisp_Object base;
9013 {
9014   Lisp_Object subsidiaries;
9015   int base_name_len = SBYTES (SYMBOL_NAME (base));
9016   char *buf = (char *) alloca (base_name_len + 6);
9017   int i;
9018
9019   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9020   subsidiaries = Fmake_vector (make_number (3), Qnil);
9021   for (i = 0; i < 3; i++)
9022     {
9023       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9024       ASET (subsidiaries, i, intern (buf));
9025     }
9026   return subsidiaries;
9027 }
9028
9029
9030 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9031        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9032        doc: /* For internal use only.
9033 usage: (define-coding-system-internal ...)  */)
9034      (nargs, args)
9035      int nargs;
9036      Lisp_Object *args;
9037 {
9038   Lisp_Object name;
9039   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9040   Lisp_Object attrs;            /* Vector of attributes.  */
9041   Lisp_Object eol_type;
9042   Lisp_Object aliases;
9043   Lisp_Object coding_type, charset_list, safe_charsets;
9044   enum coding_category category;
9045   Lisp_Object tail, val;
9046   int max_charset_id = 0;
9047   int i;
9048
9049   if (nargs < coding_arg_max)
9050     goto short_args;
9051
9052   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9053
9054   name = args[coding_arg_name];
9055   CHECK_SYMBOL (name);
9056   CODING_ATTR_BASE_NAME (attrs) = name;
9057
9058   val = args[coding_arg_mnemonic];
9059   if (! STRINGP (val))
9060     CHECK_CHARACTER (val);
9061   CODING_ATTR_MNEMONIC (attrs) = val;
9062
9063   coding_type = args[coding_arg_coding_type];
9064   CHECK_SYMBOL (coding_type);
9065   CODING_ATTR_TYPE (attrs) = coding_type;
9066
9067   charset_list = args[coding_arg_charset_list];
9068   if (SYMBOLP (charset_list))
9069     {
9070       if (EQ (charset_list, Qiso_2022))
9071         {
9072           if (! EQ (coding_type, Qiso_2022))
9073             error ("Invalid charset-list");
9074           charset_list = Viso_2022_charset_list;
9075         }
9076       else if (EQ (charset_list, Qemacs_mule))
9077         {
9078           if (! EQ (coding_type, Qemacs_mule))
9079             error ("Invalid charset-list");
9080           charset_list = Vemacs_mule_charset_list;
9081         }
9082       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9083         if (max_charset_id < XFASTINT (XCAR (tail)))
9084           max_charset_id = XFASTINT (XCAR (tail));
9085     }
9086   else
9087     {
9088       charset_list = Fcopy_sequence (charset_list);
9089       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9090         {
9091           struct charset *charset;
9092
9093           val = XCAR (tail);
9094           CHECK_CHARSET_GET_CHARSET (val, charset);
9095           if (EQ (coding_type, Qiso_2022)
9096               ? CHARSET_ISO_FINAL (charset) < 0
9097               : EQ (coding_type, Qemacs_mule)
9098               ? CHARSET_EMACS_MULE_ID (charset) < 0
9099               : 0)
9100             error ("Can't handle charset `%s'",
9101                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9102
9103           XSETCAR (tail, make_number (charset->id));
9104           if (max_charset_id < charset->id)
9105             max_charset_id = charset->id;
9106         }
9107     }
9108   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9109
9110   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
9111                                 make_number (255));
9112   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9113     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9114   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9115
9116   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9117
9118   val = args[coding_arg_decode_translation_table];
9119   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9120     CHECK_SYMBOL (val);
9121   CODING_ATTR_DECODE_TBL (attrs) = val;
9122
9123   val = args[coding_arg_encode_translation_table];
9124   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9125     CHECK_SYMBOL (val);
9126   CODING_ATTR_ENCODE_TBL (attrs) = val;
9127
9128   val = args[coding_arg_post_read_conversion];
9129   CHECK_SYMBOL (val);
9130   CODING_ATTR_POST_READ (attrs) = val;
9131
9132   val = args[coding_arg_pre_write_conversion];
9133   CHECK_SYMBOL (val);
9134   CODING_ATTR_PRE_WRITE (attrs) = val;
9135
9136   val = args[coding_arg_default_char];
9137   if (NILP (val))
9138     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9139   else
9140     {
9141       CHECK_CHARACTER (val);
9142       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9143     }
9144
9145   val = args[coding_arg_for_unibyte];
9146   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9147
9148   val = args[coding_arg_plist];
9149   CHECK_LIST (val);
9150   CODING_ATTR_PLIST (attrs) = val;
9151
9152   if (EQ (coding_type, Qcharset))
9153     {
9154       /* Generate a lisp vector of 256 elements.  Each element is nil,
9155          integer, or a list of charset IDs.
9156
9157          If Nth element is nil, the byte code N is invalid in this
9158          coding system.
9159
9160          If Nth element is a number NUM, N is the first byte of a
9161          charset whose ID is NUM.
9162
9163          If Nth element is a list of charset IDs, N is the first byte
9164          of one of them.  The list is sorted by dimensions of the
9165          charsets.  A charset of smaller dimension comes firtst. */
9166       val = Fmake_vector (make_number (256), Qnil);
9167
9168       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9169         {
9170           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9171           int dim = CHARSET_DIMENSION (charset);
9172           int idx = (dim - 1) * 4;
9173
9174           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9175             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9176
9177           for (i = charset->code_space[idx];
9178                i <= charset->code_space[idx + 1]; i++)
9179             {
9180               Lisp_Object tmp, tmp2;
9181               int dim2;
9182
9183               tmp = AREF (val, i);
9184               if (NILP (tmp))
9185                 tmp = XCAR (tail);
9186               else if (NUMBERP (tmp))
9187                 {
9188                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9189                   if (dim < dim2)
9190                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9191                   else
9192                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9193                 }
9194               else
9195                 {
9196                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9197                     {
9198                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9199                       if (dim < dim2)
9200                         break;
9201                     }
9202                   if (NILP (tmp2))
9203                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9204                   else
9205                     {
9206                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9207                       XSETCAR (tmp2, XCAR (tail));
9208                     }
9209                 }
9210               ASET (val, i, tmp);
9211             }
9212         }
9213       ASET (attrs, coding_attr_charset_valids, val);
9214       category = coding_category_charset;
9215     }
9216   else if (EQ (coding_type, Qccl))
9217     {
9218       Lisp_Object valids;
9219
9220       if (nargs < coding_arg_ccl_max)
9221         goto short_args;
9222
9223       val = args[coding_arg_ccl_decoder];
9224       CHECK_CCL_PROGRAM (val);
9225       if (VECTORP (val))
9226         val = Fcopy_sequence (val);
9227       ASET (attrs, coding_attr_ccl_decoder, val);
9228
9229       val = args[coding_arg_ccl_encoder];
9230       CHECK_CCL_PROGRAM (val);
9231       if (VECTORP (val))
9232         val = Fcopy_sequence (val);
9233       ASET (attrs, coding_attr_ccl_encoder, val);
9234
9235       val = args[coding_arg_ccl_valids];
9236       valids = Fmake_string (make_number (256), make_number (0));
9237       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9238         {
9239           int from, to;
9240
9241           val = Fcar (tail);
9242           if (INTEGERP (val))
9243             {
9244               from = to = XINT (val);
9245               if (from < 0 || from > 255)
9246                 args_out_of_range_3 (val, make_number (0), make_number (255));
9247             }
9248           else
9249             {
9250               CHECK_CONS (val);
9251               CHECK_NATNUM_CAR (val);
9252               CHECK_NATNUM_CDR (val);
9253               from = XINT (XCAR (val));
9254               if (from > 255)
9255                 args_out_of_range_3 (XCAR (val),
9256                                      make_number (0), make_number (255));
9257               to = XINT (XCDR (val));
9258               if (to < from || to > 255)
9259                 args_out_of_range_3 (XCDR (val),
9260                                      XCAR (val), make_number (255));
9261             }
9262           for (i = from; i <= to; i++)
9263             SSET (valids, i, 1);
9264         }
9265       ASET (attrs, coding_attr_ccl_valids, valids);
9266
9267       category = coding_category_ccl;
9268     }
9269   else if (EQ (coding_type, Qutf_16))
9270     {
9271       Lisp_Object bom, endian;
9272
9273       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9274
9275       if (nargs < coding_arg_utf16_max)
9276         goto short_args;
9277
9278       bom = args[coding_arg_utf16_bom];
9279       if (! NILP (bom) && ! EQ (bom, Qt))
9280         {
9281           CHECK_CONS (bom);
9282           val = XCAR (bom);
9283           CHECK_CODING_SYSTEM (val);
9284           val = XCDR (bom);
9285           CHECK_CODING_SYSTEM (val);
9286         }
9287       ASET (attrs, coding_attr_utf_bom, bom);
9288
9289       endian = args[coding_arg_utf16_endian];
9290       CHECK_SYMBOL (endian);
9291       if (NILP (endian))
9292         endian = Qbig;
9293       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9294         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9295       ASET (attrs, coding_attr_utf_16_endian, endian);
9296
9297       category = (CONSP (bom)
9298                   ? coding_category_utf_16_auto
9299                   : NILP (bom)
9300                   ? (EQ (endian, Qbig)
9301                      ? coding_category_utf_16_be_nosig
9302                      : coding_category_utf_16_le_nosig)
9303                   : (EQ (endian, Qbig)
9304                      ? coding_category_utf_16_be
9305                      : coding_category_utf_16_le));
9306     }
9307   else if (EQ (coding_type, Qiso_2022))
9308     {
9309       Lisp_Object initial, reg_usage, request, flags;
9310       int i;
9311
9312       if (nargs < coding_arg_iso2022_max)
9313         goto short_args;
9314
9315       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9316       CHECK_VECTOR (initial);
9317       for (i = 0; i < 4; i++)
9318         {
9319           val = Faref (initial, make_number (i));
9320           if (! NILP (val))
9321             {
9322               struct charset *charset;
9323
9324               CHECK_CHARSET_GET_CHARSET (val, charset);
9325               ASET (initial, i, make_number (CHARSET_ID (charset)));
9326               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9327                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9328             }
9329           else
9330             ASET (initial, i, make_number (-1));
9331         }
9332
9333       reg_usage = args[coding_arg_iso2022_reg_usage];
9334       CHECK_CONS (reg_usage);
9335       CHECK_NUMBER_CAR (reg_usage);
9336       CHECK_NUMBER_CDR (reg_usage);
9337
9338       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9339       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9340         {
9341           int id;
9342           Lisp_Object tmp;
9343
9344           val = Fcar (tail);
9345           CHECK_CONS (val);
9346           tmp = XCAR (val);
9347           CHECK_CHARSET_GET_ID (tmp, id);
9348           CHECK_NATNUM_CDR (val);
9349           if (XINT (XCDR (val)) >= 4)
9350             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9351           XSETCAR (val, make_number (id));
9352         }
9353
9354       flags = args[coding_arg_iso2022_flags];
9355       CHECK_NATNUM (flags);
9356       i = XINT (flags);
9357       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9358         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9359
9360       ASET (attrs, coding_attr_iso_initial, initial);
9361       ASET (attrs, coding_attr_iso_usage, reg_usage);
9362       ASET (attrs, coding_attr_iso_request, request);
9363       ASET (attrs, coding_attr_iso_flags, flags);
9364       setup_iso_safe_charsets (attrs);
9365
9366       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9367         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9368                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9369                     ? coding_category_iso_7_else
9370                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9371                     ? coding_category_iso_7
9372                     : coding_category_iso_7_tight);
9373       else
9374         {
9375           int id = XINT (AREF (initial, 1));
9376
9377           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9378                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9379                        || id < 0)
9380                       ? coding_category_iso_8_else
9381                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9382                       ? coding_category_iso_8_1
9383                       : coding_category_iso_8_2);
9384         }
9385       if (category != coding_category_iso_8_1
9386           && category != coding_category_iso_8_2)
9387         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9388     }
9389   else if (EQ (coding_type, Qemacs_mule))
9390     {
9391       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9392         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9393       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9394       category = coding_category_emacs_mule;
9395     }
9396   else if (EQ (coding_type, Qshift_jis))
9397     {
9398
9399       struct charset *charset;
9400
9401       if (XINT (Flength (charset_list)) != 3
9402           && XINT (Flength (charset_list)) != 4)
9403         error ("There should be three or four charsets");
9404
9405       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9406       if (CHARSET_DIMENSION (charset) != 1)
9407         error ("Dimension of charset %s is not one",
9408                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9409       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9410         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9411
9412       charset_list = XCDR (charset_list);
9413       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9414       if (CHARSET_DIMENSION (charset) != 1)
9415         error ("Dimension of charset %s is not one",
9416                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9417
9418       charset_list = XCDR (charset_list);
9419       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9420       if (CHARSET_DIMENSION (charset) != 2)
9421         error ("Dimension of charset %s is not two",
9422                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9423
9424       charset_list = XCDR (charset_list);
9425       if (! NILP (charset_list))
9426         {
9427           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9428           if (CHARSET_DIMENSION (charset) != 2)
9429             error ("Dimension of charset %s is not two",
9430                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9431         }
9432
9433       category = coding_category_sjis;
9434       Vsjis_coding_system = name;
9435     }
9436   else if (EQ (coding_type, Qbig5))
9437     {
9438       struct charset *charset;
9439
9440       if (XINT (Flength (charset_list)) != 2)
9441         error ("There should be just two charsets");
9442
9443       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9444       if (CHARSET_DIMENSION (charset) != 1)
9445         error ("Dimension of charset %s is not one",
9446                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9447       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9448         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9449
9450       charset_list = XCDR (charset_list);
9451       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9452       if (CHARSET_DIMENSION (charset) != 2)
9453         error ("Dimension of charset %s is not two",
9454                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9455
9456       category = coding_category_big5;
9457       Vbig5_coding_system = name;
9458     }
9459   else if (EQ (coding_type, Qraw_text))
9460     {
9461       category = coding_category_raw_text;
9462       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9463     }
9464   else if (EQ (coding_type, Qutf_8))
9465     {
9466       Lisp_Object bom;
9467
9468       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9469
9470       if (nargs < coding_arg_utf8_max)
9471         goto short_args;
9472
9473       bom = args[coding_arg_utf8_bom];
9474       if (! NILP (bom) && ! EQ (bom, Qt))
9475         {
9476           CHECK_CONS (bom);
9477           val = XCAR (bom);
9478           CHECK_CODING_SYSTEM (val);
9479           val = XCDR (bom);
9480           CHECK_CODING_SYSTEM (val);
9481         }
9482       ASET (attrs, coding_attr_utf_bom, bom);
9483
9484       category = (CONSP (bom) ? coding_category_utf_8_auto
9485                   : NILP (bom) ? coding_category_utf_8_nosig
9486                   : coding_category_utf_8_sig);
9487     }
9488   else if (EQ (coding_type, Qundecided))
9489     category = coding_category_undecided;
9490   else
9491     error ("Invalid coding system type: %s",
9492            SDATA (SYMBOL_NAME (coding_type)));
9493
9494   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9495   CODING_ATTR_PLIST (attrs)
9496     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9497                                 CODING_ATTR_PLIST (attrs)));
9498   CODING_ATTR_PLIST (attrs)
9499     = Fcons (QCascii_compatible_p,
9500              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9501                     CODING_ATTR_PLIST (attrs)));
9502
9503   eol_type = args[coding_arg_eol_type];
9504   if (! NILP (eol_type)
9505       && ! EQ (eol_type, Qunix)
9506       && ! EQ (eol_type, Qdos)
9507       && ! EQ (eol_type, Qmac))
9508     error ("Invalid eol-type");
9509
9510   aliases = Fcons (name, Qnil);
9511
9512   if (NILP (eol_type))
9513     {
9514       eol_type = make_subsidiaries (name);
9515       for (i = 0; i < 3; i++)
9516         {
9517           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9518
9519           this_name = AREF (eol_type, i);
9520           this_aliases = Fcons (this_name, Qnil);
9521           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9522           this_spec = Fmake_vector (make_number (3), attrs);
9523           ASET (this_spec, 1, this_aliases);
9524           ASET (this_spec, 2, this_eol_type);
9525           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9526           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9527           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9528           if (NILP (val))
9529             Vcoding_system_alist
9530               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9531                        Vcoding_system_alist);
9532         }
9533     }
9534
9535   spec_vec = Fmake_vector (make_number (3), attrs);
9536   ASET (spec_vec, 1, aliases);
9537   ASET (spec_vec, 2, eol_type);
9538
9539   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9540   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9541   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9542   if (NILP (val))
9543     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9544                                   Vcoding_system_alist);
9545
9546   {
9547     int id = coding_categories[category].id;
9548
9549     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9550       setup_coding_system (name, &coding_categories[category]);
9551   }
9552
9553   return Qnil;
9554
9555  short_args:
9556   return Fsignal (Qwrong_number_of_arguments,
9557                   Fcons (intern ("define-coding-system-internal"),
9558                          make_number (nargs)));
9559 }
9560
9561
9562 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9563        3, 3, 0,
9564        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9565   (coding_system, prop, val)
9566      Lisp_Object coding_system, prop, val;
9567 {
9568   Lisp_Object spec, attrs;
9569
9570   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9571   attrs = AREF (spec, 0);
9572   if (EQ (prop, QCmnemonic))
9573     {
9574       if (! STRINGP (val))
9575         CHECK_CHARACTER (val);
9576       CODING_ATTR_MNEMONIC (attrs) = val;
9577     }
9578   else if (EQ (prop, QCdefalut_char))
9579     {
9580       if (NILP (val))
9581         val = make_number (' ');
9582       else
9583         CHECK_CHARACTER (val);
9584       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9585     }
9586   else if (EQ (prop, QCdecode_translation_table))
9587     {
9588       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9589         CHECK_SYMBOL (val);
9590       CODING_ATTR_DECODE_TBL (attrs) = val;
9591     }
9592   else if (EQ (prop, QCencode_translation_table))
9593     {
9594       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9595         CHECK_SYMBOL (val);
9596       CODING_ATTR_ENCODE_TBL (attrs) = val;
9597     }
9598   else if (EQ (prop, QCpost_read_conversion))
9599     {
9600       CHECK_SYMBOL (val);
9601       CODING_ATTR_POST_READ (attrs) = val;
9602     }
9603   else if (EQ (prop, QCpre_write_conversion))
9604     {
9605       CHECK_SYMBOL (val);
9606       CODING_ATTR_PRE_WRITE (attrs) = val;
9607     }
9608   else if (EQ (prop, QCascii_compatible_p))
9609     {
9610       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9611     }
9612
9613   CODING_ATTR_PLIST (attrs)
9614     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9615   return val;
9616 }
9617
9618
9619 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9620        Sdefine_coding_system_alias, 2, 2, 0,
9621        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9622      (alias, coding_system)
9623      Lisp_Object alias, coding_system;
9624 {
9625   Lisp_Object spec, aliases, eol_type, val;
9626
9627   CHECK_SYMBOL (alias);
9628   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9629   aliases = AREF (spec, 1);
9630   /* ALIASES should be a list of length more than zero, and the first
9631      element is a base coding system.  Append ALIAS at the tail of the
9632      list.  */
9633   while (!NILP (XCDR (aliases)))
9634     aliases = XCDR (aliases);
9635   XSETCDR (aliases, Fcons (alias, Qnil));
9636
9637   eol_type = AREF (spec, 2);
9638   if (VECTORP (eol_type))
9639     {
9640       Lisp_Object subsidiaries;
9641       int i;
9642
9643       subsidiaries = make_subsidiaries (alias);
9644       for (i = 0; i < 3; i++)
9645         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9646                                      AREF (eol_type, i));
9647     }
9648
9649   Fputhash (alias, spec, Vcoding_system_hash_table);
9650   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9651   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9652   if (NILP (val))
9653     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9654                                   Vcoding_system_alist);
9655
9656   return Qnil;
9657 }
9658
9659 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9660        1, 1, 0,
9661        doc: /* Return the base of CODING-SYSTEM.
9662 Any alias or subsidiary coding system is not a base coding system.  */)
9663   (coding_system)
9664      Lisp_Object coding_system;
9665 {
9666   Lisp_Object spec, attrs;
9667
9668   if (NILP (coding_system))
9669     return (Qno_conversion);
9670   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9671   attrs = AREF (spec, 0);
9672   return CODING_ATTR_BASE_NAME (attrs);
9673 }
9674
9675 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9676        1, 1, 0,
9677        doc: "Return the property list of CODING-SYSTEM.")
9678      (coding_system)
9679      Lisp_Object coding_system;
9680 {
9681   Lisp_Object spec, attrs;
9682
9683   if (NILP (coding_system))
9684     coding_system = Qno_conversion;
9685   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9686   attrs = AREF (spec, 0);
9687   return CODING_ATTR_PLIST (attrs);
9688 }
9689
9690
9691 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9692        1, 1, 0,
9693        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9694      (coding_system)
9695      Lisp_Object coding_system;
9696 {
9697   Lisp_Object spec;
9698
9699   if (NILP (coding_system))
9700     coding_system = Qno_conversion;
9701   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9702   return AREF (spec, 1);
9703 }
9704
9705 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9706        Scoding_system_eol_type, 1, 1, 0,
9707        doc: /* Return eol-type of CODING-SYSTEM.
9708 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
9709
9710 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9711 and CR respectively.
9712
9713 A vector value indicates that a format of end-of-line should be
9714 detected automatically.  Nth element of the vector is the subsidiary
9715 coding system whose eol-type is N.  */)
9716      (coding_system)
9717      Lisp_Object coding_system;
9718 {
9719   Lisp_Object spec, eol_type;
9720   int n;
9721
9722   if (NILP (coding_system))
9723     coding_system = Qno_conversion;
9724   if (! CODING_SYSTEM_P (coding_system))
9725     return Qnil;
9726   spec = CODING_SYSTEM_SPEC (coding_system);
9727   eol_type = AREF (spec, 2);
9728   if (VECTORP (eol_type))
9729     return Fcopy_sequence (eol_type);
9730   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9731   return make_number (n);
9732 }
9733
9734 #endif /* emacs */
9735
9736 \f
/*** 12. Postamble ***/
9738
9739 void
9740 init_coding_once ()
9741 {
9742   int i;
9743
9744   for (i = 0; i < coding_category_max; i++)
9745     {
9746       coding_categories[i].id = -1;
9747       coding_priorities[i] = i;
9748     }
9749
9750   /* ISO2022 specific initialize routine.  */
9751   for (i = 0; i < 0x20; i++)
9752     iso_code_class[i] = ISO_control_0;
9753   for (i = 0x21; i < 0x7F; i++)
9754     iso_code_class[i] = ISO_graphic_plane_0;
9755   for (i = 0x80; i < 0xA0; i++)
9756     iso_code_class[i] = ISO_control_1;
9757   for (i = 0xA1; i < 0xFF; i++)
9758     iso_code_class[i] = ISO_graphic_plane_1;
9759   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9760   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9761   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9762   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9763   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9764   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9765   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9766   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9767   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9768
9769   for (i = 0; i < 256; i++)
9770     {
9771       emacs_mule_bytes[i] = 1;
9772     }
9773   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9774   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9775   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9776   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9777 }
9778
9779 #ifdef emacs
9780
9781 void
9782 syms_of_coding ()
9783 {
9784   staticpro (&Vcoding_system_hash_table);
9785   {
9786     Lisp_Object args[2];
9787     args[0] = QCtest;
9788     args[1] = Qeq;
9789     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9790   }
9791
9792   staticpro (&Vsjis_coding_system);
9793   Vsjis_coding_system = Qnil;
9794
9795   staticpro (&Vbig5_coding_system);
9796   Vbig5_coding_system = Qnil;
9797
9798   staticpro (&Vcode_conversion_reused_workbuf);
9799   Vcode_conversion_reused_workbuf = Qnil;
9800
9801   staticpro (&Vcode_conversion_workbuf_name);
9802   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9803
9804   reused_workbuf_in_use = 0;
9805
9806   DEFSYM (Qcharset, "charset");
9807   DEFSYM (Qtarget_idx, "target-idx");
9808   DEFSYM (Qcoding_system_history, "coding-system-history");
9809   Fset (Qcoding_system_history, Qnil);
9810
9811   /* Target FILENAME is the first argument.  */
9812   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9813   /* Target FILENAME is the third argument.  */
9814   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9815
9816   DEFSYM (Qcall_process, "call-process");
9817   /* Target PROGRAM is the first argument.  */
9818   Fput (Qcall_process, Qtarget_idx, make_number (0));
9819
9820   DEFSYM (Qcall_process_region, "call-process-region");
9821   /* Target PROGRAM is the third argument.  */
9822   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9823
9824   DEFSYM (Qstart_process, "start-process");
9825   /* Target PROGRAM is the third argument.  */
9826   Fput (Qstart_process, Qtarget_idx, make_number (2));
9827
9828   DEFSYM (Qopen_network_stream, "open-network-stream");
9829   /* Target SERVICE is the fourth argument.  */
9830   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9831
9832   DEFSYM (Qcoding_system, "coding-system");
9833   DEFSYM (Qcoding_aliases, "coding-aliases");
9834
9835   DEFSYM (Qeol_type, "eol-type");
9836   DEFSYM (Qunix, "unix");
9837   DEFSYM (Qdos, "dos");
9838
9839   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9840   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9841   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9842   DEFSYM (Qdefault_char, "default-char");
9843   DEFSYM (Qundecided, "undecided");
9844   DEFSYM (Qno_conversion, "no-conversion");
9845   DEFSYM (Qraw_text, "raw-text");
9846
9847   DEFSYM (Qiso_2022, "iso-2022");
9848
9849   DEFSYM (Qutf_8, "utf-8");
9850   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9851
9852   DEFSYM (Qutf_16, "utf-16");
9853   DEFSYM (Qbig, "big");
9854   DEFSYM (Qlittle, "little");
9855
9856   DEFSYM (Qshift_jis, "shift-jis");
9857   DEFSYM (Qbig5, "big5");
9858
9859   DEFSYM (Qcoding_system_p, "coding-system-p");
9860
9861   DEFSYM (Qcoding_system_error, "coding-system-error");
9862   Fput (Qcoding_system_error, Qerror_conditions,
9863         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9864   Fput (Qcoding_system_error, Qerror_message,
9865         build_string ("Invalid coding system"));
9866
9867   /* Intern this now in case it isn't already done.
9868      Setting this variable twice is harmless.
9869      But don't staticpro it here--that is done in alloc.c.  */
9870   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9871
9872   DEFSYM (Qtranslation_table, "translation-table");
9873   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9874   DEFSYM (Qtranslation_table_id, "translation-table-id");
9875   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9876   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9877
9878   DEFSYM (Qvalid_codes, "valid-codes");
9879
9880   DEFSYM (Qemacs_mule, "emacs-mule");
9881
9882   DEFSYM (QCcategory, ":category");
9883   DEFSYM (QCmnemonic, ":mnemonic");
9884   DEFSYM (QCdefalut_char, ":default-char");
9885   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9886   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9887   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9888   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9889   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9890
9891   Vcoding_category_table
9892     = Fmake_vector (make_number (coding_category_max), Qnil);
9893   staticpro (&Vcoding_category_table);
9894   /* Followings are target of code detection.  */
9895   ASET (Vcoding_category_table, coding_category_iso_7,
9896         intern ("coding-category-iso-7"));
9897   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9898         intern ("coding-category-iso-7-tight"));
9899   ASET (Vcoding_category_table, coding_category_iso_8_1,
9900         intern ("coding-category-iso-8-1"));
9901   ASET (Vcoding_category_table, coding_category_iso_8_2,
9902         intern ("coding-category-iso-8-2"));
9903   ASET (Vcoding_category_table, coding_category_iso_7_else,
9904         intern ("coding-category-iso-7-else"));
9905   ASET (Vcoding_category_table, coding_category_iso_8_else,
9906         intern ("coding-category-iso-8-else"));
9907   ASET (Vcoding_category_table, coding_category_utf_8_auto,
9908         intern ("coding-category-utf-8-auto"));
9909   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
9910         intern ("coding-category-utf-8"));
9911   ASET (Vcoding_category_table, coding_category_utf_8_sig,
9912         intern ("coding-category-utf-8-sig"));
9913   ASET (Vcoding_category_table, coding_category_utf_16_be,
9914         intern ("coding-category-utf-16-be"));
9915   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9916         intern ("coding-category-utf-16-auto"));
9917   ASET (Vcoding_category_table, coding_category_utf_16_le,
9918         intern ("coding-category-utf-16-le"));
9919   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9920         intern ("coding-category-utf-16-be-nosig"));
9921   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9922         intern ("coding-category-utf-16-le-nosig"));
9923   ASET (Vcoding_category_table, coding_category_charset,
9924         intern ("coding-category-charset"));
9925   ASET (Vcoding_category_table, coding_category_sjis,
9926         intern ("coding-category-sjis"));
9927   ASET (Vcoding_category_table, coding_category_big5,
9928         intern ("coding-category-big5"));
9929   ASET (Vcoding_category_table, coding_category_ccl,
9930         intern ("coding-category-ccl"));
9931   ASET (Vcoding_category_table, coding_category_emacs_mule,
9932         intern ("coding-category-emacs-mule"));
9933   /* Followings are NOT target of code detection.  */
9934   ASET (Vcoding_category_table, coding_category_raw_text,
9935         intern ("coding-category-raw-text"));
9936   ASET (Vcoding_category_table, coding_category_undecided,
9937         intern ("coding-category-undecided"));
9938
9939   DEFSYM (Qinsufficient_source, "insufficient-source");
9940   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9941   DEFSYM (Qinvalid_source, "invalid-source");
9942   DEFSYM (Qinterrupted, "interrupted");
9943   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9944   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9945
9946   defsubr (&Scoding_system_p);
9947   defsubr (&Sread_coding_system);
9948   defsubr (&Sread_non_nil_coding_system);
9949   defsubr (&Scheck_coding_system);
9950   defsubr (&Sdetect_coding_region);
9951   defsubr (&Sdetect_coding_string);
9952   defsubr (&Sfind_coding_systems_region_internal);
9953   defsubr (&Sunencodable_char_position);
9954   defsubr (&Scheck_coding_systems_region);
9955   defsubr (&Sdecode_coding_region);
9956   defsubr (&Sencode_coding_region);
9957   defsubr (&Sdecode_coding_string);
9958   defsubr (&Sencode_coding_string);
9959   defsubr (&Sdecode_sjis_char);
9960   defsubr (&Sencode_sjis_char);
9961   defsubr (&Sdecode_big5_char);
9962   defsubr (&Sencode_big5_char);
9963   defsubr (&Sset_terminal_coding_system_internal);
9964   defsubr (&Sset_safe_terminal_coding_system_internal);
9965   defsubr (&Sterminal_coding_system);
9966   defsubr (&Sset_keyboard_coding_system_internal);
9967   defsubr (&Skeyboard_coding_system);
9968   defsubr (&Sfind_operation_coding_system);
9969   defsubr (&Sset_coding_system_priority);
9970   defsubr (&Sdefine_coding_system_internal);
9971   defsubr (&Sdefine_coding_system_alias);
9972   defsubr (&Scoding_system_put);
9973   defsubr (&Scoding_system_base);
9974   defsubr (&Scoding_system_plist);
9975   defsubr (&Scoding_system_aliases);
9976   defsubr (&Scoding_system_eol_type);
9977   defsubr (&Scoding_system_priority_list);
9978
9979   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9980                doc: /* List of coding systems.
9981
9982 Do not alter the value of this variable manually.  This variable should be
9983 updated by the functions `define-coding-system' and
9984 `define-coding-system-alias'.  */);
9985   Vcoding_system_list = Qnil;
9986
9987   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9988                doc: /* Alist of coding system names.
9989 Each element is one element list of coding system name.
9990 This variable is given to `completing-read' as COLLECTION argument.
9991
9992 Do not alter the value of this variable manually.  This variable should be
9993 updated by the functions `make-coding-system' and
9994 `define-coding-system-alias'.  */);
9995   Vcoding_system_alist = Qnil;
9996
9997   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9998                doc: /* List of coding-categories (symbols) ordered by priority.
9999
10000 On detecting a coding system, Emacs tries code detection algorithms
10001 associated with each coding-category one by one in this order.  When
10002 one algorithm agrees with a byte sequence of source text, the coding
10003 system bound to the corresponding coding-category is selected.
10004
10005 Don't modify this variable directly, but use `set-coding-priority'.  */);
10006   {
10007     int i;
10008
10009     Vcoding_category_list = Qnil;
10010     for (i = coding_category_max - 1; i >= 0; i--)
10011       Vcoding_category_list
10012         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10013                  Vcoding_category_list);
10014   }
10015
10016   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10017                doc: /* Specify the coding system for read operations.
10018 It is useful to bind this variable with `let', but do not set it globally.
10019 If the value is a coding system, it is used for decoding on read operation.
10020 If not, an appropriate element is used from one of the coding system alists.
10021 There are three such tables: `file-coding-system-alist',
10022 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10023   Vcoding_system_for_read = Qnil;
10024
10025   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10026                doc: /* Specify the coding system for write operations.
10027 Programs bind this variable with `let', but you should not set it globally.
10028 If the value is a coding system, it is used for encoding of output,
10029 when writing it to a file and when sending it to a file or subprocess.
10030
10031 If this does not specify a coding system, an appropriate element
10032 is used from one of the coding system alists.
10033 There are three such tables: `file-coding-system-alist',
10034 `process-coding-system-alist', and `network-coding-system-alist'.
10035 For output to files, if the above procedure does not specify a coding system,
10036 the value of `buffer-file-coding-system' is used.  */);
10037   Vcoding_system_for_write = Qnil;
10038
10039   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10040                doc: /*
10041 Coding system used in the latest file or process I/O.  */);
10042   Vlast_coding_system_used = Qnil;
10043
10044   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10045                doc: /*
10046 Error status of the last code conversion.
10047
10048 When an error was detected in the last code conversion, this variable
10049 is set to one of the following symbols.
10050   `insufficient-source'
10051   `inconsistent-eol'
10052   `invalid-source'
10053   `interrupted'
10054   `insufficient-memory'
10055 When no error was detected, the value doesn't change.  So, to check
10056 the error status of a code conversion by this variable, you must
10057 explicitly set this variable to nil before performing code
10058 conversion.  */);
10059   Vlast_code_conversion_error = Qnil;
10060
10061   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10062                doc: /*
10063 *Non-nil means always inhibit code conversion of end-of-line format.
10064 See info node `Coding Systems' and info node `Text and Binary' concerning
10065 such conversion.  */);
10066   inhibit_eol_conversion = 0;
10067
10068   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10069                doc: /*
10070 Non-nil means process buffer inherits coding system of process output.
10071 Bind it to t if the process output is to be treated as if it were a file
10072 read from some filesystem.  */);
10073   inherit_process_coding_system = 0;
10074
10075   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10076                doc: /*
10077 Alist to decide a coding system to use for a file I/O operation.
10078 The format is ((PATTERN . VAL) ...),
10079 where PATTERN is a regular expression matching a file name,
10080 VAL is a coding system, a cons of coding systems, or a function symbol.
10081 If VAL is a coding system, it is used for both decoding and encoding
10082 the file contents.
10083 If VAL is a cons of coding systems, the car part is used for decoding,
10084 and the cdr part is used for encoding.
10085 If VAL is a function symbol, the function must return a coding system
10086 or a cons of coding systems which are used as above.  The function is
10087 called with an argument that is a list of the arguments with which
10088 `find-operation-coding-system' was called.  If the function can't decide
10089 a coding system, it can return `undecided' so that the normal
10090 code-detection is performed.
10091
10092 See also the function `find-operation-coding-system'
10093 and the variable `auto-coding-alist'.  */);
10094   Vfile_coding_system_alist = Qnil;
10095
10096   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10097                doc: /*
10098 Alist to decide a coding system to use for a process I/O operation.
10099 The format is ((PATTERN . VAL) ...),
10100 where PATTERN is a regular expression matching a program name,
10101 VAL is a coding system, a cons of coding systems, or a function symbol.
10102 If VAL is a coding system, it is used for both decoding what received
10103 from the program and encoding what sent to the program.
10104 If VAL is a cons of coding systems, the car part is used for decoding,
10105 and the cdr part is used for encoding.
10106 If VAL is a function symbol, the function must return a coding system
10107 or a cons of coding systems which are used as above.
10108
10109 See also the function `find-operation-coding-system'.  */);
10110   Vprocess_coding_system_alist = Qnil;
10111
10112   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10113                doc: /*
10114 Alist to decide a coding system to use for a network I/O operation.
10115 The format is ((PATTERN . VAL) ...),
10116 where PATTERN is a regular expression matching a network service name
10117 or is a port number to connect to,
10118 VAL is a coding system, a cons of coding systems, or a function symbol.
10119 If VAL is a coding system, it is used for both decoding what received
10120 from the network stream and encoding what sent to the network stream.
10121 If VAL is a cons of coding systems, the car part is used for decoding,
10122 and the cdr part is used for encoding.
10123 If VAL is a function symbol, the function must return a coding system
10124 or a cons of coding systems which are used as above.
10125
10126 See also the function `find-operation-coding-system'.  */);
10127   Vnetwork_coding_system_alist = Qnil;
10128
10129   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10130                doc: /* Coding system to use with system messages.
10131 Also used for decoding keyboard input on X Window system.  */);
10132   Vlocale_coding_system = Qnil;
10133
10134   /* The eol mnemonics are reset in startup.el system-dependently.  */
10135   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10136                doc: /*
10137 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10138   eol_mnemonic_unix = build_string (":");
10139
10140   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10141                doc: /*
10142 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10143   eol_mnemonic_dos = build_string ("\\");
10144
10145   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10146                doc: /*
10147 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10148   eol_mnemonic_mac = build_string ("/");
10149
10150   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10151                doc: /*
10152 *String displayed in mode line when end-of-line format is not yet determined.  */);
10153   eol_mnemonic_undecided = build_string (":");
10154
10155   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10156                doc: /*
10157 *Non-nil enables character translation while encoding and decoding.  */);
10158   Venable_character_translation = Qt;
10159
10160   DEFVAR_LISP ("standard-translation-table-for-decode",
10161                &Vstandard_translation_table_for_decode,
10162                doc: /* Table for translating characters while decoding.  */);
10163   Vstandard_translation_table_for_decode = Qnil;
10164
10165   DEFVAR_LISP ("standard-translation-table-for-encode",
10166                &Vstandard_translation_table_for_encode,
10167                doc: /* Table for translating characters while encoding.  */);
10168   Vstandard_translation_table_for_encode = Qnil;
10169
10170   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10171                doc: /* Alist of charsets vs revision numbers.
10172 While encoding, if a charset (car part of an element) is found,
10173 designate it with the escape sequence identifying revision (cdr part
10174 of the element).  */);
10175   Vcharset_revision_table = Qnil;
10176
10177   DEFVAR_LISP ("default-process-coding-system",
10178                &Vdefault_process_coding_system,
10179                doc: /* Cons of coding systems used for process I/O by default.
10180 The car part is used for decoding a process output,
10181 the cdr part is used for encoding a text to be sent to a process.  */);
10182   Vdefault_process_coding_system = Qnil;
10183
10184   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10185                doc: /*
10186 Table of extra Latin codes in the range 128..159 (inclusive).
10187 This is a vector of length 256.
10188 If Nth element is non-nil, the existence of code N in a file
10189 \(or output of subprocess) doesn't prevent it to be detected as
10190 a coding system of ISO 2022 variant which has a flag
10191 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10192 or reading output of a subprocess.
10193 Only 128th through 159th elements have a meaning.  */);
10194   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10195
10196   DEFVAR_LISP ("select-safe-coding-system-function",
10197                &Vselect_safe_coding_system_function,
10198                doc: /*
10199 Function to call to select safe coding system for encoding a text.
10200
10201 If set, this function is called to force a user to select a proper
10202 coding system which can encode the text in the case that a default
10203 coding system used in each operation can't encode the text.  The
10204 function should take care that the buffer is not modified while
10205 the coding system is being selected.
10206
10207 The default value is `select-safe-coding-system' (which see).  */);
10208   Vselect_safe_coding_system_function = Qnil;
10209
10210   DEFVAR_BOOL ("coding-system-require-warning",
10211                &coding_system_require_warning,
10212                doc: /* Internal use only.
10213 If non-nil, on writing a file, `select-safe-coding-system-function' is
10214 called even if `coding-system-for-write' is non-nil.  The command
10215 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10216   coding_system_require_warning = 0;
10217
10218
10219   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10220                &inhibit_iso_escape_detection,
10221                doc: /*
10222 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
10223
10224 By default, on reading a file, Emacs tries to detect how the text is
10225 encoded.  This code detection is sensitive to escape sequences.  If
10226 the sequence is valid as ISO2022, the code is determined as one of
10227 the ISO2022 encodings, and the file is decoded by the corresponding
10228 coding system (e.g. `iso-2022-7bit').
10229
10230 However, there may be a case that you want to read escape sequences in
10231 a file as is.  In such a case, you can set this variable to non-nil.
10232 Then, as the code detection ignores any escape sequences, no file is
10233 detected as encoded in some ISO2022 encoding.  The result is that all
10234 escape sequences become visible in a buffer.
10235
10236 The default value is nil, and it is strongly recommended not to change
10237 it.  That is because many Emacs Lisp source files that contain
10238 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10239 in Emacs's distribution, and they won't be decoded correctly on
10240 reading if you suppress escape sequence detection.
10241
10242 The other way to read escape sequences in a file without decoding is
10243 to explicitly specify some coding system that doesn't use ISO2022's
10244 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10245   inhibit_iso_escape_detection = 0;
10246
10247   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10248                doc: /* Char table for translating self-inserting characters.
10249 This is applied to the result of input methods, not their input.
10250 See also `keyboard-translate-table'.  */);
10251     Vtranslation_table_for_input = Qnil;
10252
10253   {
10254     Lisp_Object args[coding_arg_max];
10255     Lisp_Object plist[16];
10256     int i;
10257
10258     for (i = 0; i < coding_arg_max; i++)
10259       args[i] = Qnil;
10260
10261     plist[0] = intern (":name");
10262     plist[1] = args[coding_arg_name] = Qno_conversion;
10263     plist[2] = intern (":mnemonic");
10264     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10265     plist[4] = intern (":coding-type");
10266     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10267     plist[6] = intern (":ascii-compatible-p");
10268     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10269     plist[8] = intern (":default-char");
10270     plist[9] = args[coding_arg_default_char] = make_number (0);
10271     plist[10] = intern (":for-unibyte");
10272     plist[11] = args[coding_arg_for_unibyte] = Qt;
10273     plist[12] = intern (":docstring");
10274     plist[13] = build_string ("Do no conversion.\n\
10275 \n\
10276 When you visit a file with this coding, the file is read into a\n\
10277 unibyte buffer as is, thus each byte of a file is treated as a\n\
10278 character.");
10279     plist[14] = intern (":eol-type");
10280     plist[15] = args[coding_arg_eol_type] = Qunix;
10281     args[coding_arg_plist] = Flist (16, plist);
10282     Fdefine_coding_system_internal (coding_arg_max, args);
10283
10284     plist[1] = args[coding_arg_name] = Qundecided;
10285     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10286     plist[5] = args[coding_arg_coding_type] = Qundecided;
10287     /* This is already set.
10288        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10289     plist[8] = intern (":charset-list");
10290     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10291     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10292     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10293     plist[15] = args[coding_arg_eol_type] = Qnil;
10294     args[coding_arg_plist] = Flist (16, plist);
10295     Fdefine_coding_system_internal (coding_arg_max, args);
10296   }
10297
10298   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10299
10300   {
10301     int i;
10302
10303     for (i = 0; i < coding_category_max; i++)
10304       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10305   }
10306 #if defined (MSDOS) || defined (WINDOWSNT)
10307   system_eol_type = Qdos;
10308 #else
10309   system_eol_type = Qunix;
10310 #endif
10311   staticpro (&system_eol_type);
10312 }
10313
10314 char *
10315 emacs_strerror (error_number)
10316      int error_number;
10317 {
10318   char *str;
10319
10320   synchronize_system_messages_locale ();
10321   str = strerror (error_number);
10322
10323   if (! NILP (Vlocale_coding_system))
10324     {
10325       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10326                                                       Vlocale_coding_system,
10327                                                       0);
10328       str = (char *) SDATA (dec);
10329     }
10330
10331   return str;
10332 }
10333
10334 #endif /* emacs */
10335
10336 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10337    (do not change this comment) */