src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009, 2010
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking
  63   up Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted,
 171          jump to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted without finding an invalid byte.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size;
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source code, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it.  */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that source area and destination area are
 258   overlapped, which means that we can produce an encoded text until it
 259   reaches the head of not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292 #include <setjmp.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 #include "frame.h"
 303 #include "termhooks.h"
 304
 305 Lisp_Object Vcoding_system_hash_table;
 306
 307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 308 Lisp_Object Qunix, Qdos;
 309 extern Lisp_Object Qmac;        /* frame.c */
 310 Lisp_Object Qbuffer_file_coding_system;
 311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 315 Lisp_Object Qbig, Qlittle;
 316 Lisp_Object Qcoding_system_history;
 317 Lisp_Object Qvalid_codes;
 318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 321 Lisp_Object QCascii_compatible_p;
 322
 323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 324 Lisp_Object Qcall_process, Qcall_process_region;
 325 Lisp_Object Qstart_process, Qopen_network_stream;
 326 Lisp_Object Qtarget_idx;
 327
 328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 329 Lisp_Object Qinterrupted, Qinsufficient_memory;
 330
 331 extern Lisp_Object Qcompletion_ignore_case;
 332
 333 /* If a symbol has this property, evaluate the value to define the
 334    symbol as a coding system.  */
 335 static Lisp_Object Qcoding_system_define_form;
 336
 337 int coding_system_require_warning;
 338
 339 Lisp_Object Vselect_safe_coding_system_function;
 340
 341 /* Mnemonic string for each format of end-of-line.  */
 342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 343 /* Mnemonic string to indicate format of end-of-line is not yet
 344    decided.  */
 345 Lisp_Object eol_mnemonic_undecided;
 346
 347 /* Format of end-of-line decided by system.  This is Qunix on
 348    Unix and Mac, Qdos on DOS/Windows.
 349    This has an effect only for external encoding (i.e. for output to
 350    file and process), not for in-buffer or Lisp string encoding.  */
 351 static Lisp_Object system_eol_type;
 352
 353 #ifdef emacs
 354
 355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 356
 357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 358
 359 /* Coding system emacs-mule and raw-text are for converting only
 360    end-of-line format.  */
 361 Lisp_Object Qemacs_mule, Qraw_text;
 362 Lisp_Object Qutf_8_emacs;
 363
 364 /* Coding-systems are handed between Emacs Lisp programs and C internal
 365    routines by the following three variables.  */
 366 /* Coding-system for reading files and receiving data from process.  */
 367 Lisp_Object Vcoding_system_for_read;
 368 /* Coding-system for writing files and sending data to process.  */
 369 Lisp_Object Vcoding_system_for_write;
 370 /* Coding-system actually used in the latest I/O.  */
 371 Lisp_Object Vlast_coding_system_used;
 372 /* Set to non-nil when an error is detected while code conversion.  */
 373 Lisp_Object Vlast_code_conversion_error;
 374 /* A vector of length 256 which contains information about special
 375    Latin codes (especially for dealing with Microsoft codes).  */
 376 Lisp_Object Vlatin_extra_code_table;
 377
 378 /* Flag to inhibit code conversion of end-of-line format.  */
 379 int inhibit_eol_conversion;
 380
 381 /* Flag to inhibit ISO2022 escape sequence detection.  */
 382 int inhibit_iso_escape_detection;
 383
 384 /* Flag to inhibit detection of binary files through null bytes.  */
 385 int inhibit_null_byte_detection;
 386
 387 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 388 int inherit_process_coding_system;
 389
 390 /* Coding system to be used to encode text for terminal display when
 391    terminal coding system is nil.  */
 392 struct coding_system safe_terminal_coding;
 393
 394 Lisp_Object Vfile_coding_system_alist;
 395 Lisp_Object Vprocess_coding_system_alist;
 396 Lisp_Object Vnetwork_coding_system_alist;
 397
 398 Lisp_Object Vlocale_coding_system;
 399
 400 #endif /* emacs */
 401
 402 /* Flag to tell if we look up translation table on character code
 403    conversion.  */
 404 Lisp_Object Venable_character_translation;
 405 /* Standard translation table to look up on decoding (reading).  */
 406 Lisp_Object Vstandard_translation_table_for_decode;
 407 /* Standard translation table to look up on encoding (writing).  */
 408 Lisp_Object Vstandard_translation_table_for_encode;
 409
 410 Lisp_Object Qtranslation_table;
 411 Lisp_Object Qtranslation_table_id;
 412 Lisp_Object Qtranslation_table_for_decode;
 413 Lisp_Object Qtranslation_table_for_encode;
 414
 415 /* Alist of charsets vs revision number.  */
 416 static Lisp_Object Vcharset_revision_table;
 417
 418 /* Default coding systems used for process I/O.  */
 419 Lisp_Object Vdefault_process_coding_system;
 420
 421 /* Char table for translating Quail and self-inserting input.  */
 422 Lisp_Object Vtranslation_table_for_input;
 423
 424 /* Two special coding systems.  */
 425 Lisp_Object Vsjis_coding_system;
 426 Lisp_Object Vbig5_coding_system;
 427
 428 /* ISO2022 section */
 429
 430 #define CODING_ISO_INITIAL(coding, reg)                 \
 431   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 432                      coding_attr_iso_initial),          \
 433                reg)))
 434 /* Value of safe_charsets[CHARSET_ID] of CODING, or -1 if CHARSET_ID
 435    exceeds max_charset_id or that entry is 255.  */
 436 #define CODING_ISO_REQUEST(coding, charset_id)          \
 437   (((charset_id) <= (coding)->max_charset_id            \
 438     ? ((coding)->safe_charsets[charset_id] != 255       \
 439        ? (coding)->safe_charsets[charset_id]            \
 440        : -1)                                            \
 441     : -1))
 442
 443
 444 #define CODING_ISO_FLAGS(coding)        \
 445   ((coding)->spec.iso_2022.flags)
 446 #define CODING_ISO_DESIGNATION(coding, reg)     \
 447   ((coding)->spec.iso_2022.current_designation[reg])
 448 #define CODING_ISO_INVOCATION(coding, plane)    \
 449   ((coding)->spec.iso_2022.current_invocation[plane])
 450 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 451   ((coding)->spec.iso_2022.single_shifting)
 452 #define CODING_ISO_BOL(coding)  \
 453   ((coding)->spec.iso_2022.bol)
 454 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 455   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 456 #define CODING_ISO_CMP_STATUS(coding)   \
 457   (&(coding)->spec.iso_2022.cmp_status)
 458 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 459   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 460 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 461   ((coding)->spec.iso_2022.embedded_utf_8)
 462
 463 /* Control characters of ISO2022.  */
 464                         /* code */      /* function */
 465 #define ISO_CODE_LF     0x0A            /* line-feed */
 466 #define ISO_CODE_CR     0x0D            /* carriage-return */
 467 #define ISO_CODE_SO     0x0E            /* shift-out */
 468 #define ISO_CODE_SI     0x0F            /* shift-in */
 469 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 470 #define ISO_CODE_ESC    0x1B            /* escape */
 471 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 472 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 473 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 474
 475 /* Every 1-byte code of ISO2022 is classified into one of the
 476    following classes.  */
 477 enum iso_code_class_type
 478   {
 479     ISO_control_0,              /* Control codes in the range
 480                                    0x00..0x1F and 0x7F, except for the
 481                                    following 4 codes.  */
 482     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 483     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 484     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 485     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 486     ISO_control_1,              /* Control codes in the range
 487                                    0x80..0x9F, except for the
 488                                    following 3 codes.  */
 489     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 490     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 491     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 492     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 493     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 494     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 495     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 496   };
 497
 498 /** The macros CODING_ISO_FLAG_XXX define flag bits of the
 499     `iso-flags' attribute of an iso2022 coding system.  */
 500
 501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 502    instead of the correct short-form sequence (e.g. ESC $ A).  */
 503 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 504
 505 /* If set, reset graphic planes and registers at end-of-line to the
 506    initial state.  */
 507 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 508
 509 /* If set, reset graphic planes and registers before any control
 510    characters to the initial state.  */
 511 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 512
 513 /* If set, encode by 7-bit environment.  */
 514 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 515
 516 /* If set, use locking-shift function.  */
 517 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 518
 519 /* If set, use single-shift function.  Overwrite
 520    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 521 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 522
 523 /* If set, use designation escape sequence.  */
 524 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 525
 526 /* If set, produce revision number sequence.  */
 527 #define CODING_ISO_FLAG_REVISION        0x0080
 528
 529 /* If set, produce ISO6429's direction specifying sequence.  */
 530 #define CODING_ISO_FLAG_DIRECTION       0x0100
 531
 532 /* If set, assume designation states are reset at beginning of line on
 533    output.  */
 534 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 535
 536 /* If set, designation sequence should be placed at beginning of line
 537    on output.  */
 538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 539
 540 /* If set, do not encode unsafe characters on output.  */
 541 #define CODING_ISO_FLAG_SAFE            0x0800
 542
 543 /* If set, extra latin codes (128..159) are accepted as a valid code
 544    on input.  */
 545 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 546
 547 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 548
 549 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 550
 551 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 552
 553 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 554
 555 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 556
 557 /* A character to be produced on output if encoding of the original
 558    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 560
 561 /* UTF-8 section */
 562 #define CODING_UTF_8_BOM(coding)        \
 563   ((coding)->spec.utf_8_bom)
 564
 565 /* UTF-16 section */
 566 #define CODING_UTF_16_BOM(coding)       \
 567   ((coding)->spec.utf_16.bom)
 568
 569 #define CODING_UTF_16_ENDIAN(coding)    \
 570   ((coding)->spec.utf_16.endian)
 571
 572 #define CODING_UTF_16_SURROGATE(coding) \
 573   ((coding)->spec.utf_16.surrogate)
 574
 575
 576 /* CCL section */
 577 #define CODING_CCL_DECODER(coding)      \
 578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 579 #define CODING_CCL_ENCODER(coding)      \
 580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 581 #define CODING_CCL_VALIDS(coding)                                          \
 582   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 583
 584 /* Index for each coding category in `coding_categories'.  The
 585    CATEGORY_MASK_XXX flag bits are 1 shifted left by these indices.  */
 586 enum coding_category
 587   {
 588     coding_category_iso_7,
 589     coding_category_iso_7_tight,
 590     coding_category_iso_8_1,
 591     coding_category_iso_8_2,
 592     coding_category_iso_7_else,
 593     coding_category_iso_8_else,
 594     coding_category_utf_8_auto,
 595     coding_category_utf_8_nosig,
 596     coding_category_utf_8_sig,
 597     coding_category_utf_16_auto,
 598     coding_category_utf_16_be,
 599     coding_category_utf_16_le,
 600     coding_category_utf_16_be_nosig,
 601     coding_category_utf_16_le_nosig,
 602     coding_category_charset,
 603     coding_category_sjis,
 604     coding_category_big5,
 605     coding_category_ccl,
 606     coding_category_emacs_mule,
 607     /* All above are targets of code detection.  */
 608     coding_category_raw_text,
 609     coding_category_undecided,
 610     coding_category_max         /* Number of categories (array size).  */
 611   };
 612
 613 /* Definitions of flag bits used in detect_coding_XXXX.  */
 614 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 615 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 616 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 617 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 618 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 619 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 620 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 621 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 622 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 623 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 634
 635 /* This value is returned if detect_coding_mask () finds nothing other
 636    than ASCII characters.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8_AUTO           \
 645    | CATEGORY_MASK_UTF_8_NOSIG          \
 646    | CATEGORY_MASK_UTF_8_SIG            \
 647    | CATEGORY_MASK_UTF_16_AUTO          \
 648    | CATEGORY_MASK_UTF_16_BE            \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 652    | CATEGORY_MASK_CHARSET              \
 653    | CATEGORY_MASK_SJIS                 \
 654    | CATEGORY_MASK_BIG5                 \
 655    | CATEGORY_MASK_CCL                  \
 656    | CATEGORY_MASK_EMACS_MULE)
 657
 658
 659 #define CATEGORY_MASK_ISO_7BIT \
 660   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 661
 662 #define CATEGORY_MASK_ISO_8BIT \
 663   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 664
 665 #define CATEGORY_MASK_ISO_ELSE \
 666   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 667
 668 #define CATEGORY_MASK_ISO_ESCAPE        \
 669   (CATEGORY_MASK_ISO_7                  \
 670    | CATEGORY_MASK_ISO_7_TIGHT          \
 671    | CATEGORY_MASK_ISO_7_ELSE           \
 672    | CATEGORY_MASK_ISO_8_ELSE)
 673
 674 #define CATEGORY_MASK_ISO       \
 675   (  CATEGORY_MASK_ISO_7BIT     \
 676      | CATEGORY_MASK_ISO_8BIT   \
 677      | CATEGORY_MASK_ISO_ELSE)
 678
 679 #define CATEGORY_MASK_UTF_16            \
 680   (CATEGORY_MASK_UTF_16_AUTO            \
 681    | CATEGORY_MASK_UTF_16_BE            \
 682    | CATEGORY_MASK_UTF_16_LE            \
 683    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 684    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 685
 686 #define CATEGORY_MASK_UTF_8     \
 687   (CATEGORY_MASK_UTF_8_AUTO     \
 688    | CATEGORY_MASK_UTF_8_NOSIG  \
 689    | CATEGORY_MASK_UTF_8_SIG)
 690
 691 /* List of symbols `coding-category-xxx' ordered by priority.  This
 692    variable is exposed to Emacs Lisp.  */
 693 static Lisp_Object Vcoding_category_list;
 694
 695 /* Table of coding categories (Lisp symbols).  This variable is for
 696    internal use only.  */
 697 static Lisp_Object Vcoding_category_table;
 698
 699 /* Table of coding-categories ordered by priority.  */
 700 static enum coding_category coding_priorities[coding_category_max];
 701
 702 /* Nth element is a coding context for the coding system bound to the
 703    Nth coding category.  */
 704 static struct coding_system coding_categories[coding_category_max];
 705
 706 /*** Commonly used macros and functions ***/
 707
 708 #ifndef min
 709 #define min(a, b) ((a) < (b) ? (a) : (b))
 710 #endif
 711 #ifndef max
 712 #define max(a, b) ((a) > (b) ? (a) : (b))
 713 #endif
 714 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, CHARSET_LIST to its list.  */
 715 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 716   do {                                                  \
 717     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 718     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 719   } while (0)
 720
 721
 722 /* Safely get one byte from the source text pointed by SRC which ends
 723    at SRC_END, and set C to that byte.  If the source is exhausted,
 724    jump to `no_more_source' (after recording
 725    CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC).  If multibytep
 726    is nonzero, a 2-byte sequence led by 0xC0 or 0xC1 is combined into
 727    one value; any other multibyte character sets C to the negated
 728    character code and records CODING_RESULT_INVALID_SRC.  Needs: src,
 729    src_end, src_base, multibytep, consumed_chars, coding.  */
 730 #define ONE_MORE_BYTE(c)                                \
 731   do {                                                  \
 732     if (src == src_end)                                 \
 733       {                                                 \
 734         if (src_base < src)                             \
 735           record_conversion_result                      \
 736             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 737         goto no_more_source;                            \
 738       }                                                 \
 739     c = *src++;                                         \
 740     if (multibytep && (c & 0x80))                       \
 741       {                                                 \
 742         if ((c & 0xFE) == 0xC0)                         \
 743           c = ((c & 1) << 6) | *src++;                  \
 744         else                                            \
 745           {                                             \
 746             src--;                                      \
 747             c = - string_char (src, &src, NULL);        \
 748             record_conversion_result                    \
 749               (coding, CODING_RESULT_INVALID_SRC);      \
 750           }                                             \
 751       }                                                 \
 752     consumed_chars++;                                   \
 753   } while (0)
 754
 755 /* Safely get two bytes from the source text pointed by SRC which ends
 756    at SRC_END, and set C1 and C2 to those bytes while skipping the
 757    heading multibyte characters.  If there are not enough bytes in the
 758    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 759    a multibyte character is found for C2, set C2 to -1 (SRC is not
 760    advanced past the rest of that character).  The caller should
 761    declare and set these variables appropriately in advance:
 762         src, src_end, multibytep
 763    It is intended that this macro is used in detect_coding_utf_16.  */
 764
 765 #define TWO_MORE_BYTES(c1, c2)                          \
 766   do {                                                  \
 767     do {                                                \
 768       if (src == src_end)                               \
 769         goto no_more_source;                            \
 770       c1 = *src++;                                      \
 771       if (multibytep && (c1 & 0x80))                    \
 772         {                                               \
 773           if ((c1 & 0xFE) == 0xC0)                      \
 774             c1 = ((c1 & 1) << 6) | *src++;              \
 775           else                                          \
 776             {                                           \
 777               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 778               c1 = -1;                                  \
 779             }                                           \
 780         }                                               \
 781     } while (c1 < 0);                                   \
 782     if (src == src_end)                                 \
 783       goto no_more_source;                              \
 784     c2 = *src++;                                        \
 785     if (multibytep && (c2 & 0x80))                      \
 786       {                                                 \
 787         if ((c2 & 0xFE) == 0xC0)                        \
 788           c2 = ((c2 & 1) << 6) | *src++;                \
 789         else                                            \
 790           c2 = -1;                                      \
 791       }                                                 \
 792   } while (0)
 793
 794
/* Get one byte from the source text pointed by `src', set C to it and
   increment `consumed_chars', without comparing `src' against the end
   of the source first.  If multibytep is nonzero and the byte has its
   high bit set, a two-byte form led by 0xC0 or 0xC1 is folded into a
   single value; for any other multibyte form, `src' is advanced by
   string_char, C is set to the negative of the value it returns, and
   CODING_RESULT_INVALID_SRC is recorded in `coding'.  The caller
   should declare and set these variables appropriately in advance:
        src, multibytep, consumed_chars, coding  */
#define ONE_MORE_BYTE_NO_CHECK(c)                       \
  do {                                                  \
    c = *src++;                                         \
    if (multibytep && (c & 0x80))                       \
      {                                                 \
        /* Lead byte 0xC0 or 0xC1: fold the pair.  */   \
        if ((c & 0xFE) == 0xC0)                         \
          c = ((c & 1) << 6) | *src++;                  \
        else                                            \
          {                                             \
            /* Back up to the lead byte and let         \
               string_char advance SRC.  */             \
            src--;                                      \
            c = - string_char (src, &src, NULL);        \
            record_conversion_result                    \
              (coding, CODING_RESULT_INVALID_SRC);      \
          }                                             \
      }                                                 \
    consumed_chars++;                                   \
  } while (0)
 812
 813
 814 /* Store a byte C in the place pointed by DST and increment DST to the
 815    next free point, and increment PRODUCED_CHARS.  The caller should
 816    assure that C is 0..127, and declare and set the variable `dst'
 817    appropriately in advance.
 818 */
 819
 820
 821 #define EMIT_ONE_ASCII_BYTE(c)  \
 822   do {                          \
 823     produced_chars++;           \
 824     *dst++ = (c);               \
 825   } while (0)
 826
 827
 828 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 829
 830 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 831   do {                                  \
 832     produced_chars += 2;                \
 833     *dst++ = (c1), *dst++ = (c2);       \
 834   } while (0)
 835
 836
 837 /* Store a byte C in the place pointed by DST and increment DST to the
 838    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 839    nonzero, store in an appropriate multibyte from.  The caller should
 840    declare and set the variables `dst' and `multibytep' appropriately
 841    in advance.  */
 842
 843 #define EMIT_ONE_BYTE(c)                \
 844   do {                                  \
 845     produced_chars++;                   \
 846     if (multibytep)                     \
 847       {                                 \
 848         int ch = (c);                   \
 849         if (ch >= 0x80)                 \
 850           ch = BYTE8_TO_CHAR (ch);      \
 851         CHAR_STRING_ADVANCE (ch, dst);  \
 852       }                                 \
 853     else                                \
 854       *dst++ = (c);                     \
 855   } while (0)
 856
 857
 858 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 859
 860 #define EMIT_TWO_BYTES(c1, c2)          \
 861   do {                                  \
 862     produced_chars += 2;                \
 863     if (multibytep)                     \
 864       {                                 \
 865         int ch;                         \
 866                                         \
 867         ch = (c1);                      \
 868         if (ch >= 0x80)                 \
 869           ch = BYTE8_TO_CHAR (ch);      \
 870         CHAR_STRING_ADVANCE (ch, dst);  \
 871         ch = (c2);                      \
 872         if (ch >= 0x80)                 \
 873           ch = BYTE8_TO_CHAR (ch);      \
 874         CHAR_STRING_ADVANCE (ch, dst);  \
 875       }                                 \
 876     else                                \
 877       {                                 \
 878         *dst++ = (c1);                  \
 879         *dst++ = (c2);                  \
 880       }                                 \
 881   } while (0)
 882
 883
 884 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 885   do {                                  \
 886     EMIT_ONE_BYTE (c1);                 \
 887     EMIT_TWO_BYTES (c2, c3);            \
 888   } while (0)
 889
 890
 891 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 892   do {                                          \
 893     EMIT_TWO_BYTES (c1, c2);                    \
 894     EMIT_TWO_BYTES (c3, c4);                    \
 895   } while (0)
 896
 897
/* Prototypes for static functions.  Most coding categories below have
   a detector, a decoder and an encoder; the detectors and encoders
   return int, the decoders return nothing.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* UTF-8.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

/* UTF-16.  */
static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

/* ISO 2022.  */
static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

/* emacs-mule.  */
static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

/* Shift-JIS.  */
static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

/* BIG5.  */
static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

/* CCL.  */
static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text (no detector is declared here).  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Source/destination bookkeeping and other helpers.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT, EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 973
 974 static void
 975 record_conversion_result (struct coding_system *coding,
 976                           enum coding_result_code result)
 977 {
 978   coding->result = result;
 979   switch (result)
 980     {
 981     case CODING_RESULT_INSUFFICIENT_SRC:
 982       Vlast_code_conversion_error = Qinsufficient_source;
 983       break;
 984     case CODING_RESULT_INCONSISTENT_EOL:
 985       Vlast_code_conversion_error = Qinconsistent_eol;
 986       break;
 987     case CODING_RESULT_INVALID_SRC:
 988       Vlast_code_conversion_error = Qinvalid_source;
 989       break;
 990     case CODING_RESULT_INTERRUPT:
 991       Vlast_code_conversion_error = Qinterrupted;
 992       break;
 993     case CODING_RESULT_INSUFFICIENT_MEM:
 994       Vlast_code_conversion_error = Qinsufficient_memory;
 995       break;
 996     case CODING_RESULT_INSUFFICIENT_DST:
 997       /* Don't record this error in Vlast_code_conversion_error
 998          because it happens just temporarily and is resolved when the
 999          whole conversion is finished.  */
1000       break;
1001     case CODING_RESULT_SUCCESS:
1002       break;
1003     default:
1004       Vlast_code_conversion_error = intern ("Unknown error");
1005     }
1006 }
1007
/* Decode CODE in CHARSET with DECODE_CHAR and store the result in C,
   keeping the pointers SRC, SRC_BASE and SRC_END valid.  Calling
   DECODE_CHAR may load a charset map; loading a charset map allocates
   large structures, which could relocate buffer text.  When the flag
   charset_map_loaded reports that this happened, coding->source is
   recomputed and the three pointers are moved by the same distance
   that coding->source moved.  */
#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (charset_map_loaded)                                                  \
      {                                                                      \
        const unsigned char *old_source = coding->source;                    \
        EMACS_INT shift;                                                     \
                                                                             \
        coding_set_source (coding);                                          \
        shift = coding->source - old_source;                                 \
        src_end += shift;                                                    \
        src_base += shift;                                                   \
        src += shift;                                                        \
      }                                                                      \
  } while (0)
1028
1029
/* If there are not at least BYTES bytes of room left at dst (that is,
   dst + BYTES reaches or passes dst_end), allocate more memory for
   coding->destination and update dst and dst_end.  The amount
   requested from alloc_destination is BYTES plus the number of
   elements between charbuf and charbuf_end.  We don't have to take
   care of coding->source which will be relocated.  It is handled by
   calling coding_set_source in encode_coding.  The caller should
   declare and set these variables appropriately in advance:
        coding, dst, dst_end, charbuf, charbuf_end  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        int more_bytes = charbuf_end - charbuf + (bytes);       \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
1045
1046
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   The form takes one to five bytes, chosen by comparing C with
   MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR; a larger C is passed, minus
   0x3FFF80, to BYTE8_STRING.  C is evaluated several times, so it
   must not have side effects; it may be any expression, since every
   use is parenthesized.  P must be a modifiable pointer lvalue.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)              \
  do {                                                  \
    if ((c) <= MAX_1_BYTE_CHAR)                         \
      *(p)++ = (c);                                     \
    else if ((c) <= MAX_2_BYTE_CHAR)                    \
      *(p)++ = (0xC0 | ((c) >> 6)),                     \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_3_BYTE_CHAR)                    \
      *(p)++ = (0xE0 | ((c) >> 12)),                    \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_4_BYTE_CHAR)                    \
      *(p)++ = (0xF0 | ((c) >> 18)),                    \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else if ((c) <= MAX_5_BYTE_CHAR)                    \
      *(p)++ = 0xF8,                                    \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),         \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),         \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),          \
        *(p)++ = (0x80 | ((c) & 0x3F));                 \
    else                                                \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);          \
  } while (0)
1076
1077
/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length of the form is taken from the high bits of the leading
   byte (P)[0].  In each multibyte arm P is advanced first and the
   bytes are then read through negative indices, so P is evaluated
   several times and must be a side-effect-free pointer lvalue.  No
   check is made that the trailing bytes exist or are well formed.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (/* Bit 0x80 clear: one byte, returned as is.  */             \
   !((p)[0] & 0x80)                                             \
   ? *(p)++                                                     \
   /* Bit 0x20 clear: two bytes.  A leading byte below 0xC2     \
      additionally gets 0x3FFF80 or'ed into the result.  */     \
   : ! ((p)[0] & 0x20)                                          \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   /* Bit 0x10 clear: three bytes.  */                          \
   : ! ((p)[0] & 0x10)                                          \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   /* Bit 0x08 clear: four bytes.  */                           \
   : ! ((p)[0] & 0x08)                                          \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   /* Otherwise five bytes; the leading byte contributes no     \
      bits, only the four trailing bytes are used.  */          \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
1106
1107
1108 static void
1109 coding_set_source (coding)
1110      struct coding_system *coding;
1111 {
1112   if (BUFFERP (coding->src_object))
1113     {
1114       struct buffer *buf = XBUFFER (coding->src_object);
1115
1116       if (coding->src_pos < 0)
1117         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1118       else
1119         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1120     }
1121   else if (STRINGP (coding->src_object))
1122     {
1123       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1124     }
1125   else
1126     /* Otherwise, the source is C string and is never relocated
1127        automatically.  Thus we don't have to update anything.  */
1128     ;
1129 }
1130
1131 static void
1132 coding_set_destination (coding)
1133      struct coding_system *coding;
1134 {
1135   if (BUFFERP (coding->dst_object))
1136     {
1137       if (coding->src_pos < 0)
1138         {
1139           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1140           coding->dst_bytes = (GAP_END_ADDR
1141                                - (coding->src_bytes - coding->consumed)
1142                                - coding->destination);
1143         }
1144       else
1145         {
1146           /* We are sure that coding->dst_pos_byte is before the gap
1147              of the buffer. */
1148           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1149                                  + coding->dst_pos_byte - BEG_BYTE);
1150           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1151                                - coding->destination);
1152         }
1153     }
1154   else
1155     /* Otherwise, the destination is C string and is never relocated
1156        automatically.  Thus we don't have to update anything.  */
1157     ;
1158 }
1159
1160
1161 static void
1162 coding_alloc_by_realloc (coding, bytes)
1163      struct coding_system *coding;
1164      EMACS_INT bytes;
1165 {
1166   coding->destination = (unsigned char *) xrealloc (coding->destination,
1167                                                     coding->dst_bytes + bytes);
1168   coding->dst_bytes += bytes;
1169 }
1170
1171 static void
1172 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1173      struct coding_system *coding;
1174      EMACS_INT gap_head_used, bytes;
1175 {
1176   if (EQ (coding->src_object, coding->dst_object))
1177     {
1178       /* The gap may contain the produced data at the head and not-yet
1179          consumed data at the tail.  To preserve those data, we at
1180          first make the gap size to zero, then increase the gap
1181          size.  */
1182       EMACS_INT add = GAP_SIZE;
1183
1184       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1185       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1186       make_gap (bytes);
1187       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1188       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1189     }
1190   else
1191     {
1192       Lisp_Object this_buffer;
1193
1194       this_buffer = Fcurrent_buffer ();
1195       set_buffer_internal (XBUFFER (coding->dst_object));
1196       make_gap (bytes);
1197       set_buffer_internal (XBUFFER (this_buffer));
1198     }
1199 }
1200
1201
1202 static unsigned char *
1203 alloc_destination (coding, nbytes, dst)
1204      struct coding_system *coding;
1205      EMACS_INT nbytes;
1206      unsigned char *dst;
1207 {
1208   EMACS_INT offset = dst - coding->destination;
1209
1210   if (BUFFERP (coding->dst_object))
1211     {
1212       struct buffer *buf = XBUFFER (coding->dst_object);
1213
1214       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1215     }
1216   else
1217     coding_alloc_by_realloc (coding, nbytes);
1218   coding_set_destination (coding);
1219   dst = coding->destination + offset;
1220   return dst;
1221 }
1222
1223 /** Macros for annotations.  */
1224
1225 /* An annotation data is stored in the array coding->charbuf in this
1226    format:
1227      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1228    LENGTH is the number of elements in the annotation.
1229    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1230    NCHARS is the number of characters in the text annotated.
1231
1232    The format of the following elements depend on ANNOTATION_MASK.
1233
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1236      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1237
1238    NBYTES is the number of bytes specified in the header part of
1239    old-style emacs-mule encoding, or 0 for the other kind of
1240    composition.
1241
1242    METHOD is one of enum composition_method.
1243
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
1246
1247    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1248    follows.
1249
1250    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1251    recover from an invalid annotation, and should be skipped by
1252    produce_annotation.  */
1253
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three leading elements of an annotation at BUF, namely
   -LEN, MASK and NCHARS, advance BUF past them, and set
   coding->annotated.  BUF must be a modifiable pointer lvalue and the
   caller must have the variable `coding' in scope.  The expansion is
   a single statement without a trailing semicolon, so the macro can
   be used like a function call, including in an unbraced if/else.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1264
/* Store a composition annotation header at BUF and advance BUF past
   it: the three common elements written by ADD_ANNOTATION_DATA (with
   length 5 and CODING_ANNOTATE_COMPOSITION_MASK), followed by NBYTES
   and METHOD.  BUF must be a modifiable pointer lvalue; every
   argument is parenthesized, so any such expression is accepted.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1271
1272
/* Store a charset annotation at BUF and advance BUF past it: the
   three common elements written by ADD_ANNOTATION_DATA (with length 4
   and CODING_ANNOTATE_CHARSET_MASK), followed by the charset ID.  BUF
   must be a modifiable pointer lvalue; every argument is
   parenthesized, so any such expression is accepted.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1278
1279 \f
1280 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1281
1282
1283
1284 \f
1285 /*** 3. UTF-8 ***/
1286
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-8.  If it is, return 1, else
   return 0.  */

/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits: a single-octet character (below 0x80), a trailing octet
   (10xxxxxx), or the leading octet of a 2-, 3-, 4- or 5-octet
   sequence (110xxxxx, 1110xxxx, 11110xxx, 111110xx).  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The byte order mark character, and the three octets that form its
   UTF-8 encoding.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1302
1303 static int
1304 detect_coding_utf_8 (coding, detect_info)
1305      struct coding_system *coding;
1306      struct coding_detection_info *detect_info;
1307 {
1308   const unsigned char *src = coding->source, *src_base;
1309   const unsigned char *src_end = coding->source + coding->src_bytes;
1310   int multibytep = coding->src_multibyte;
1311   int consumed_chars = 0;
1312   int bom_found = 0;
1313   int found = 0;
1314
1315   detect_info->checked |= CATEGORY_MASK_UTF_8;
1316   /* A coding system of this category is always ASCII compatible.  */
1317   src += coding->head_ascii;
1318
1319   while (1)
1320     {
1321       int c, c1, c2, c3, c4;
1322
1323       src_base = src;
1324       ONE_MORE_BYTE (c);
1325       if (c < 0 || UTF_8_1_OCTET_P (c))
1326         continue;
1327       ONE_MORE_BYTE (c1);
1328       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1329         break;
1330       if (UTF_8_2_OCTET_LEADING_P (c))
1331         {
1332           found = 1;
1333           continue;
1334         }
1335       ONE_MORE_BYTE (c2);
1336       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1337         break;
1338       if (UTF_8_3_OCTET_LEADING_P (c))
1339         {
1340           found = 1;
1341           if (src_base == coding->source
1342               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1343             bom_found = 1;
1344           continue;
1345         }
1346       ONE_MORE_BYTE (c3);
1347       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1348         break;
1349       if (UTF_8_4_OCTET_LEADING_P (c))
1350         {
1351           found = 1;
1352           continue;
1353         }
1354       ONE_MORE_BYTE (c4);
1355       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1356         break;
1357       if (UTF_8_5_OCTET_LEADING_P (c))
1358         {
1359           found = 1;
1360           continue;
1361         }
1362       break;
1363     }
1364   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1365   return 0;
1366
1367  no_more_source:
1368   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1369     {
1370       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1371       return 0;
1372     }
1373   if (bom_found)
1374     {
1375       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1376       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1377     }
1378   else
1379     {
1380       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1381       if (found)
1382         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1383     }
1384   return 1;
1385 }
1386
1387
1388 static void
1389 decode_coding_utf_8 (coding)
1390      struct coding_system *coding;
1391 {
1392   const unsigned char *src = coding->source + coding->consumed;
1393   const unsigned char *src_end = coding->source + coding->src_bytes;
1394   const unsigned char *src_base;
1395   int *charbuf = coding->charbuf + coding->charbuf_used;
1396   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1397   int consumed_chars = 0, consumed_chars_base = 0;
1398   int multibytep = coding->src_multibyte;
1399   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1400   Lisp_Object attr, charset_list;
1401   int eol_crlf =
1402     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1403   int byte_after_cr = -1;
1404
1405   CODING_GET_INFO (coding, attr, charset_list);
1406
1407   if (bom != utf_without_bom)
1408     {
1409       int c1, c2, c3;
1410
1411       src_base = src;
1412       ONE_MORE_BYTE (c1);
1413       if (! UTF_8_3_OCTET_LEADING_P (c1))
1414         src = src_base;
1415       else
1416         {
1417           ONE_MORE_BYTE (c2);
1418           if (! UTF_8_EXTRA_OCTET_P (c2))
1419             src = src_base;
1420           else
1421             {
1422               ONE_MORE_BYTE (c3);
1423               if (! UTF_8_EXTRA_OCTET_P (c3))
1424                 src = src_base;
1425               else
1426                 {
1427                   if ((c1 != UTF_8_BOM_1)
1428                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1429                     src = src_base;
1430                   else
1431                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1432                 }
1433             }
1434         }
1435     }
1436   CODING_UTF_8_BOM (coding) = utf_without_bom;
1437
1438
1439
1440   while (1)
1441     {
1442       int c, c1, c2, c3, c4, c5;
1443
1444       src_base = src;
1445       consumed_chars_base = consumed_chars;
1446
1447       if (charbuf >= charbuf_end)
1448         {
1449           if (byte_after_cr >= 0)
1450             src_base--;
1451           break;
1452         }
1453
1454       if (byte_after_cr >= 0)
1455         c1 = byte_after_cr, byte_after_cr = -1;
1456       else
1457         ONE_MORE_BYTE (c1);
1458       if (c1 < 0)
1459         {
1460           c = - c1;
1461         }
1462       else if (UTF_8_1_OCTET_P(c1))
1463         {
1464           if (eol_crlf && c1 == '\r')
1465             ONE_MORE_BYTE (byte_after_cr);
1466           c = c1;
1467         }
1468       else
1469         {
1470           ONE_MORE_BYTE (c2);
1471           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1472             goto invalid_code;
1473           if (UTF_8_2_OCTET_LEADING_P (c1))
1474             {
1475               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1476               /* Reject overlong sequences here and below.  Encoders
1477                  producing them are incorrect, they can be misleading,
1478                  and they mess up read/write invariance.  */
1479               if (c < 128)
1480                 goto invalid_code;
1481             }
1482           else
1483             {
1484               ONE_MORE_BYTE (c3);
1485               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1486                 goto invalid_code;
1487               if (UTF_8_3_OCTET_LEADING_P (c1))
1488                 {
1489                   c = (((c1 & 0xF) << 12)
1490                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1491                   if (c < 0x800
1492                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1493                     goto invalid_code;
1494                 }
1495               else
1496                 {
1497                   ONE_MORE_BYTE (c4);
1498                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1499                     goto invalid_code;
1500                   if (UTF_8_4_OCTET_LEADING_P (c1))
1501                     {
1502                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1503                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1504                     if (c < 0x10000)
1505                       goto invalid_code;
1506                     }
1507                   else
1508                     {
1509                       ONE_MORE_BYTE (c5);
1510                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1511                         goto invalid_code;
1512                       if (UTF_8_5_OCTET_LEADING_P (c1))
1513                         {
1514                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1515                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1516                                | (c5 & 0x3F));
1517                           if ((c > MAX_CHAR) || (c < 0x200000))
1518                             goto invalid_code;
1519                         }
1520                       else
1521                         goto invalid_code;
1522                     }
1523                 }
1524             }
1525         }
1526
1527       *charbuf++ = c;
1528       continue;
1529
1530     invalid_code:
1531       src = src_base;
1532       consumed_chars = consumed_chars_base;
1533       ONE_MORE_BYTE (c);
1534       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1535       coding->errors++;
1536     }
1537
1538  no_more_source:
1539   coding->consumed_char += consumed_chars_base;
1540   coding->consumed = src_base - coding->source;
1541   coding->charbuf_used = charbuf - coding->charbuf;
1542 }
1543
1544
1545 static int
1546 encode_coding_utf_8 (coding)
1547      struct coding_system *coding;
1548 {
1549   int multibytep = coding->dst_multibyte;
1550   int *charbuf = coding->charbuf;
1551   int *charbuf_end = charbuf + coding->charbuf_used;
1552   unsigned char *dst = coding->destination + coding->produced;
1553   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1554   int produced_chars = 0;
1555   int c;
1556
1557   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1558     {
1559       ASSURE_DESTINATION (3);
1560       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1561       CODING_UTF_8_BOM (coding) = utf_without_bom;
1562     }
1563
1564   if (multibytep)
1565     {
1566       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1567
1568       while (charbuf < charbuf_end)
1569         {
1570           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1571
1572           ASSURE_DESTINATION (safe_room);
1573           c = *charbuf++;
1574           if (CHAR_BYTE8_P (c))
1575             {
1576               c = CHAR_TO_BYTE8 (c);
1577               EMIT_ONE_BYTE (c);
1578             }
1579           else
1580             {
1581               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1582               for (p = str; p < pend; p++)
1583                 EMIT_ONE_BYTE (*p);
1584             }
1585         }
1586     }
1587   else
1588     {
1589       int safe_room = MAX_MULTIBYTE_LENGTH;
1590
1591       while (charbuf < charbuf_end)
1592         {
1593           ASSURE_DESTINATION (safe_room);
1594           c = *charbuf++;
1595           if (CHAR_BYTE8_P (c))
1596             *dst++ = CHAR_TO_BYTE8 (c);
1597           else
1598             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1599           produced_chars++;
1600         }
1601     }
1602   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1603   coding->produced_char += produced_chars;
1604   coding->produced = dst - coding->destination;
1605   return 0;
1606 }
1607
1608
1609 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1610    Check if a text is encoded in one of UTF-16 based coding systems.
1611    If it is, return 1, else return 0.  */
1612
/* Nonzero if the bits of VAL selected by the mask 0xFC00 equal those
   of a UTF-16 high (leading) surrogate, 0xD800..0xDBFF.  Bits above
   the low sixteen are not examined.  */
#define UTF_16_HIGH_SURROGATE_P(val) (((val) & 0xFC00) == 0xD800)

/* Likewise for a UTF-16 low (trailing) surrogate, 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) (((val) & 0xFC00) == 0xDC00)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  VAL is
   evaluated more than once.  */
#define UTF_16_INVALID_P(val)                           \
  (((val) == 0xFFFE || (val) == 0xFFFF)                 \
   || UTF_16_LOW_SURROGATE_P (val))
1623
1624
1625 static int
1626 detect_coding_utf_16 (coding, detect_info)
1627      struct coding_system *coding;
1628      struct coding_detection_info *detect_info;
1629 {
1630   const unsigned char *src = coding->source, *src_base = src;
1631   const unsigned char *src_end = coding->source + coding->src_bytes;
1632   int multibytep = coding->src_multibyte;
1633   int consumed_chars = 0;
1634   int c1, c2;
1635
1636   detect_info->checked |= CATEGORY_MASK_UTF_16;
1637   if (coding->mode & CODING_MODE_LAST_BLOCK
1638       && (coding->src_chars & 1))
1639     {
1640       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1641       return 0;
1642     }
1643
1644   TWO_MORE_BYTES (c1, c2);
1645   if ((c1 == 0xFF) && (c2 == 0xFE))
1646     {
1647       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1648                              | CATEGORY_MASK_UTF_16_AUTO);
1649       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1650                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1651                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1652     }
1653   else if ((c1 == 0xFE) && (c2 == 0xFF))
1654     {
1655       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1656                              | CATEGORY_MASK_UTF_16_AUTO);
1657       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1658                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1659                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1660     }
1661   else if (c2 < 0)
1662     {
1663       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1664       return 0;
1665     }
1666   else
1667     {
1668       /* We check the dispersion of Eth and Oth bytes where E is even and
1669          O is odd.  If both are high, we assume binary data.*/
1670       unsigned char e[256], o[256];
1671       unsigned e_num = 1, o_num = 1;
1672
1673       memset (e, 0, 256);
1674       memset (o, 0, 256);
1675       e[c1] = 1;
1676       o[c2] = 1;
1677
1678       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1679                                 |CATEGORY_MASK_UTF_16_BE
1680                                 | CATEGORY_MASK_UTF_16_LE);
1681
1682       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1683              != CATEGORY_MASK_UTF_16)
1684         {
1685           TWO_MORE_BYTES (c1, c2);
1686           if (c2 < 0)
1687             break;
1688           if (! e[c1])
1689             {
1690               e[c1] = 1;
1691               e_num++;
1692               if (e_num >= 128)
1693                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1694             }
1695           if (! o[c2])
1696             {
1697               o[c2] = 1;
1698               o_num++;
1699               if (o_num >= 128)
1700                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1701             }
1702         }
1703       return 0;
1704     }
1705
1706  no_more_source:
1707   return 1;
1708 }
1709
1710 static void
1711 decode_coding_utf_16 (coding)
1712      struct coding_system *coding;
1713 {
1714   const unsigned char *src = coding->source + coding->consumed;
1715   const unsigned char *src_end = coding->source + coding->src_bytes;
1716   const unsigned char *src_base;
1717   int *charbuf = coding->charbuf + coding->charbuf_used;
1718   /* We may produces at most 3 chars in one loop.  */
1719   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1720   int consumed_chars = 0, consumed_chars_base = 0;
1721   int multibytep = coding->src_multibyte;
1722   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1723   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1724   int surrogate = CODING_UTF_16_SURROGATE (coding);
1725   Lisp_Object attr, charset_list;
1726   int eol_crlf =
1727     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1728   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1729
1730   CODING_GET_INFO (coding, attr, charset_list);
1731
1732   if (bom == utf_with_bom)
1733     {
1734       int c, c1, c2;
1735
1736       src_base = src;
1737       ONE_MORE_BYTE (c1);
1738       ONE_MORE_BYTE (c2);
1739       c = (c1 << 8) | c2;
1740
1741       if (endian == utf_16_big_endian
1742           ? c != 0xFEFF : c != 0xFFFE)
1743         {
1744           /* The first two bytes are not BOM.  Treat them as bytes
1745              for a normal character.  */
1746           src = src_base;
1747           coding->errors++;
1748         }
1749       CODING_UTF_16_BOM (coding) = utf_without_bom;
1750     }
1751   else if (bom == utf_detect_bom)
1752     {
1753       /* We have already tried to detect BOM and failed in
1754          detect_coding.  */
1755       CODING_UTF_16_BOM (coding) = utf_without_bom;
1756     }
1757
1758   while (1)
1759     {
1760       int c, c1, c2;
1761
1762       src_base = src;
1763       consumed_chars_base = consumed_chars;
1764
1765       if (charbuf >= charbuf_end)
1766         {
1767           if (byte_after_cr1 >= 0)
1768             src_base -= 2;
1769           break;
1770         }
1771
1772       if (byte_after_cr1 >= 0)
1773         c1 = byte_after_cr1, byte_after_cr1 = -1;
1774       else
1775         ONE_MORE_BYTE (c1);
1776       if (c1 < 0)
1777         {
1778           *charbuf++ = -c1;
1779           continue;
1780         }
1781       if (byte_after_cr2 >= 0)
1782         c2 = byte_after_cr2, byte_after_cr2 = -1;
1783       else
1784         ONE_MORE_BYTE (c2);
1785       if (c2 < 0)
1786         {
1787           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1788           *charbuf++ = -c2;
1789           continue;
1790         }
1791       c = (endian == utf_16_big_endian
1792            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1793
1794       if (surrogate)
1795         {
1796           if (! UTF_16_LOW_SURROGATE_P (c))
1797             {
1798               if (endian == utf_16_big_endian)
1799                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1800               else
1801                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1802               *charbuf++ = c1;
1803               *charbuf++ = c2;
1804               coding->errors++;
1805               if (UTF_16_HIGH_SURROGATE_P (c))
1806                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1807               else
1808                 *charbuf++ = c;
1809             }
1810           else
1811             {
1812               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1813               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1814               *charbuf++ = 0x10000 + c;
1815             }
1816         }
1817       else
1818         {
1819           if (UTF_16_HIGH_SURROGATE_P (c))
1820             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1821           else
1822             {
1823               if (eol_crlf && c == '\r')
1824                 {
1825                   ONE_MORE_BYTE (byte_after_cr1);
1826                   ONE_MORE_BYTE (byte_after_cr2);
1827                 }
1828               *charbuf++ = c;
1829             }
1830         }
1831     }
1832
1833  no_more_source:
1834   coding->consumed_char += consumed_chars_base;
1835   coding->consumed = src_base - coding->source;
1836   coding->charbuf_used = charbuf - coding->charbuf;
1837 }
1838
1839 static int
1840 encode_coding_utf_16 (coding)
1841      struct coding_system *coding;
1842 {
1843   int multibytep = coding->dst_multibyte;
1844   int *charbuf = coding->charbuf;
1845   int *charbuf_end = charbuf + coding->charbuf_used;
1846   unsigned char *dst = coding->destination + coding->produced;
1847   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1848   int safe_room = 8;
1849   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1850   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1851   int produced_chars = 0;
1852   Lisp_Object attrs, charset_list;
1853   int c;
1854
1855   CODING_GET_INFO (coding, attrs, charset_list);
1856
1857   if (bom != utf_without_bom)
1858     {
1859       ASSURE_DESTINATION (safe_room);
1860       if (big_endian)
1861         EMIT_TWO_BYTES (0xFE, 0xFF);
1862       else
1863         EMIT_TWO_BYTES (0xFF, 0xFE);
1864       CODING_UTF_16_BOM (coding) = utf_without_bom;
1865     }
1866
1867   while (charbuf < charbuf_end)
1868     {
1869       ASSURE_DESTINATION (safe_room);
1870       c = *charbuf++;
1871       if (c > MAX_UNICODE_CHAR)
1872         c = coding->default_char;
1873
1874       if (c < 0x10000)
1875         {
1876           if (big_endian)
1877             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1878           else
1879             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1880         }
1881       else
1882         {
1883           int c1, c2;
1884
1885           c -= 0x10000;
1886           c1 = (c >> 10) + 0xD800;
1887           c2 = (c & 0x3FF) + 0xDC00;
1888           if (big_endian)
1889             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1890           else
1891             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1892         }
1893     }
1894   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1895   coding->produced = dst - coding->destination;
1896   coding->produced_char += produced_chars;
1897   return 0;
1898 }
1899
1900 \f
1901 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1902
1903 /* Emacs' internal format for representation of multiple character
1904    sets is a kind of multi-byte encoding, i.e. characters are
1905    represented by variable-length sequences of one-byte codes.
1906
1907    ASCII characters and control characters (e.g. `tab', `newline') are
1908    represented by one-byte sequences which are their ASCII codes, in
1909    the range 0x00 through 0x7F.
1910
1911    8-bit characters of the range 0x80..0x9F are represented by
1912    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1913    code + 0x20).
1914
1915    8-bit characters of the range 0xA0..0xFF are represented by
1916    one-byte sequences which are their 8-bit code.
1917
1918    The other characters are represented by a sequence of `base
1919    leading-code', optional `extended leading-code', and one or two
1920    `position-code's.  The length of the sequence is determined by the
1921    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1922    whereas extended leading-code and position-code take the range 0xA0
1923    through 0xFF.  See `charset.h' for more details about leading-code
1924    and position-code.
1925
1926    --- CODE RANGE of Emacs' internal format ---
1927    character set        range
1928    -------------        -----
1929    ascii                0x00..0x7F
1930    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1931    eight-bit-graphic    0xA0..0xFF
1932    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1933    ---------------------------------------------
1934
1935    As this is the internal character representation, the format is
1936    usually not used externally (i.e. in a file or in a data sent to a
1937    process).  But, it is possible to have a text externally in this
1938    format (i.e. by encoding by the coding system `emacs-mule').
1939
1940    In that case, a sequence of one-byte codes has a slightly different
1941    form.
1942
1943    At first, all characters in eight-bit-control are represented by
1944    one-byte sequences which are their 8-bit code.
1945
1946    Next, character composition data are represented by the byte
1947    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1948    where,
1949         METHOD is 0xF2 plus one of composition method (enum
1950         composition_method),
1951
1952         BYTES is 0xA0 plus a byte length of this composition data,
1953
1954         CHARS is 0xA0 plus a number of characters composed by this
1955         data,
1956
1957         COMPONENTs are characters of multibyte form or composition
1958         rules encoded by two-byte of ASCII codes.
1959
1960    In addition, for backward compatibility, the following formats are
1961    also recognized as composition data on decoding.
1962
1963    0x80 MSEQ ...
1964    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1965
1966    Here,
1967         MSEQ is a multibyte form but in these special format:
1968           ASCII: 0xA0 ASCII_CODE+0x80,
1969           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1970         RULE is a one byte code of the range 0xA0..0xF0 that
1971         represents a composition rule.
1972   */
1973
/* Table indexed by a byte value.  The functions below read the entry
   for a leading byte as the total number of bytes of the emacs-mule
   sequence that byte starts; emacs_mule_char handles the values 1
   through 4 and aborts on anything else.  No initializer is given
   here.  */
1974 char emacs_mule_bytes[256];
1975
1976
1977 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1978    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1979    else return 0.  */
1980
1981 static int
1982 detect_coding_emacs_mule (coding, detect_info)
1983      struct coding_system *coding;
1984      struct coding_detection_info *detect_info;
1985 {
1986   const unsigned char *src = coding->source, *src_base;
1987   const unsigned char *src_end = coding->source + coding->src_bytes;
1988   int multibytep = coding->src_multibyte;
1989   int consumed_chars = 0;
1990   int c;
1991   int found = 0;
1992
1993   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1994   /* A coding system of this category is always ASCII compatible.  */
1995   src += coding->head_ascii;
1996
1997   while (1)
1998     {
1999       src_base = src;
2000       ONE_MORE_BYTE (c);
2001       if (c < 0)
2002         continue;
2003       if (c == 0x80)
2004         {
2005           /* Perhaps the start of composite character.  We simply skip
2006              it because analyzing it is too heavy for detecting.  But,
2007              at least, we check that the composite character
2008              constitutes of more than 4 bytes.  */
2009           const unsigned char *src_base;
2010
2011         repeat:
2012           src_base = src;
2013           do
2014             {
2015               ONE_MORE_BYTE (c);
2016             }
2017           while (c >= 0xA0);
2018
2019           if (src - src_base <= 4)
2020             break;
2021           found = CATEGORY_MASK_EMACS_MULE;
2022           if (c == 0x80)
2023             goto repeat;
2024         }
2025
2026       if (c < 0x80)
2027         {
2028           if (c < 0x20
2029               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2030             break;
2031         }
2032       else
2033         {
2034           int more_bytes = emacs_mule_bytes[*src_base] - 1;
2035
2036           while (more_bytes > 0)
2037             {
2038               ONE_MORE_BYTE (c);
2039               if (c < 0xA0)
2040                 {
2041                   src--;        /* Unread the last byte.  */
2042                   break;
2043                 }
2044               more_bytes--;
2045             }
2046           if (more_bytes != 0)
2047             break;
2048           found = CATEGORY_MASK_EMACS_MULE;
2049         }
2050     }
2051   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2052   return 0;
2053
2054  no_more_source:
2055   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2056     {
2057       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2058       return 0;
2059     }
2060   detect_info->found |= found;
2061   return 1;
2062 }
2063
2064
2065 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2066    character.  If CMP_STATUS indicates that we must expect MSEQ or
2067    RULE described above, decode it and return the negative value of
2068    the decoded character or rule.  If an invalid byte is found, return
2069    -1.  If SRC is too short, return -2.  */
2070
2071 int
2072 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2073      struct coding_system *coding;
2074      const unsigned char *src;
2075      int *nbytes, *nchars, *id;
2076      struct composition_status *cmp_status;
2077 {
2078   const unsigned char *src_end = coding->source + coding->src_bytes;
2079   const unsigned char *src_base = src;
2080   int multibytep = coding->src_multibyte;
2081   struct charset *charset;
2082   unsigned code;
2083   int c;
2084   int consumed_chars = 0;
2085   int mseq_found = 0;
2086
2087   ONE_MORE_BYTE (c);
2088   if (c < 0)
2089     {
2090       c = -c;
2091       charset = emacs_mule_charset[0];
2092     }
2093   else
2094     {
2095       if (c >= 0xA0)
2096         {
2097           if (cmp_status->state != COMPOSING_NO
2098               && cmp_status->old_form)
2099             {
2100               if (cmp_status->state == COMPOSING_CHAR)
2101                 {
2102                   if (c == 0xA0)
2103                     {
2104                       ONE_MORE_BYTE (c);
2105                       c -= 0x80;
2106                       if (c < 0)
2107                         goto invalid_code;
2108                     }
2109                   else
2110                     c -= 0x20;
2111                   mseq_found = 1;
2112                 }
2113               else
2114                 {
2115                   *nbytes = src - src_base;
2116                   *nchars = consumed_chars;
2117                   return -c;
2118                 }
2119             }
2120           else
2121             goto invalid_code;
2122         }
2123
2124       switch (emacs_mule_bytes[c])
2125         {
2126         case 2:
2127           if (! (charset = emacs_mule_charset[c]))
2128             goto invalid_code;
2129           ONE_MORE_BYTE (c);
2130           if (c < 0xA0)
2131             goto invalid_code;
2132           code = c & 0x7F;
2133           break;
2134
2135         case 3:
2136           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2137               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2138             {
2139               ONE_MORE_BYTE (c);
2140               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2141                 goto invalid_code;
2142               ONE_MORE_BYTE (c);
2143               if (c < 0xA0)
2144                 goto invalid_code;
2145               code = c & 0x7F;
2146             }
2147           else
2148             {
2149               if (! (charset = emacs_mule_charset[c]))
2150                 goto invalid_code;
2151               ONE_MORE_BYTE (c);
2152               if (c < 0xA0)
2153                 goto invalid_code;
2154               code = (c & 0x7F) << 8;
2155               ONE_MORE_BYTE (c);
2156               if (c < 0xA0)
2157                 goto invalid_code;
2158               code |= c & 0x7F;
2159             }
2160           break;
2161
2162         case 4:
2163           ONE_MORE_BYTE (c);
2164           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2165             goto invalid_code;
2166           ONE_MORE_BYTE (c);
2167           if (c < 0xA0)
2168             goto invalid_code;
2169           code = (c & 0x7F) << 8;
2170           ONE_MORE_BYTE (c);
2171           if (c < 0xA0)
2172             goto invalid_code;
2173           code |= c & 0x7F;
2174           break;
2175
2176         case 1:
2177           code = c;
2178           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2179                                      ? charset_ascii : charset_eight_bit);
2180           break;
2181
2182         default:
2183           abort ();
2184         }
2185       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, code, c);
2186       if (c < 0)
2187         goto invalid_code;
2188     }
2189   *nbytes = src - src_base;
2190   *nchars = consumed_chars;
2191   if (id)
2192     *id = charset->id;
2193   return (mseq_found ? -c : c);
2194
2195  no_more_source:
2196   return -2;
2197
2198  invalid_code:
2199   return -1;
2200 }
2201
2202
2203 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2204
2205 /* Handle these composition sequences ('|': the end of header elements,
2206    BYTES and CHARS >= 0xA0):
2207
2208    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2209    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2210    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2211
2212    and these old form:
2213
2214    (4) relative composition: 0x80 | MSEQ ... MSEQ
2215    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2216
2217    When the starter 0x80 and the following header elements are found,
2218    this annotation header is produced.
2219
2220         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2221
2222    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2223    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2224
2225    Then, upon reading the following elements, these codes are produced
2226    until the composition end is found:
2227
2228    (1) CHAR ... CHAR
2229    (2) ALT ... ALT CHAR ... CHAR
2230    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2231    (4) CHAR ... CHAR
2232    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2233
2234    When the composition end is found, LENGTH and NCHARS in the
2235    annotation header is updated as below:
2236
2237    (1) LENGTH: unchanged, NCHARS: unchanged
2238    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2239    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2240    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2241    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2242
2243    If an error is found while composing, the annotation header is
2244    changed to the original composition header (plus filler -1s) as
2245    below:
2246
2247    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2248    (5)          [ 0x80 0xFF -1 -1 -1 ]
2249
2250    and the sequence [ -2 DECODED-RULE ] is changed to the original
2251    byte sequence as below:
2252         o the original byte sequence is B: [ B -1 ]
2253         o the original byte sequence is B1 B2: [ B1 B2 ]
2254
2255    Most of the routines are implemented by macros because many
2256    variables and labels in the caller decode_coding_emacs_mule must be
2257    accessible, and they are usually called just once (thus doesn't
2258    increase the size of compiled object).  */
2259
/* Decode C, a rule byte of an Emacs 20 style composition sequence,
   and store the decoded rule in RULE.  C is modified: 0xA0 is
   subtracted from it, and a result outside 0..80 makes the macro jump
   to `invalid_code'.  The quotient and the remainder of the result
   divided by 9 give the two reference points; a reference point of 4
   is replaced by 10.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    c -= 0xA0;                                          \
    if (! (c >= 0 && c < 81))                           \
      goto invalid_code;                                \
    gref = c / 9;                                       \
    nref = c % 9;                                       \
    if (gref == 4)                                      \
      gref = 10;                                        \
    if (nref == 4)                                      \
      nref = 10;                                        \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2276
2277
/* Decode an Emacs 21 style composition rule and store it in RULE.
   The rule occupies two bytes: C, already read, and the byte that
   follows at SRC, which is read into C here.  Each byte minus 0x20
   is one reference point and must lie within 0..80; otherwise the
   macro jumps to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref = c - 0x20;                                \
    int nref;                                           \
                                                        \
    if (! (gref >= 0 && gref < 81))                     \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = c - 0x20;                                    \
    if (! (nref >= 0 && nref < 81))                     \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
  } while (0)
2295
2296
/* Begin an Emacs 21 style composition.  On entry C holds the method
   byte (METHOD + 0xF2).  The next two bytes are read from SRC: BYTES
   (0xA0 plus the byte length of the composition data) and CHARS (0xA0
   plus the number of composed characters).  Values that fail the
   checks below make the macro jump to `invalid_code'.  On success
   CMP_STATUS is set up for the new composition and the annotation
   header is added to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    /* A relative composition carries exactly 4 bytes of data.  */     \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2329
2330
/* Begin an Emacs 20 style (old form) relative composition: reset
   CMP_STATUS for it and add an annotation header with zero NCHARS and
   NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2342
2343
/* Begin an Emacs 20 style (old form) rule-base composition: reset
   CMP_STATUS for it and add an annotation header with zero NCHARS and
   NBYTES to CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2355
2356
/* Read the byte that follows a composition starter into C and begin
   the kind of composition it announces:

     0xF2 + a composition method  Emacs 21 style
     0xA0..0xBF                   Emacs 20 style relative composition;
                                  the byte itself is the first
                                  component, so SRC is moved back to it
     0xFF                         Emacs 20 style rule-base composition

   Any other byte, or a negative value, jumps to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *current_src = src;             \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c >= 0xA0 && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = current_src;                              \
      }                                                 \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2380
/* Finish the composition that is being decoded.

   The macro takes no arguments and works on the variables `charbuf'
   and `cmp_status' of the expansion site.  The data of the
   composition starts CMP_STATUS->length elements before `charbuf'.
   For an old-form composition the number of characters seen so far
   is stored in the third element of that data.  Otherwise, when the
   method is greater than COMPOSITION_RELATIVE, the first element is
   replaced by the third element minus CMP_STATUS->length
   (NOTE(review): the meaning of that value depends on the annotation
   layout produced by ADD_COMPOSITION_DATA, which is defined
   elsewhere).  In all cases the state is reset to COMPOSING_NO.  */
2381 #define EMACS_MULE_COMPOSITION_END()                            \
2382   do {                                                          \
2383     int idx = - cmp_status->length;                             \
2384                                                                 \
2385     if (cmp_status->old_form)                                   \
2386       charbuf[idx + 2] = cmp_status->nchars;                    \
2387     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2388       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2389     cmp_status->state = COMPOSING_NO;                           \
2390   } while (0)
2391
2392
2393 static int
2394 emacs_mule_finish_composition (charbuf, cmp_status)
2395      int *charbuf;
2396      struct composition_status *cmp_status;
2397 {
2398   int idx = - cmp_status->length;
2399   int new_chars;
2400
2401   if (cmp_status->old_form && cmp_status->nchars > 0)
2402     {
2403       charbuf[idx + 2] = cmp_status->nchars;
2404       new_chars = 0;
2405       if (cmp_status->method == COMPOSITION_WITH_RULE
2406           && cmp_status->state == COMPOSING_CHAR)
2407         {
2408           /* The last rule was invalid.  */
2409           int rule = charbuf[-1] + 0xA0;
2410
2411           charbuf[-2] = BYTE8_TO_CHAR (rule);
2412           charbuf[-1] = -1;
2413           new_chars = 1;
2414         }
2415     }
2416   else
2417     {
2418       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2419
2420       if (cmp_status->method == COMPOSITION_WITH_RULE)
2421         {
2422           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2423           charbuf[idx++] = -3;
2424           charbuf[idx++] = 0;
2425           new_chars = 1;
2426         }
2427       else
2428         {
2429           int nchars = charbuf[idx + 1] + 0xA0;
2430           int nbytes = charbuf[idx + 2] + 0xA0;
2431
2432           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2433           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2434           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2435           charbuf[idx++] = -1;
2436           new_chars = 4;
2437         }
2438     }
2439   cmp_status->state = COMPOSING_NO;
2440   return new_chars;
2441 }
2442
/* If a composition is in progress (CMP_STATUS->state is not
   COMPOSING_NO), close it with emacs_mule_finish_composition and add
   the value that function returns to `char_offset'.  The macro takes
   no arguments; `cmp_status', `charbuf' and `char_offset' must be
   variables of the function in which it is expanded.  */
2443 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2444   do {                                                                    \
2445     if (cmp_status->state != COMPOSING_NO)                                \
2446       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2447   } while (0)
2448
2449
/* Decode text in the emacs-mule format (old Emacs' internal format).

   Bytes are read from CODING->source, starting at offset
   CODING->consumed and ending at offset CODING->src_bytes.  The
   decoded characters are appended, as ints, to CODING->charbuf
   starting at index CODING->charbuf_used; charset and composition
   annotations are interleaved with them.

   The loop ends when the output buffer comes within three
   annotations' worth of slots of its end, when emacs_mule_char
   returns -2, or at the label `no_more_source' (presumably the
   target ONE_MORE_BYTE jumps to when the source is exhausted; that
   macro is defined elsewhere).

   A composition that is still open at that point is closed forcibly
   if CODING_MODE_LAST_BLOCK is set in CODING->mode; otherwise its data
   is taken back out of the output and saved in the carryover array of
   the composition status, from which the next call restores it.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated, and CODING->errors has been
   incremented once for each invalid byte sequence met.  */
2450 static void
2451 decode_coding_emacs_mule (coding)
2452      struct coding_system *coding;
2453 {
2454   const unsigned char *src = coding->source + coding->consumed;
2455   const unsigned char *src_end = coding->source + coding->src_bytes;
2456   const unsigned char *src_base;
2457   int *charbuf = coding->charbuf + coding->charbuf_used;
2458   /* We may produce two annotations (charset and composition) in one
2459      loop and one more charset annotation at the end.  */
2460   int *charbuf_end
2461     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2462   int consumed_chars = 0, consumed_chars_base;
2463   int multibytep = coding->src_multibyte;
2464   Lisp_Object attrs, charset_list;
2465   int char_offset = coding->produced_char;
2466   int last_offset = char_offset;
2467   int last_id = charset_ascii;
2468   int eol_crlf =
2469     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2470   int byte_after_cr = -1;
2471   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2472
2473   CODING_GET_INFO (coding, attrs, charset_list);
2474
  /* A composition was still open when the previous call returned: put
     the data saved in the carryover array back at the head of the
     output.  */
2475   if (cmp_status->state != COMPOSING_NO)
2476     {
2477       int i;
2478
2479       for (i = 0; i < cmp_status->length; i++)
2480         *charbuf++ = cmp_status->carryover[i];
2481       coding->annotated = 1;
2482     }
2483
2484   while (1)
2485     {
2486       int c, id;
2487
2488       src_base = src;
2489       consumed_chars_base = consumed_chars;
2490
      /* Stop when the reserved headroom of the output buffer is
         reached.  If a byte following a CR was already read ahead,
         step SRC_BASE back so that this byte is not reported as
         consumed.  */
2491       if (charbuf >= charbuf_end)
2492         {
2493           if (byte_after_cr >= 0)
2494             src_base--;
2495           break;
2496         }
2497
2498       if (byte_after_cr >= 0)
2499         c = byte_after_cr, byte_after_cr = -1;
2500       else
2501         ONE_MORE_BYTE (c);
2502
      /* Both a negative C (presumably a character that ONE_MORE_BYTE
         delivers negated when the source is multibyte -- TODO confirm
         against the macro) and the byte 0x80 end a composition in
         progress.  A negative C is then stored negated as one
         character, whereas 0x80 starts a new composition.  */
2503       if (c < 0 || c == 0x80)
2504         {
2505           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2506           if (c < 0)
2507             {
2508               *charbuf++ = -c;
2509               char_offset++;
2510             }
2511           else
2512             DECODE_EMACS_MULE_COMPOSITION_START ();
2513           continue;
2514         }
2515
2516       if (c < 0x80)
2517         {
          /* With DOS end-of-line conversion, read the byte after a CR
             right away; it is handed back as C by the next
             iteration.  */
2518           if (eol_crlf && c == '\r')
2519             ONE_MORE_BYTE (byte_after_cr);
2520           id = charset_ascii;
2521           if (cmp_status->state != COMPOSING_NO)
2522             {
2523               if (cmp_status->old_form)
2524                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2525               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2526                 cmp_status->ncomps--;
2527             }
2528         }
2529       else
2530         {
2531           int nchars, nbytes;
2532           /* emacs_mule_char can load a charset map from a file, which
2533              allocates a large structure and might cause buffer text
2534              to be relocated as result.  Thus, we need to remember the
2535              original pointer to buffer text, and fixup all related
2536              pointers after the call.  */
2537           const unsigned char *orig = coding->source;
2538           EMACS_INT offset;
2539
2540           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2541                                cmp_status);
2542           offset = coding->source - orig;
2543           if (offset)
2544             {
2545               src += offset;
2546               src_base += offset;
2547               src_end += offset;
2548             }
          /* A result of -1 is handled as an invalid code and -2 ends
             the loop; any other negative value is processed below.  */
2549           if (c < 0)
2550             {
2551               if (c == -1)
2552                 goto invalid_code;
2553               if (c == -2)
2554                 break;
2555             }
2556           src = src_base + nbytes;
2557           consumed_chars = consumed_chars_base + nchars;
2558           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2559             cmp_status->ncomps -= nchars;
2560         }
2561
2562       /* Now if C >= 0, we found a normally encoded character, if C <
2563          0, we found an old-style composition component character or
2564          rule.  */
2565
2566       if (cmp_status->state == COMPOSING_NO)
2567         {
          /* Outside a composition: when the charset changes, close the
             run of the previous charset with a charset annotation
             (runs of ASCII get none) and start a new run.  */
2568           if (last_id != id)
2569             {
2570               if (last_id != charset_ascii)
2571                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2572                                   last_id);
2573               last_id = id;
2574               last_offset = char_offset;
2575             }
2576           *charbuf++ = c;
2577           char_offset++;
2578         }
2579       else if (cmp_status->state == COMPOSING_CHAR)
2580         {
2581           if (cmp_status->old_form)
2582             {
2583               if (c >= 0)
2584                 {
2585                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2586                   *charbuf++ = c;
2587                   char_offset++;
2588                 }
2589               else
2590                 {
2591                   *charbuf++ = -c;
2592                   cmp_status->nchars++;
2593                   cmp_status->length++;
2594                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2595                     EMACS_MULE_COMPOSITION_END ();
2596                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2597                     cmp_status->state = COMPOSING_RULE;
2598                 }
2599             }
2600           else
2601             {
2602               *charbuf++ = c;
2603               cmp_status->length++;
2604               cmp_status->nchars--;
2605               if (cmp_status->nchars == 0)
2606                 EMACS_MULE_COMPOSITION_END ();
2607             }
2608         }
2609       else if (cmp_status->state == COMPOSING_RULE)
2610         {
2611           int rule;
2612
2613           if (c >= 0)
2614             {
2615               EMACS_MULE_COMPOSITION_END ();
2616               *charbuf++ = c;
2617               char_offset++;
2618             }
2619           else
2620             {
2621               c = -c;
2622               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2623               if (rule < 0)
2624                 goto invalid_code;
2625               *charbuf++ = -2;
2626               *charbuf++ = rule;
2627               cmp_status->length += 2;
2628               cmp_status->state = COMPOSING_CHAR;
2629             }
2630         }
2631       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2632         {
2633           *charbuf++ = c;
2634           cmp_status->length++;
2635           if (cmp_status->ncomps == 0)
2636             cmp_status->state = COMPOSING_CHAR;
2637           else if (cmp_status->ncomps > 0)
2638             {
2639               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2640                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2641             }
2642           else
2643             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2644         }
2645       else                      /* COMPOSING_COMPONENT_RULE */
2646         {
2647           int rule;
2648
2649           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2650           if (rule < 0)
2651             goto invalid_code;
2652           *charbuf++ = -2;
2653           *charbuf++ = rule;
2654           cmp_status->length += 2;
2655           cmp_status->ncomps--;
2656           if (cmp_status->ncomps > 0)
2657             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2658           else
2659             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2660         }
2661       continue;
2662
      /* Start over from the beginning of the current byte sequence.
         NOTE(review): no `goto retry' is visible in this function;
         the label is possibly the target of a macro defined
         elsewhere -- confirm.  */
2663     retry:
2664       src = src_base;
2665       consumed_chars = consumed_chars_base;
2666       continue;
2667
      /* Close any open composition, emit the first byte of the
         offending sequence by itself (as ASCII, or as a raw 8-bit
         character otherwise), count one error, and go on decoding
         right after that byte.  */
2668     invalid_code:
2669       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2670       src = src_base;
2671       consumed_chars = consumed_chars_base;
2672       ONE_MORE_BYTE (c);
2673       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2674       char_offset++;
2675       coding->errors++;
2676     }
2677
  /* Reached when the loop above is left with `break', and presumably
     also from ONE_MORE_BYTE when the source is exhausted.  */
2678  no_more_source:
2679   if (cmp_status->state != COMPOSING_NO)
2680     {
2681       if (coding->mode & CODING_MODE_LAST_BLOCK)
2682         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2683       else
2684         {
2685           int i;
2686
          /* Take the unfinished composition back out of the output
             and keep it for the next call.  */
2687           charbuf -= cmp_status->length;
2688           for (i = 0; i < cmp_status->length; i++)
2689             cmp_status->carryover[i] = charbuf[i];
2690         }
2691     }
2692   if (last_id != charset_ascii)
2693     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2694   coding->consumed_char += consumed_chars_base;
2695   coding->consumed = src_base - coding->source;
2696   coding->charbuf_used = charbuf - coding->charbuf;
2697 }
2698
2699
/* Store in CODES (an array of at least two elements) the leading
   code(s) that introduce a character of the charset whose emacs-mule
   ID is ID.

   An ID below 0xA0 is itself the only leading code; CODES[1] is then
   set to 0 to tell that there is no second one.  A larger ID is
   stored in CODES[1] and preceded by a code chosen by range:
   0x9A for 0xA0..0xDF, 0x9B for 0xE0..0xEF, 0x9C for 0xF0..0xF4,
   and 0x9D for 0xF5 and above.

   ID is evaluated exactly once.  The expansion is one statement
   without a trailing semicolon, so the invocation must be followed by
   `;' and can be used wherever a statement is allowed, including as
   the body of an unbraced `if' that has an `else'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)                     \
  do {                                                          \
    const int leading_id_ = (id);                               \
                                                                \
    if (leading_id_ < 0xA0)                                     \
      (codes)[0] = leading_id_, (codes)[1] = 0;                 \
    else if (leading_id_ < 0xE0)                                \
      (codes)[0] = 0x9A, (codes)[1] = leading_id_;              \
    else if (leading_id_ < 0xF0)                                \
      (codes)[0] = 0x9B, (codes)[1] = leading_id_;              \
    else if (leading_id_ < 0xF5)                                \
      (codes)[0] = 0x9C, (codes)[1] = leading_id_;              \
    else                                                        \
      (codes)[0] = 0x9D, (codes)[1] = leading_id_;              \
  } while (0)
2713
2714
/* Encode the characters in CODING->charbuf (CODING->charbuf_used
   elements) in the emacs-mule format, appending the bytes to
   CODING->destination at offset CODING->produced.

   The charset list of the coding system is first forced to
   Vemacs_mule_charset_list.  Negative elements of the character
   buffer introduce annotations: a charset annotation selects a
   preferred charset for the characters that follow (only if that
   charset is a member of the charset list), a composition annotation
   is skipped because encoding of compositions is not yet implemented,
   and any other annotation aborts.

   ASCII characters are emitted as they are and raw 8-bit characters
   as their byte.  Every other character is emitted as the leading
   code(s) of its charset followed by one or two code bytes with the
   high bit set.  A character that belongs to none of the charsets is
   replaced by CODING->default_char.

   Records CODING_RESULT_SUCCESS, updates CODING->produced_char and
   CODING->produced, and always returns 0.  */
2715 static int
2716 encode_coding_emacs_mule (coding)
2717      struct coding_system *coding;
2718 {
2719   int multibytep = coding->dst_multibyte;
2720   int *charbuf = coding->charbuf;
2721   int *charbuf_end = charbuf + coding->charbuf_used;
2722   unsigned char *dst = coding->destination + coding->produced;
2723   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2724   int safe_room = 8;
2725   int produced_chars = 0;
2726   Lisp_Object attrs, charset_list;
2727   int c;
2728   int preferred_charset_id = -1;
2729
2730   CODING_GET_INFO (coding, attrs, charset_list);
2731   if (! EQ (charset_list, Vemacs_mule_charset_list))
2732     {
2733       CODING_ATTR_CHARSET_LIST (attrs)
2734         = charset_list = Vemacs_mule_charset_list;
2735     }
2736
2737   while (charbuf < charbuf_end)
2738     {
2739       ASSURE_DESTINATION (safe_room);
2740       c = *charbuf++;
2741
2742       if (c < 0)
2743         {
2744           /* Handle an annotation.  */
2745           switch (*charbuf)
2746             {
2747             case CODING_ANNOTATE_COMPOSITION_MASK:
2748               /* Not yet implemented.  */
2749               break;
2750             case CODING_ANNOTATE_CHARSET_MASK:
              /* NOTE(review): the charset ID is taken from the fourth
                 element after the annotation's first one; confirm
                 this offset against the layout that ADD_CHARSET_DATA
                 (defined elsewhere) produces.  */
2751               preferred_charset_id = charbuf[3];
2752               if (preferred_charset_id >= 0
2753                   && NILP (Fmemq (make_number (preferred_charset_id),
2754                                   charset_list)))
2755                 preferred_charset_id = -1;
2756               break;
2757             default:
2758               abort ();
2759             }
          /* -C elements make up the annotation and its first one has
             been consumed already; skip the remaining ones.  */
2760           charbuf += -c - 1;
2761           continue;
2762         }
2763
2764       if (ASCII_CHAR_P (c))
2765         EMIT_ONE_ASCII_BYTE (c);
2766       else if (CHAR_BYTE8_P (c))
2767         {
2768           c = CHAR_TO_BYTE8 (c);
2769           EMIT_ONE_BYTE (c);
2770         }
2771       else
2772         {
2773           struct charset *charset;
2774           unsigned code;
2775           int dimension;
2776           int emacs_mule_id;
2777           unsigned char leading_codes[2];
2778
          /* Try the preferred charset first, then fall back to a
             search through the whole charset list.  */
2779           if (preferred_charset_id >= 0)
2780             {
2781               charset = CHARSET_FROM_ID (preferred_charset_id);
2782               if (CHAR_CHARSET_P (c, charset))
2783                 code = ENCODE_CHAR (charset, c);
2784               else
2785                 charset = char_charset (c, charset_list, &code);
2786             }
2787           else
2788             charset = char_charset (c, charset_list, &code);
2789           if (! charset)
2790             {
2791               c = coding->default_char;
2792               if (ASCII_CHAR_P (c))
2793                 {
2794                   EMIT_ONE_ASCII_BYTE (c);
2795                   continue;
2796                 }
              /* NOTE(review): the result of this second lookup is
                 used below without a null check; presumably the
                 default character always belongs to one of the
                 emacs-mule charsets -- confirm.  */
2797               charset = char_charset (c, charset_list, &code);
2798             }
2799           dimension = CHARSET_DIMENSION (charset);
2800           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2801           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2802           EMIT_ONE_BYTE (leading_codes[0]);
2803           if (leading_codes[1])
2804             EMIT_ONE_BYTE (leading_codes[1]);
2805           if (dimension == 1)
2806             EMIT_ONE_BYTE (code | 0x80);
2807           else
2808             {
2809               code |= 0x8080;
2810               EMIT_ONE_BYTE (code >> 8);
2811               EMIT_ONE_BYTE (code & 0xFF);
2812             }
2813         }
2814     }
2815   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2816   coding->produced_char += produced_chars;
2817   coding->produced = dst - coding->destination;
2818   return 0;
2819 }
2820
2821 \f
2822 /*** 7. ISO2022 handlers ***/
2823
2824 /* The following note describes the coding system ISO2022 briefly.
2825    Since the intention of this note is to help understand the
2826    functions in this file, some parts are NOT ACCURATE or are OVERLY
2827    SIMPLIFIED.  For thorough understanding, please refer to the
2828    original document of ISO2022.  This is equivalent to the standard
2829    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2830
2831    ISO2022 provides many mechanisms to encode several character sets
2832    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2833    is encoded using bytes less than 128.  This may make the encoded
2834    text a little bit longer, but the text passes more easily through
2835    several types of gateway, some of which strip off the MSB (Most
2836    Significant Bit).
2837
2838    There are two kinds of character sets: control character sets and
2839    graphic character sets.  The former contain control characters such
2840    as `newline' and `escape' to provide control functions (control
2841    functions are also provided by escape sequences).  The latter
2842    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2843    two control character sets and many graphic character sets.
2844
2845    Graphic character sets are classified into one of the following
2846    four classes, according to the number of bytes (DIMENSION) and
2847    number of characters in one dimension (CHARS) of the set:
2848    - DIMENSION1_CHARS94
2849    - DIMENSION1_CHARS96
2850    - DIMENSION2_CHARS94
2851    - DIMENSION2_CHARS96
2852
2853    In addition, each character set is assigned an identification tag,
2854    unique for each set, called the "final character" (denoted as <F>
2855    hereafter).  The <F> of each character set is decided by ECMA(*)
2856    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2857    (0x30..0x3F are for private use only).
2858
2859    Note (*): ECMA = European Computer Manufacturers Association
2860
2861    Here are examples of graphic character sets [NAME(<F>)]:
2862         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2863         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2864         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2865         o DIMENSION2_CHARS96 -- none for the moment
2866
2867    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2868         C0 [0x00..0x1F] -- control character plane 0
2869         GL [0x20..0x7F] -- graphic character plane 0
2870         C1 [0x80..0x9F] -- control character plane 1
2871         GR [0xA0..0xFF] -- graphic character plane 1
2872
2873    A control character set is directly designated and invoked to C0 or
2874    C1 by an escape sequence.  The most common case is that:
2875    - ISO646's  control character set is designated/invoked to C0, and
2876    - ISO6429's control character set is designated/invoked to C1,
2877    and usually these designations/invocations are omitted in encoded
2878    text.  In a 7-bit environment, only C0 can be used, and a control
2879    character for C1 is encoded by an appropriate escape sequence to
2880    fit into the environment.  All control characters for C1 are
2881    defined to have corresponding escape sequences.
2882
2883    A graphic character set is at first designated to one of four
2884    graphic registers (G0 through G3), then these graphic registers are
2885    invoked to GL or GR.  These designations and invocations can be
2886    done independently.  The most common case is that G0 is invoked to
2887    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2888    these invocations and designations are omitted in encoded text.
2889    In a 7-bit environment, only GL can be used.
2890
2891    When a graphic character set of CHARS94 is invoked to GL, codes
2892    0x20 and 0x7F of the GL area work as control characters SPACE and
2893    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2894    be used.
2895
2896    There are two ways of invocation: locking-shift and single-shift.
2897    With locking-shift, the invocation lasts until the next different
2898    invocation, whereas with single-shift, the invocation affects the
2899    following character only and doesn't affect the locking-shift
2900    state.  Invocations are done by the following control characters or
2901    escape sequences:
2902
2903    ----------------------------------------------------------------------
2904    abbrev  function                  cntrl escape seq   description
2905    ----------------------------------------------------------------------
2906    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2907    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2908    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2909    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2910    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2911    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2912    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2913    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2914    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2915    ----------------------------------------------------------------------
2916    (*) These are not used by any known coding system.
2917
2918    Control characters for these functions are defined by macros
2919    ISO_CODE_XXX in `coding.h'.
2920
2921    Designations are done by the following escape sequences:
2922    ----------------------------------------------------------------------
2923    escape sequence      description
2924    ----------------------------------------------------------------------
2925    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2926    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2927    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2928    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2929    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2930    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2931    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2932    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2933    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2934    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2935    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2936    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2937    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2938    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2939    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2940    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2941    ----------------------------------------------------------------------
2942
2943    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2944    of dimension 1, chars 94, and final character <F>, etc...
2945
2946    Note (*): Although these designations are not allowed in ISO2022,
2947    Emacs accepts them on decoding, and produces them on encoding
2948    CHARS96 character sets in a coding system which is characterized as
2949    7-bit environment, non-locking-shift, and non-single-shift.
2950
2951    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2952    '(' must be omitted.  We refer to this as "short-form" hereafter.
2953
2954    Now you may notice that there are a lot of ways of encoding the
2955    same multilingual text in ISO2022.  Actually, there exist many
2956    coding systems such as Compound Text (used in X11's inter client
2957    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2958    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2959    localized platforms), and all of these are variants of ISO2022.
2960
2961    In addition to the above, Emacs handles two more kinds of escape
2962    sequences: ISO6429's direction specification and Emacs' private
2963    sequence for specifying character composition.
2964
2965    ISO6429's direction specification takes the following form:
2966         o CSI ']'      -- end of the current direction
2967         o CSI '0' ']'  -- end of the current direction
2968         o CSI '1' ']'  -- start of left-to-right text
2969         o CSI '2' ']'  -- start of right-to-left text
2970    The control character CSI (0x9B: control sequence introducer) is
2971    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2972
2973    Character composition specification takes the following form:
2974         o ESC '0' -- start relative composition
2975         o ESC '1' -- end composition
2976         o ESC '2' -- start rule-base composition (*)
2977         o ESC '3' -- start relative composition with alternate chars  (**)
2978         o ESC '4' -- start rule-base composition with alternate chars  (**)
2979   Since these are not standard escape sequences of any ISO standard,
2980   the use of them with these meanings is restricted to Emacs only.
2981
2982   (*) This form is used only in Emacs 20.7 and older versions,
2983   but newer versions can safely decode it.
2984   (**) This form is used only in Emacs 21.1 and newer versions,
2985   and older versions can't decode it.
2986
2987   Here's a list of example usages of these composition escape
2988   sequences (categorized by `enum composition_method').
2989
2990   COMPOSITION_RELATIVE:
2991         ESC 0 CHAR [ CHAR ] ESC 1
2992   COMPOSITION_WITH_RULE:
2993         ESC 2 CHAR [ RULE CHAR ] ESC 1
2994   COMPOSITION_WITH_ALTCHARS:
2995         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2996   COMPOSITION_WITH_RULE_ALTCHARS:
2997         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2998
/* Table holding one `enum iso_code_class_type' value for each of the
   256 possible byte values.  Presumably it maps a byte to its ISO 2022
   code class; the code that fills and consults the table lies outside
   this part of the file.  */
2999 enum iso_code_class_type iso_code_class[256];
3000
/* Return nonzero if the charset whose ID is ID is safe for the coding
   system CODING, that is, if ID lies within the range covered by
   CODING->safe_charsets (0 .. CODING->max_charset_id) and the entry
   for ID in that table is not 255, the value the table is filled with
   before the supported charsets are entered.

   A negative ID is never safe and is rejected before the table is
   indexed: lookups in iso_charset_table can yield a negative value
   for an unknown final character, and such a value must not be used
   as an index.  Note that ID is evaluated more than once.  */

#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
3004
3005
/* Nonzero if CODING_ISO_INITIAL, applied to the coding system
   registered for coding category CATEGORY and to index 1, yields a
   non-negative value.
   NOTE(review): presumably index 1 stands for graphic register G1 and
   a non-negative value means that a charset is initially designated
   to it, which is what makes the shift-out code meaningful -- confirm
   against the definition of CODING_ISO_INITIAL.  */
3006 #define SHIFT_OUT_OK(category)  \
3007   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
3008
3009 static void
3010 setup_iso_safe_charsets (attrs)
3011      Lisp_Object attrs;
3012 {
3013   Lisp_Object charset_list, safe_charsets;
3014   Lisp_Object request;
3015   Lisp_Object reg_usage;
3016   Lisp_Object tail;
3017   int reg94, reg96;
3018   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
3019   int max_charset_id;
3020
3021   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3022   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
3023       && ! EQ (charset_list, Viso_2022_charset_list))
3024     {
3025       CODING_ATTR_CHARSET_LIST (attrs)
3026         = charset_list = Viso_2022_charset_list;
3027       ASET (attrs, coding_attr_safe_charsets, Qnil);
3028     }
3029
3030   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3031     return;
3032
3033   max_charset_id = 0;
3034   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3035     {
3036       int id = XINT (XCAR (tail));
3037       if (max_charset_id < id)
3038         max_charset_id = id;
3039     }
3040
3041   safe_charsets = make_uninit_string (max_charset_id + 1);
3042   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3043   request = AREF (attrs, coding_attr_iso_request);
3044   reg_usage = AREF (attrs, coding_attr_iso_usage);
3045   reg94 = XINT (XCAR (reg_usage));
3046   reg96 = XINT (XCDR (reg_usage));
3047
3048   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3049     {
3050       Lisp_Object id;
3051       Lisp_Object reg;
3052       struct charset *charset;
3053
3054       id = XCAR (tail);
3055       charset = CHARSET_FROM_ID (XINT (id));
3056       reg = Fcdr (Fassq (id, request));
3057       if (! NILP (reg))
3058         SSET (safe_charsets, XINT (id), XINT (reg));
3059       else if (charset->iso_chars_96)
3060         {
3061           if (reg96 < 4)
3062             SSET (safe_charsets, XINT (id), reg96);
3063         }
3064       else
3065         {
3066           if (reg94 < 4)
3067             SSET (safe_charsets, XINT (id), reg94);
3068         }
3069     }
3070   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3071 }
3072
3073
3074 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3075    Check if a text is encoded in one of ISO-2022 based coding systems.
3076    If it is, return 1, else return 0.  */
3077
3078 static int
3079 detect_coding_iso_2022 (coding, detect_info)
3080      struct coding_system *coding;
3081      struct coding_detection_info *detect_info;
3082 {
3083   const unsigned char *src = coding->source, *src_base = src;
3084   const unsigned char *src_end = coding->source + coding->src_bytes;
3085   int multibytep = coding->src_multibyte;
3086   int single_shifting = 0;
3087   int id;
3088   int c, c1;
3089   int consumed_chars = 0;
3090   int i;
3091   int rejected = 0;
3092   int found = 0;
3093   int composition_count = -1;
3094
3095   detect_info->checked |= CATEGORY_MASK_ISO;
3096
3097   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3098     {
3099       struct coding_system *this = &(coding_categories[i]);
3100       Lisp_Object attrs, val;
3101
3102       if (this->id < 0)
3103         continue;
3104       attrs = CODING_ID_ATTRS (this->id);
3105       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3106           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3107         setup_iso_safe_charsets (attrs);
3108       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3109       this->max_charset_id = SCHARS (val) - 1;
3110       this->safe_charsets = SDATA (val);
3111     }
3112
3113   /* A coding system of this category is always ASCII compatible.  */
3114   src += coding->head_ascii;
3115
3116   while (rejected != CATEGORY_MASK_ISO)
3117     {
3118       src_base = src;
3119       ONE_MORE_BYTE (c);
3120       switch (c)
3121         {
3122         case ISO_CODE_ESC:
3123           if (inhibit_iso_escape_detection)
3124             break;
3125           single_shifting = 0;
3126           ONE_MORE_BYTE (c);
3127           if (c >= '(' && c <= '/')
3128             {
3129               /* Designation sequence for a charset of dimension 1.  */
3130               ONE_MORE_BYTE (c1);
3131               if (c1 < ' ' || c1 >= 0x80
3132                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3133                 /* Invalid designation sequence.  Just ignore.  */
3134                 break;
3135             }
3136           else if (c == '$')
3137             {
3138               /* Designation sequence for a charset of dimension 2.  */
3139               ONE_MORE_BYTE (c);
3140               if (c >= '@' && c <= 'B')
3141                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3142                 id = iso_charset_table[1][0][c];
3143               else if (c >= '(' && c <= '/')
3144                 {
3145                   ONE_MORE_BYTE (c1);
3146                   if (c1 < ' ' || c1 >= 0x80
3147                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3148                     /* Invalid designation sequence.  Just ignore.  */
3149                     break;
3150                 }
3151               else
3152                 /* Invalid designation sequence.  Just ignore it.  */
3153                 break;
3154             }
3155           else if (c == 'N' || c == 'O')
3156             {
3157               /* ESC <Fe> for SS2 or SS3.  */
3158               single_shifting = 1;
3159               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160               break;
3161             }
3162           else if (c == '1')
3163             {
3164               /* End of composition.  */
3165               if (composition_count < 0
3166                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3167                 /* Invalid */
3168                 break;
3169               composition_count = -1;
3170               found |= CATEGORY_MASK_ISO;
3171             }
3172           else if (c >= '0' && c <= '4')
3173             {
3174               /* ESC <Fp> for start/end composition.  */
3175               composition_count = 0;
3176               break;
3177             }
3178           else
3179             {
3180               /* Invalid escape sequence.  Just ignore it.  */
3181               break;
3182             }
3183
3184           /* We found a valid designation sequence for CHARSET.  */
3185           rejected |= CATEGORY_MASK_ISO_8BIT;
3186           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3187                               id))
3188             found |= CATEGORY_MASK_ISO_7;
3189           else
3190             rejected |= CATEGORY_MASK_ISO_7;
3191           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3192                               id))
3193             found |= CATEGORY_MASK_ISO_7_TIGHT;
3194           else
3195             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3196           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3197                               id))
3198             found |= CATEGORY_MASK_ISO_7_ELSE;
3199           else
3200             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3201           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3202                               id))
3203             found |= CATEGORY_MASK_ISO_8_ELSE;
3204           else
3205             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3206           break;
3207
3208         case ISO_CODE_SO:
3209         case ISO_CODE_SI:
3210           /* Locking shift out/in.  */
3211           if (inhibit_iso_escape_detection)
3212             break;
3213           single_shifting = 0;
3214           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3215           break;
3216
3217         case ISO_CODE_CSI:
3218           /* Control sequence introducer.  */
3219           single_shifting = 0;
3220           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3221           found |= CATEGORY_MASK_ISO_8_ELSE;
3222           goto check_extra_latin;
3223
3224         case ISO_CODE_SS2:
3225         case ISO_CODE_SS3:
3226           /* Single shift.   */
3227           if (inhibit_iso_escape_detection)
3228             break;
3229           single_shifting = 0;
3230           rejected |= CATEGORY_MASK_ISO_7BIT;
3231           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3232               & CODING_ISO_FLAG_SINGLE_SHIFT)
3233             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3234           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3235               & CODING_ISO_FLAG_SINGLE_SHIFT)
3236             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3237           if (single_shifting)
3238             break;
3239           goto check_extra_latin;
3240
3241         default:
3242           if (c < 0)
3243             continue;
3244           if (c < 0x80)
3245             {
3246               if (composition_count >= 0)
3247                 composition_count++;
3248               single_shifting = 0;
3249               break;
3250             }
3251           if (c >= 0xA0)
3252             {
3253               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3254               found |= CATEGORY_MASK_ISO_8_1;
3255               /* Check the length of succeeding codes of the range
3256                  0xA0..0xFF.  If the byte length is even, we include
3257                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3258                  only when we are not single shifting.  */
3259               if (! single_shifting
3260                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3261                 {
3262                   int i = 1;
3263                   while (src < src_end)
3264                     {
3265                       src_base = src;
3266                       ONE_MORE_BYTE (c);
3267                       if (c < 0xA0)
3268                         {
3269                           src = src_base;
3270                           break;
3271                         }
3272                       i++;
3273                     }
3274
3275                   if (i & 1 && src < src_end)
3276                     {
3277                       rejected |= CATEGORY_MASK_ISO_8_2;
3278                       if (composition_count >= 0)
3279                         composition_count += i;
3280                     }
3281                   else
3282                     {
3283                       found |= CATEGORY_MASK_ISO_8_2;
3284                       if (composition_count >= 0)
3285                         composition_count += i / 2;
3286                     }
3287                 }
3288               break;
3289             }
3290         check_extra_latin:
3291           single_shifting = 0;
3292           if (! VECTORP (Vlatin_extra_code_table)
3293               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3294             {
3295               rejected = CATEGORY_MASK_ISO;
3296               break;
3297             }
3298           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3299               & CODING_ISO_FLAG_LATIN_EXTRA)
3300             found |= CATEGORY_MASK_ISO_8_1;
3301           else
3302             rejected |= CATEGORY_MASK_ISO_8_1;
3303           rejected |= CATEGORY_MASK_ISO_8_2;
3304         }
3305     }
3306   detect_info->rejected |= CATEGORY_MASK_ISO;
3307   return 0;
3308
3309  no_more_source:
3310   detect_info->rejected |= rejected;
3311   detect_info->found |= (found & ~rejected);
3312   return 1;
3313 }
3314
3315
3316 /* Set designation state into CODING: designate to REG the charset
3317    looked up by ISO_CHARSET_TABLE (DIM, CHARS_96, FINAL).  A variable
        named `coding' must be visible where the macro is expanded.

        If FINAL is outside '0'..127, if the table lookup yields a
        negative id, or if the charset is not SAFE_CHARSET_P for CODING,
        the designation of REG is set to -2 and CHARS_96 is set to -1;
        nothing else is done.

        Otherwise the id is stored as the designation of REG, except
        that jisx0201-roman is replaced by ASCII when
        CODING_ISO_FLAG_USE_ROMAN is set, and jisx0208-1978 is replaced
        by jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        CHARS_96 must be an lvalue: it is set to -1 if the escape
        sequence should be kept.  The arguments are evaluated more than
        once and are not parenthesized in the expansion.  */
3318 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3319   do {                                                                  \
3320     int id, prev;                                                       \
3321                                                                         \
3322     if (final < '0' || final >= 128                                     \
3323         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3324         || !SAFE_CHARSET_P (coding, id))                                \
3325       {                                                                 \
3326         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3327         chars_96 = -1;                                                  \
3328         break;                                                          \
3329       }                                                                 \
3330     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3331     if (id == charset_jisx0201_roman)                                   \
3332       {                                                                 \
3333         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3334           id = charset_ascii;                                           \
3335       }                                                                 \
3336     else if (id == charset_jisx0208_1978)                               \
3337       {                                                                 \
3338         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3339           id = charset_jisx0208;                                        \
3340       }                                                                 \
3341     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3342     /* If there was an invalid designation to REG previously, and this  \
3343        designation is ASCII to REG, we should keep this designation     \
3344        sequence.  */                                                    \
3345     if (prev == -2 && id == charset_ascii)                              \
3346       chars_96 = -1;                                                    \
3347   } while (0)
3348
3349
3350 /* Handle these composition sequences (ALT: alternate char):
3351
3352    (1) relative composition: ESC 0 CHAR ... ESC 1
3353    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3354    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3355    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3356
3357    When the start sequence (ESC 0/2/3/4) is found, this annotation
3358    header is produced.
3359
3360         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3361
3362    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3363    produced until the end sequence (ESC 1) is found:
3364
3365    (1) CHAR ... CHAR
3366    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3367    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3368    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3369
3370    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3371    annotation header are updated as below:
3372
3373    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3374    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3375    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3376    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3377
3378    If an error is found while composing, the annotation header is
3379    changed to:
3380
3381         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3382
3383    and the sequence [ -2 DECODED-RULE ] is changed to the original
3384    byte sequence as below:
3385         o the original byte sequence is B: [ B -1 ]
3386         o the original byte sequence is B1 B2: [ B1 B2 ]
3387    and the sequence [ -1 -1 ] is changed to the original byte
3388    sequence:
3389         [ ESC '0' ]
3390 */
3391
3392 /* Decode a composition rule C1 and maybe one more byte from the
3393    source, and set RULE to the encoded composition rule, NBYTES to the
3394    length of the composition rule.  If the rule is invalid, set RULE
3395    to some negative value.

        The first byte is taken from the variable `c1' of the expansion
        context; RULE and NBYTES must be int lvalues.

        If C1 is less than 32, RULE is left negative and NBYTES is not
        assigned.  If C1 - 32 is less than 81 the rule is in the old
        one-byte format: C1 - 32 is split as GREF * 9 + NREF, and a
        value of 4 for either part is mapped to 10 before the pair is
        passed to COMPOSITION_ENCODE_RULE.  Otherwise the rule is in the
        new two-byte format: a second byte is read with ONE_MORE_BYTE,
        the pair (C1 - 32 - 81, second byte - 32) is passed to
        COMPOSITION_ENCODE_RULE, and 0x100 is added to a non-negative
        result.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
        can jump out of the expansion when the source is exhausted --
        confirm before relying on NBYTES being set in that case.  */
3396
3397 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3398   do {                                                                  \
3399     rule = c1 - 32;                                                     \
3400     if (rule < 0)                                                       \
3401       break;                                                            \
3402     if (rule < 81)              /* old format (before ver.21) */        \
3403       {                                                                 \
3404         int gref = (rule) / 9;                                          \
3405         int nref = (rule) % 9;                                          \
3406         if (gref == 4) gref = 10;                                       \
3407         if (nref == 4) nref = 10;                                       \
3408         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3409         nbytes = 1;                                                     \
3410       }                                                                 \
3411     else                        /* new format (after ver.21) */         \
3412       {                                                                 \
3413         int c;                                                          \
3414                                                                         \
3415         ONE_MORE_BYTE (c);                                              \
3416         rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32);             \
3417         if (rule >= 0)                                                  \
3418           rule += 0x100;   /* to distinguish it from the old format */  \
3419         nbytes = 2;                                                     \
3420       }                                                                 \
3421   } while (0)
3422
/* Turn a decoded composition rule RULE back into its original byte
   form, storing the result in charbuf[idx] and charbuf[idx + 1].

   `charbuf', `idx' and `new_chars' are variables of the expansion
   context.  RULE is evaluated several times; all reads of it happen
   before charbuf[idx + 1] is overwritten, so RULE may be
   charbuf[idx + 1] itself (as in finish_composition).

   If RULE is less than 0x100 (old format) a single byte
   32 + GREF * 9 + NREF is stored, with a value of 10 for either part
   mapped back to 4, followed by a -1 filler, and new_chars is
   incremented by one.  Otherwise (new format) the two bytes
   32 + 81 + GREF and 32 + NREF are stored and new_chars is
   incremented by two.

   NOTE(review): GREF and NREF are recovered as (RULE % 0x100) / 12
   and (RULE % 0x100) % 12, which presumably mirrors how
   COMPOSITION_ENCODE_RULE packs them -- confirm against its
   definition.  */
3423 #define ENCODE_COMPOSITION_RULE(rule)                           \
3424   do {                                                          \
3425     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3426                                                                 \
3427     if (rule < 0x100)           /* old format */                \
3428       {                                                         \
3429         if (gref == 10) gref = 4;                               \
3430         if (nref == 10) nref = 4;                               \
3431         charbuf[idx] = 32 + gref * 9 + nref;                    \
3432         charbuf[idx + 1] = -1;                                  \
3433         new_chars++;                                            \
3434       }                                                         \
3435     else                                /* new format */        \
3436       {                                                         \
3437         charbuf[idx] = 32 + 81 + gref;                          \
3438         charbuf[idx + 1] = 32 + nref;                           \
3439         new_chars += 2;                                         \
3440       }                                                         \
3441   } while (0)
3442
3443 /* Finish the current composition as invalid.

        The composition data already stored in front of CHARBUF is
        rewritten in place into codes of the original escape sequences.
        CHARBUF must point just past that data: the data starts at
        CHARBUF[- CMP_STATUS->length] and is addressed with negative
        indices.

        The first five slots are overwritten with
        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ], the second element being chosen
        from CMP_STATUS->method.  For every method except
        COMPOSITION_RELATIVE the remaining slots are then scanned: each
        [ -2 RULE ] pair is converted by ENCODE_COMPOSITION_RULE and each
        [ -1 -1 ] pair is replaced by [ ESC '0' ].

        CMP_STATUS->state is set to COMPOSING_NO.  The return value is
        CMP_STATUS->nchars plus one or two for each converted rule and
        two for each restored [ ESC '0' ].  */
3444
3445 static int finish_composition P_ ((int *, struct composition_status *));
3446
3447 static int
3448 finish_composition (charbuf, cmp_status)
3449      int *charbuf;
3450      struct composition_status *cmp_status;
3451 {
       /* Negative index of the first slot of the composition data.  */
3452   int idx = - cmp_status->length;
3453   int new_chars;
3454
3455   /* Recover the original ESC sequence */
3456   charbuf[idx++] = ISO_CODE_ESC;
3457   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3458                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3459                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3460                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3461                     : '4');
       /* Fill the three remaining header slots.  */
3462   charbuf[idx++] = -2;
3463   charbuf[idx++] = 0;
3464   charbuf[idx++] = -1;
3465   new_chars = cmp_status->nchars;
       /* Only these methods can contain [ -2 RULE ] or [ -1 -1 ] pairs.
          IDX now points past the header, so the -2 stored above is not
          mistaken for a rule marker.  */
3466   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3467     for (; idx < 0; idx++)
3468       {
3469         int elt = charbuf[idx];
3470
3471         if (elt == -2)
3472           {
                 /* Rewrites charbuf[idx] and charbuf[idx + 1] and
                    updates new_chars; skip the second slot.  */
3473             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3474             idx++;
3475           }
3476         else if (elt == -1)
3477           {
                 /* The [ -1 -1 ] pair stands for ESC 0.  */
3478             charbuf[idx++] = ISO_CODE_ESC;
3479             charbuf[idx] = '0';
3480             new_chars += 2;
3481           }
3482       }
3483   cmp_status->state = COMPOSING_NO;
3484   return new_chars;
3485 }
3486
3487 /* If characters are under composition (cmp_status->state is not
        COMPOSING_NO), finish the composition as invalid by calling
        finish_composition, and add its return value to char_offset.
        `cmp_status', `charbuf' and `char_offset' are variables of the
        expansion context.  */
3488 #define MAYBE_FINISH_COMPOSITION()                              \
3489   do {                                                          \
3490     if (cmp_status->state != COMPOSING_NO)                      \
3491       char_offset += finish_composition (charbuf, cmp_status);  \
3492   } while (0)
3493
3494 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3495
3496    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3497    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3498    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3499    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3500
        C1 is the byte following ESC.  `cmp_status', `charbuf',
        `char_offset' and `coding' are variables of the expansion
        context.

        If C1 is '0' and an altchar composition is in progress (method
        COMPOSITION_WITH_ALTCHARS in state COMPOSING_COMPONENT_CHAR, or
        method COMPOSITION_WITH_RULE_ALTCHARS in state
        COMPOSING_COMPONENT_RULE), this ESC 0 only separates the
        alternate chars from the composed chars: store [ -1 -1 ], add 2
        to cmp_status->length and switch the state to COMPOSING_CHAR.

        Otherwise finish any composition in progress as invalid, record
        the method selected by C1 and the initial state (COMPOSING_CHAR
        for '0' and '2', COMPOSING_COMPONENT_CHAR for '3' and '4'), reset
        the counters, set coding->annotated, and

3501    Produce this annotation sequence now:
3502
3503    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3504 */
3505
3506 #define DECODE_COMPOSITION_START(c1)                                       \
3507   do {                                                                     \
3508     if (c1 == '0'                                                          \
3509         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3510              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3511             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3512                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3513       {                                                                    \
3514         *charbuf++ = -1;                                                   \
3515         *charbuf++= -1;                                                    \
3516         cmp_status->state = COMPOSING_CHAR;                                \
3517         cmp_status->length += 2;                                           \
3518       }                                                                    \
3519     else                                                                   \
3520       {                                                                    \
3521         MAYBE_FINISH_COMPOSITION ();                                       \
3522         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3523                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3524                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3525                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3526         cmp_status->state                                                  \
3527           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3528         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3529         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3530         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3531         coding->annotated = 1;                                             \
3532       }                                                                    \
3533   } while (0)
3534
3535
3536 /* Handle composition end sequence ESC 1.

        `cmp_status', `charbuf' and `char_offset' are variables of the
        expansion context, which must also provide the label
        `invalid_code'.

        The sequence is rejected if no composed char has been stored
        (cmp_status->nchars is 0), or if the state does not fit the
        method: for COMPOSITION_WITH_RULE the state must not be
        COMPOSING_CHAR (STORE_COMPOSITION_CHAR advances the state after
        each char of that method), for every other method it must be
        COMPOSING_CHAR.  On rejection the composition is finished as
        invalid and control jumps to `invalid_code'.

        Otherwise the annotation header at charbuf[- cmp_status->length]
        is completed: for the altchar methods the stored (negative)
        length is decreased further by ncomps + 2
        (COMPOSITION_WITH_ALTCHARS) or ncomps * 3
        (COMPOSITION_WITH_RULE_ALTCHARS), and the third header slot is
        set to nchars.  Then nchars is added to char_offset and the
        state is reset to COMPOSING_NO.  */
3537
3538 #define DECODE_COMPOSITION_END()                                        \
3539   do {                                                                  \
3540     if (cmp_status->nchars == 0                                         \
3541         || ((cmp_status->state == COMPOSING_CHAR)                       \
3542             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3543       {                                                                 \
3544         MAYBE_FINISH_COMPOSITION ();                                    \
3545         goto invalid_code;                                              \
3546       }                                                                 \
3547     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3548       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3549     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3550       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3551     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3552     char_offset += cmp_status->nchars;                                  \
3553     cmp_status->state = COMPOSING_NO;                                   \
3554   } while (0)
3555
3556 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The pair [ -2 RULE ] is appended through `charbuf', 2 is added
        to cmp_status->length, and cmp_status->state is decremented.
        `charbuf' and `cmp_status' are variables of the expansion
        context.

        NOTE(review): the decrement presumably moves the state from a
        "rule expected" value back to the matching "char expected"
        value, relying on the order of the state constants -- confirm
        against their definition.  */
3557
3558 #define STORE_COMPOSITION_RULE(rule)    \
3559   do {                                  \
3560     *charbuf++ = -2;                    \
3561     *charbuf++ = rule;                  \
3562     cmp_status->length += 2;            \
3563     cmp_status->state--;                \
3564   } while (0)
3565
3566 /* Store a composed char or a component char C in charbuf, and update
3567    cmp_status.

        C is appended through `charbuf' and cmp_status->length is
        incremented.  The char is counted in cmp_status->nchars when the
        state is COMPOSING_CHAR, and in cmp_status->ncomps otherwise.

        The state is then incremented if the method is
        COMPOSITION_WITH_RULE, or if the method is
        COMPOSITION_WITH_RULE_ALTCHARS and the state is
        COMPOSING_COMPONENT_CHAR.  `charbuf' and `cmp_status' are
        variables of the expansion context.

        NOTE(review): the increment presumably selects the matching
        "rule expected" state, relying on the order of the state
        constants -- confirm against their definition.  */
3568
3569 #define STORE_COMPOSITION_CHAR(c)                                       \
3570   do {                                                                  \
3571     *charbuf++ = (c);                                                   \
3572     cmp_status->length++;                                               \
3573     if (cmp_status->state == COMPOSING_CHAR)                            \
3574       cmp_status->nchars++;                                             \
3575     else                                                                \
3576       cmp_status->ncomps++;                                             \
3577     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3578         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3579             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3580       cmp_status->state++;                                              \
3581   } while (0)
3582
3583
3584 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3585
3586 static void
3587 decode_coding_iso_2022 (coding)
3588      struct coding_system *coding;
3589 {
3590   const unsigned char *src = coding->source + coding->consumed;
3591   const unsigned char *src_end = coding->source + coding->src_bytes;
3592   const unsigned char *src_base;
3593   int *charbuf = coding->charbuf + coding->charbuf_used;
3594   /* We may produce two annotations (charset and composition) in one
3595      loop and one more charset annotation at the end.  */
3596   int *charbuf_end
3597     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3598   int consumed_chars = 0, consumed_chars_base;
3599   int multibytep = coding->src_multibyte;
3600   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3601   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3602   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3603   int charset_id_2, charset_id_3;
3604   struct charset *charset;
3605   int c;
3606   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3607   Lisp_Object attrs, charset_list;
3608   int char_offset = coding->produced_char;
3609   int last_offset = char_offset;
3610   int last_id = charset_ascii;
3611   int eol_crlf =
3612     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3613   int byte_after_cr = -1;
3614   int i;
3615
3616   CODING_GET_INFO (coding, attrs, charset_list);
3617   setup_iso_safe_charsets (attrs);
3618   /* Charset list may have been changed.  */
3619   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3620   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3621
3622   if (cmp_status->state != COMPOSING_NO)
3623     {
3624       for (i = 0; i < cmp_status->length; i++)
3625         *charbuf++ = cmp_status->carryover[i];
3626       coding->annotated = 1;
3627     }
3628
3629   while (1)
3630     {
3631       int c1, c2, c3;
3632
3633       src_base = src;
3634       consumed_chars_base = consumed_chars;
3635
3636       if (charbuf >= charbuf_end)
3637         {
3638           if (byte_after_cr >= 0)
3639             src_base--;
3640           break;
3641         }
3642
3643       if (byte_after_cr >= 0)
3644         c1 = byte_after_cr, byte_after_cr = -1;
3645       else
3646         ONE_MORE_BYTE (c1);
3647       if (c1 < 0)
3648         goto invalid_code;
3649
3650       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3651         {
3652           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3653           char_offset++;
3654           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3655           continue;
3656         }
3657
3658       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3659         {
3660           if (c1 == ISO_CODE_ESC)
3661             {
3662               if (src + 1 >= src_end)
3663                 goto no_more_source;
3664               *charbuf++ = ISO_CODE_ESC;
3665               char_offset++;
3666               if (src[0] == '%' && src[1] == '@')
3667                 {
3668                   src += 2;
3669                   consumed_chars += 2;
3670                   char_offset += 2;
3671                   /* We are sure charbuf can contain two more chars. */
3672                   *charbuf++ = '%';
3673                   *charbuf++ = '@';
3674                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3675                 }
3676             }
3677           else
3678             {
3679               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3680               char_offset++;
3681             }
3682           continue;
3683         }
3684
3685       if ((cmp_status->state == COMPOSING_RULE
3686            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3687           && c1 != ISO_CODE_ESC)
3688         {
3689           int rule, nbytes;
3690
3691           DECODE_COMPOSITION_RULE (rule, nbytes);
3692           if (rule < 0)
3693             goto invalid_code;
3694           STORE_COMPOSITION_RULE (rule);
3695           continue;
3696         }
3697
3698       /* We produce at most one character.  */
3699       switch (iso_code_class [c1])
3700         {
3701         case ISO_0x20_or_0x7F:
3702           if (charset_id_0 < 0
3703               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3704             /* This is SPACE or DEL.  */
3705             charset = CHARSET_FROM_ID (charset_ascii);
3706           else
3707             charset = CHARSET_FROM_ID (charset_id_0);
3708           break;
3709
3710         case ISO_graphic_plane_0:
3711           if (charset_id_0 < 0)
3712             charset = CHARSET_FROM_ID (charset_ascii);
3713           else
3714             charset = CHARSET_FROM_ID (charset_id_0);
3715           break;
3716
3717         case ISO_0xA0_or_0xFF:
3718           if (charset_id_1 < 0
3719               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3720               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3721             goto invalid_code;
3722           /* This is a graphic character, we fall down ... */
3723
3724         case ISO_graphic_plane_1:
3725           if (charset_id_1 < 0)
3726             goto invalid_code;
3727           charset = CHARSET_FROM_ID (charset_id_1);
3728           break;
3729
3730         case ISO_control_0:
3731           if (eol_crlf && c1 == '\r')
3732             ONE_MORE_BYTE (byte_after_cr);
3733           MAYBE_FINISH_COMPOSITION ();
3734           charset = CHARSET_FROM_ID (charset_ascii);
3735           break;
3736
3737         case ISO_control_1:
3738           goto invalid_code;
3739
3740         case ISO_shift_out:
3741           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3742               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3743             goto invalid_code;
3744           CODING_ISO_INVOCATION (coding, 0) = 1;
3745           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3746           continue;
3747
3748         case ISO_shift_in:
3749           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3750             goto invalid_code;
3751           CODING_ISO_INVOCATION (coding, 0) = 0;
3752           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3753           continue;
3754
3755         case ISO_single_shift_2_7:
3756           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3757             goto invalid_code;
3758         case ISO_single_shift_2:
3759           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3760             goto invalid_code;
3761           /* SS2 is handled as an escape sequence of ESC 'N' */
3762           c1 = 'N';
3763           goto label_escape_sequence;
3764
3765         case ISO_single_shift_3:
3766           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3767             goto invalid_code;
3768           /* SS3 is handled as an escape sequence of ESC 'O' */
3769           c1 = 'O';
3770           goto label_escape_sequence;
3771
3772         case ISO_control_sequence_introducer:
3773           /* CSI is handled as an escape sequence of ESC '[' ...  */
3774           c1 = '[';
3775           goto label_escape_sequence;
3776
3777         case ISO_escape:
3778           ONE_MORE_BYTE (c1);
3779         label_escape_sequence:
3780           /* Escape sequences handled here are invocation,
3781              designation, direction specification, and character
3782              composition specification.  */
3783           switch (c1)
3784             {
3785             case '&':           /* revision of following character set */
3786               ONE_MORE_BYTE (c1);
3787               if (!(c1 >= '@' && c1 <= '~'))
3788                 goto invalid_code;
3789               ONE_MORE_BYTE (c1);
3790               if (c1 != ISO_CODE_ESC)
3791                 goto invalid_code;
3792               ONE_MORE_BYTE (c1);
3793               goto label_escape_sequence;
3794
3795             case '$':           /* designation of 2-byte character set */
3796               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3797                 goto invalid_code;
3798               {
3799                 int reg, chars96;
3800
3801                 ONE_MORE_BYTE (c1);
3802                 if (c1 >= '@' && c1 <= 'B')
3803                   {     /* designation of JISX0208.1978, GB2312.1980,
3804                            or JISX0208.1980 */
3805                     reg = 0, chars96 = 0;
3806                   }
3807                 else if (c1 >= 0x28 && c1 <= 0x2B)
3808                   { /* designation of DIMENSION2_CHARS94 character set */
3809                     reg = c1 - 0x28, chars96 = 0;
3810                     ONE_MORE_BYTE (c1);
3811                   }
3812                 else if (c1 >= 0x2C && c1 <= 0x2F)
3813                   { /* designation of DIMENSION2_CHARS96 character set */
3814                     reg = c1 - 0x2C, chars96 = 1;
3815                     ONE_MORE_BYTE (c1);
3816                   }
3817                 else
3818                   goto invalid_code;
3819                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3820                 /* We must update these variables now.  */
3821                 if (reg == 0)
3822                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3823                 else if (reg == 1)
3824                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3825                 if (chars96 < 0)
3826                   goto invalid_code;
3827               }
3828               continue;
3829
3830             case 'n':           /* invocation of locking-shift-2 */
3831               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3832                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3833                 goto invalid_code;
3834               CODING_ISO_INVOCATION (coding, 0) = 2;
3835               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3836               continue;
3837
3838             case 'o':           /* invocation of locking-shift-3 */
3839               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3840                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3841                 goto invalid_code;
3842               CODING_ISO_INVOCATION (coding, 0) = 3;
3843               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3844               continue;
3845
3846             case 'N':           /* invocation of single-shift-2 */
3847               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3848                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3849                 goto invalid_code;
3850               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3851               if (charset_id_2 < 0)
3852                 charset = CHARSET_FROM_ID (charset_ascii);
3853               else
3854                 charset = CHARSET_FROM_ID (charset_id_2);
3855               ONE_MORE_BYTE (c1);
3856               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3857                 goto invalid_code;
3858               break;
3859
3860             case 'O':           /* invocation of single-shift-3 */
3861               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3862                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3863                 goto invalid_code;
3864               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3865               if (charset_id_3 < 0)
3866                 charset = CHARSET_FROM_ID (charset_ascii);
3867               else
3868                 charset = CHARSET_FROM_ID (charset_id_3);
3869               ONE_MORE_BYTE (c1);
3870               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3871                 goto invalid_code;
3872               break;
3873
3874             case '0': case '2': case '3': case '4': /* start composition */
3875               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3876                 goto invalid_code;
3877               if (last_id != charset_ascii)
3878                 {
3879                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3880                   last_id = charset_ascii;
3881                   last_offset = char_offset;
3882                 }
3883               DECODE_COMPOSITION_START (c1);
3884               continue;
3885
3886             case '1':           /* end composition */
3887               if (cmp_status->state == COMPOSING_NO)
3888                 goto invalid_code;
3889               DECODE_COMPOSITION_END ();
3890               continue;
3891
3892             case '[':           /* specification of direction */
3893               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3894                 goto invalid_code;
3895               /* For the moment, nested direction is not supported.
3896                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3897                  left-to-right, and nozero means right-to-left.  */
3898               ONE_MORE_BYTE (c1);
3899               switch (c1)
3900                 {
3901                 case ']':       /* end of the current direction */
3902                   coding->mode &= ~CODING_MODE_DIRECTION;
3903
3904                 case '0':       /* end of the current direction */
3905                 case '1':       /* start of left-to-right direction */
3906                   ONE_MORE_BYTE (c1);
3907                   if (c1 == ']')
3908                     coding->mode &= ~CODING_MODE_DIRECTION;
3909                   else
3910                     goto invalid_code;
3911                   break;
3912
3913                 case '2':       /* start of right-to-left direction */
3914                   ONE_MORE_BYTE (c1);
3915                   if (c1 == ']')
3916                     coding->mode |= CODING_MODE_DIRECTION;
3917                   else
3918                     goto invalid_code;
3919                   break;
3920
3921                 default:
3922                   goto invalid_code;
3923                 }
3924               continue;
3925
3926             case '%':
3927               ONE_MORE_BYTE (c1);
3928               if (c1 == '/')
3929                 {
3930                   /* CTEXT extended segment:
3931                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3932                      We keep these bytes as is for the moment.
3933                      They may be decoded by post-read-conversion.  */
3934                   int dim, M, L;
3935                   int size;
3936
3937                   ONE_MORE_BYTE (dim);
3938                   if (dim < 0 || dim > 4)
3939                     goto invalid_code;
3940                   ONE_MORE_BYTE (M);
3941                   if (M < 128)
3942                     goto invalid_code;
3943                   ONE_MORE_BYTE (L);
3944                   if (L < 128)
3945                     goto invalid_code;
3946                   size = ((M - 128) * 128) + (L - 128);
3947                   if (charbuf + 6 > charbuf_end)
3948                     goto break_loop;
3949                   *charbuf++ = ISO_CODE_ESC;
3950                   *charbuf++ = '%';
3951                   *charbuf++ = '/';
3952                   *charbuf++ = dim;
3953                   *charbuf++ = BYTE8_TO_CHAR (M);
3954                   *charbuf++ = BYTE8_TO_CHAR (L);
3955                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3956                 }
3957               else if (c1 == 'G')
3958                 {
3959                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3960                      ESC % G --UTF-8-BYTES-- ESC % @
3961                      We keep these bytes as is for the moment.
3962                      They may be decoded by post-read-conversion.  */
3963                   if (charbuf + 3 > charbuf_end)
3964                     goto break_loop;
3965                   *charbuf++ = ISO_CODE_ESC;
3966                   *charbuf++ = '%';
3967                   *charbuf++ = 'G';
3968                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3969                 }
3970               else
3971                 goto invalid_code;
3972               continue;
3973               break;
3974
3975             default:
3976               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3977                 goto invalid_code;
3978               {
3979                 int reg, chars96;
3980
3981                 if (c1 >= 0x28 && c1 <= 0x2B)
3982                   { /* designation of DIMENSION1_CHARS94 character set */
3983                     reg = c1 - 0x28, chars96 = 0;
3984                     ONE_MORE_BYTE (c1);
3985                   }
3986                 else if (c1 >= 0x2C && c1 <= 0x2F)
3987                   { /* designation of DIMENSION1_CHARS96 character set */
3988                     reg = c1 - 0x2C, chars96 = 1;
3989                     ONE_MORE_BYTE (c1);
3990                   }
3991                 else
3992                   goto invalid_code;
3993                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3994                 /* We must update these variables now.  */
3995                 if (reg == 0)
3996                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3997                 else if (reg == 1)
3998                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3999                 if (chars96 < 0)
4000                   goto invalid_code;
4001               }
4002               continue;
4003             }
4004         }
4005
4006       if (cmp_status->state == COMPOSING_NO
4007           && charset->id != charset_ascii
4008           && last_id != charset->id)
4009         {
4010           if (last_id != charset_ascii)
4011             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4012           last_id = charset->id;
4013           last_offset = char_offset;
4014         }
4015
4016       /* Now we know CHARSET and 1st position code C1 of a character.
4017          Produce a decoded character while getting 2nd and 3rd
4018          position codes C2, C3 if necessary.  */
4019       if (CHARSET_DIMENSION (charset) > 1)
4020         {
4021           ONE_MORE_BYTE (c2);
4022           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
4023               || ((c1 & 0x80) != (c2 & 0x80)))
4024             /* C2 is not in a valid range.  */
4025             goto invalid_code;
4026           if (CHARSET_DIMENSION (charset) == 2)
4027             c1 = (c1 << 8) | c2;
4028           else
4029             {
4030               ONE_MORE_BYTE (c3);
4031               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4032                   || ((c1 & 0x80) != (c3 & 0x80)))
4033                 /* C3 is not in a valid range.  */
4034                 goto invalid_code;
4035               c1 = (c1 << 16) | (c2 << 8) | c2;
4036             }
4037         }
4038       c1 &= 0x7F7F7F;
4039       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4040       if (c < 0)
4041         {
4042           MAYBE_FINISH_COMPOSITION ();
4043           for (; src_base < src; src_base++, char_offset++)
4044             {
4045               if (ASCII_BYTE_P (*src_base))
4046                 *charbuf++ = *src_base;
4047               else
4048                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4049             }
4050         }
4051       else if (cmp_status->state == COMPOSING_NO)
4052         {
4053           *charbuf++ = c;
4054           char_offset++;
4055         }
4056       else if ((cmp_status->state == COMPOSING_CHAR
4057                 ? cmp_status->nchars
4058                 : cmp_status->ncomps)
4059                >= MAX_COMPOSITION_COMPONENTS)
4060         {
4061           /* Too long composition.  */
4062           MAYBE_FINISH_COMPOSITION ();
4063           *charbuf++ = c;
4064           char_offset++;
4065         }
4066       else
4067         STORE_COMPOSITION_CHAR (c);
4068       continue;
4069
4070     invalid_code:
4071       MAYBE_FINISH_COMPOSITION ();
4072       src = src_base;
4073       consumed_chars = consumed_chars_base;
4074       ONE_MORE_BYTE (c);
4075       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4076       char_offset++;
4077       coding->errors++;
4078       continue;
4079
4080     break_loop:
4081       break;
4082     }
4083
4084  no_more_source:
4085   if (cmp_status->state != COMPOSING_NO)
4086     {
4087       if (coding->mode & CODING_MODE_LAST_BLOCK)
4088         MAYBE_FINISH_COMPOSITION ();
4089       else
4090         {
4091           charbuf -= cmp_status->length;
4092           for (i = 0; i < cmp_status->length; i++)
4093             cmp_status->carryover[i] = charbuf[i];
4094         }
4095     }
4096   else if (last_id != charset_ascii)
4097     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4098   coding->consumed_char += consumed_chars_base;
4099   coding->consumed = src_base - coding->source;
4100   coding->charbuf_used = charbuf - coding->charbuf;
4101 }
4102
4103
4104 /* ISO2022 encoding stuff.  */
4105
4106 /*
4107    It is not enough to say just "ISO2022" on encoding, we have to
4108    specify more details.  In Emacs, each coding system of ISO2022
4109    variant has the following specifications:
4110         1. Initial designation to G0 thru G3.
4111         2. Allows short-form designation?
4112         3. ASCII should be designated to G0 before control characters?
4113         4. ASCII should be designated to G0 at end of line?
4114         5. 7-bit environment or 8-bit environment?
4115         6. Use locking-shift?
4116         7. Use Single-shift?
4117    And the following two are only for Japanese:
4118         8. Use ASCII in place of JIS0201-1976-Roman?
4119         9. Use JISX0208-1983 in place of JISX0208-1978?
4120    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4121    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4122    details.
4123 */
4124
/* Emit the escape sequence that designates CHARSET to graphic
   register REG of CODING at DST (advancing DST), then record the
   designation in CODING.

   When CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative ISO revision, `ESC & <'@' + revision>' is emitted
   first.  For a 94-character CHARSET of dimension other than 1 that
   goes to register 0 and whose <final-char> is '@', 'A', or 'B', the
   intermediate character is omitted (short form) unless CODING has
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate characters for 94- and 96-character sets,           \
       indexed by graphic register number.  */                          \
    char *inter_94 = "()*+";                                            \
    char *inter_96 = ",-./";                                            \
    int iso_revision = -1;                                              \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      iso_revision = CHARSET_ISO_REVISION (charset);                    \
    if (iso_revision >= 0)                                              \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + iso_revision);                             \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (inter_96[reg]);                          \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || final_byte < '@' || final_byte > 'B')               \
          EMIT_ONE_ASCII_BYTE (inter_94[reg]);                          \
      }                                                                 \
    else if (CHARSET_ISO_CHARS_96 (charset))                            \
      EMIT_ONE_ASCII_BYTE (inter_96[reg]);                              \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (inter_94[reg]);                              \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    /* Remember what REG now holds.  */                                 \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4172
4173
4174 /* The following two macros produce codes (control character or escape
4175    sequence) for ISO2022 single-shift functions (single-shift-2 and
4176    single-shift-3).  */
4177
/* Emit single-shift-2: `ESC N' when `coding' has
   CODING_ISO_FLAG_SEVEN_BITS set, otherwise the single byte
   ISO_CODE_SS2.  Then mark `coding' as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do                                                                    \
    {                                                                   \
      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      else                                                              \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      CODING_ISO_SINGLE_SHIFTING (coding) = 1;                          \
    }                                                                   \
  while (0)
4186
4187
/* Emit single-shift-3: `ESC O' when `coding' has
   CODING_ISO_FLAG_SEVEN_BITS set, otherwise the single byte
   ISO_CODE_SS3.  Then mark `coding' as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do                                                                    \
    {                                                                   \
      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      else                                                              \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      CODING_ISO_SINGLE_SHIFTING (coding) = 1;                          \
    }                                                                   \
  while (0)
4196
4197
4198 /* The following four macros produce codes (control character or
4199    escape sequence) for ISO2022 locking-shift functions (shift-in,
4200    shift-out, locking-shift-2, and locking-shift-3).  */
4201
/* Emit shift-in (ISO_CODE_SI) and record that graphic plane 0 of
   `coding' now invokes graphic register 0.  */
#define ENCODE_SHIFT_IN                                 \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                \
      CODING_ISO_INVOCATION (coding, 0) = 0;            \
    }                                                   \
  while (0)
4207
4208
/* Emit shift-out (ISO_CODE_SO) and record that graphic plane 0 of
   `coding' now invokes graphic register 1.  */
#define ENCODE_SHIFT_OUT                                \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                \
      CODING_ISO_INVOCATION (coding, 0) = 1;            \
    }                                                   \
  while (0)
4214
4215
/* Emit locking-shift-2 (`ESC n') and record that graphic plane 0 of
   `coding' now invokes graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
4221
4222
/* Emit locking-shift-3 (`ESC o') and record that graphic plane 0 of
   `coding' now invokes graphic register 3.  The final byte must be
   'o': `ESC n' is locking-shift-2, and the ISO-2022 decoder in this
   file recognizes `ESC o' as the invocation of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4228
4229
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when `coding' has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by JISX0201-Roman.  C1 may be any integer expression; it is
   parenthesized wherever it is used, as in the DIMENSION2 variant.
   `coding', `dst' and `produced_chars' must be in scope at the
   expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift is pending: it applies to this one            \
           character only, so clear the state afterwards.  */           \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* Invoked to graphic plane 0: emit with the 8th bit clear.  */ \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* Invoked to graphic plane 1: emit with the 8th bit set.  */   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4272
4273
/* Emit the two position codes C1 and C2 of a DIMENSION2 character of
   CHARSET, first emitting any designation or invocation sequence
   that is still missing.

   CHARSET must be an lvalue: when `coding' has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by JISX0208.1978.  `coding', `dst' and `produced_chars'
   must be in scope at the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A pending single-shift covers exactly this character.  */    \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        /* Invoked to graphic plane 0: 8th bits clear.  */              \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        /* Invoked to graphic plane 1: 8th bits set.  */                \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Emit the    \
       designation and/or invocation it needs, then go round the loop   \
       again to emit the character itself.  */                          \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4316
4317
/* Encode character C of CHARSET.  The code point obtained from
   ENCODE_CHAR is emitted as one byte when CHARSET is of dimension 1;
   otherwise it is split into its high byte (code >> 8) and low byte
   (code & 0xFF) and emitted as a two-byte character.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset), (c));                               \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                        \
                                         code >> 8, code & 0xFF);          \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4327
4328
4329 /* Produce designation and invocation codes at a place pointed by DST
4330    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4331    Return new DST.  */
4332
4333 unsigned char *
4334 encode_invocation_designation (charset, coding, dst, p_nchars)
4335      struct charset *charset;
4336      struct coding_system *coding;
4337      unsigned char *dst;
4338      int *p_nchars;
4339 {
4340   int multibytep = coding->dst_multibyte;
4341   int produced_chars = *p_nchars;
4342   int reg;                      /* graphic register number */
4343   int id = CHARSET_ID (charset);
4344
4345   /* At first, check designations.  */
4346   for (reg = 0; reg < 4; reg++)
4347     if (id == CODING_ISO_DESIGNATION (coding, reg))
4348       break;
4349
4350   if (reg >= 4)
4351     {
4352       /* CHARSET is not yet designated to any graphic registers.  */
4353       /* At first check the requested designation.  */
4354       reg = CODING_ISO_REQUEST (coding, id);
4355       if (reg < 0)
4356         /* Since CHARSET requests no special designation, designate it
4357            to graphic register 0.  */
4358         reg = 0;
4359
4360       ENCODE_DESIGNATION (charset, reg, coding);
4361     }
4362
4363   if (CODING_ISO_INVOCATION (coding, 0) != reg
4364       && CODING_ISO_INVOCATION (coding, 1) != reg)
4365     {
4366       /* Since the graphic register REG is not invoked to any graphic
4367          planes, invoke it to graphic plane 0.  */
4368       switch (reg)
4369         {
4370         case 0:                 /* graphic register 0 */
4371           ENCODE_SHIFT_IN;
4372           break;
4373
4374         case 1:                 /* graphic register 1 */
4375           ENCODE_SHIFT_OUT;
4376           break;
4377
4378         case 2:                 /* graphic register 2 */
4379           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4380             ENCODE_SINGLE_SHIFT_2;
4381           else
4382             ENCODE_LOCKING_SHIFT_2;
4383           break;
4384
4385         case 3:                 /* graphic register 3 */
4386           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4387             ENCODE_SINGLE_SHIFT_3;
4388           else
4389             ENCODE_LOCKING_SHIFT_3;
4390           break;
4391         }
4392     }
4393
4394   *p_nchars = produced_chars;
4395   return dst;
4396 }
4397
4398 /* The following three macros produce codes for indicating direction
4399    of text.  */
/* Emit a control sequence introducer: the 7-bit form `ESC [' when
   `coding' has CODING_ISO_FLAG_SEVEN_BITS set, otherwise the single
   byte ISO_CODE_CSI.  CODING_ISO_FLAGS is a set of flag bits, so the
   bit must be tested with `&'; comparing the whole value with `=='
   would pick the 7-bit form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4407
4408
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by `2 ]'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; a trailing `(dst)' would be left
   behind after its `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
4414
4415
/* Emit the sequence that returns to left-to-right text: a control
   sequence introducer followed by `0 ]', which the ISO-2022 decoder
   in this file treats as the end of the current direction.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; a trailing `(dst)' would be left
   behind after its `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4421
4422
/* Bring `coding' back to its initial ISO-2022 state: shift in when
   graphic plane 0 does not currently invoke graphic register 0, then
   re-designate every graphic register whose initial charset is
   non-negative and differs from the one currently designated.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *initial_charset;                                    \
    int gr;                                                             \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    for (gr = 0; gr < 4; gr++)                                          \
      {                                                                 \
        /* Skip registers that have no initial charset or that          \
           already hold it.  */                                         \
        if (CODING_ISO_INITIAL (coding, gr) < 0                         \
            || (CODING_ISO_DESIGNATION (coding, gr)                     \
                == CODING_ISO_INITIAL (coding, gr)))                    \
          continue;                                                     \
                                                                        \
        initial_charset                                                 \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, gr));          \
        ENCODE_DESIGNATION (initial_charset, gr, coding);               \
      }                                                                 \
  } while (0)
4441
4442
4443 /* Produce designation sequences of charsets in the line started from
4444    SRC to a place pointed by DST, and return updated DST.
4445
4446    If the current block ends before any end-of-line, we may fail to
4447    find all the necessary designations.  */
4448
4449 static unsigned char *
4450 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4451      struct coding_system *coding;
4452      int *charbuf, *charbuf_end;
4453      unsigned char *dst;
4454 {
4455   struct charset *charset;
4456   /* Table of charsets to be designated to each graphic register.  */
4457   int r[4];
4458   int c, found = 0, reg;
4459   int produced_chars = 0;
4460   int multibytep = coding->dst_multibyte;
4461   Lisp_Object attrs;
4462   Lisp_Object charset_list;
4463
4464   attrs = CODING_ID_ATTRS (coding->id);
4465   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4466   if (EQ (charset_list, Qiso_2022))
4467     charset_list = Viso_2022_charset_list;
4468
4469   for (reg = 0; reg < 4; reg++)
4470     r[reg] = -1;
4471
4472   while (found < 4)
4473     {
4474       int id;
4475
4476       c = *charbuf++;
4477       if (c == '\n')
4478         break;
4479       charset = char_charset (c, charset_list, NULL);
4480       id = CHARSET_ID (charset);
4481       reg = CODING_ISO_REQUEST (coding, id);
4482       if (reg >= 0 && r[reg] < 0)
4483         {
4484           found++;
4485           r[reg] = id;
4486         }
4487     }
4488
4489   if (found)
4490     {
4491       for (reg = 0; reg < 4; reg++)
4492         if (r[reg] >= 0
4493             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4494           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4495     }
4496
4497   return dst;
4498 }
4499
4500 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the characters in CODING->charbuf following ISO-2022
        conventions, appending the bytes to CODING->destination starting
        at offset CODING->produced.  When the loop runs to completion,
        record CODING_RESULT_SUCCESS, save the beginning-of-line state,
        update CODING->produced and CODING->produced_char, and return 0.  */
4501
4502 static int
4503 encode_coding_iso_2022 (coding)
4504      struct coding_system *coding;
4505 {
4506   int multibytep = coding->dst_multibyte;
4507   int *charbuf = coding->charbuf;
4508   int *charbuf_end = charbuf + coding->charbuf_used;
4509   unsigned char *dst = coding->destination + coding->produced;
4510   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each element.  */
4511   int safe_room = 16;
       /* Nonzero if designation sequences must be produced before the
          next character, i.e. the coding system designates at
          beginning of line and the saved state says we are at one.  */
4512   int bol_designation
4513     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4514        && CODING_ISO_BOL (coding));
4515   int produced_chars = 0;
4516   Lisp_Object attrs, eol_type, charset_list;
4517   int ascii_compatible;
4518   int c;
       /* Charset requested by the latest charset annotation, or -1.  */
4519   int preferred_charset_id = -1;
4520
4521   CODING_GET_INFO (coding, attrs, charset_list);
4522   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* An EOL type that is still a vector is treated as Unix here.  */
4523   if (VECTORP (eol_type))
4524     eol_type = Qunix;
4525
4526   setup_iso_safe_charsets (attrs);
4527   /* Charset list may have been changed.  */
4528   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4529   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4530
4531   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4532
4533   while (charbuf < charbuf_end)
4534     {
4535       ASSURE_DESTINATION (safe_room);
4536
4537       if (bol_designation)
4538         {
4539           unsigned char *dst_prev = dst;
4540
4541           /* We have to produce designation sequences if any now.  */
4542           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4543           bol_designation = 0;
4544           /* We are sure that designation sequences are all ASCII bytes.  */
4545           produced_chars += dst - dst_prev;
4546         }
4547
4548       c = *charbuf++;
4549
4550       if (c < 0)
4551         {
4552           /* Handle an annotation.  CHARBUF now points at the element
                  following C, which selects the annotation kind.  */
4553           switch (*charbuf)
4554             {
4555             case CODING_ANNOTATE_COMPOSITION_MASK:
4556               /* Not yet implemented.  */
4557               break;
4558             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Honor the annotated charset only if it is a member
                      of this coding system's charset list.  */
4559               preferred_charset_id = charbuf[2];
4560               if (preferred_charset_id >= 0
4561                   && NILP (Fmemq (make_number (preferred_charset_id),
4562                                   charset_list)))
4563                 preferred_charset_id = -1;
4564               break;
4565             default:
4566               abort ();
4567             }
               /* Skip the rest of the annotation: C itself has already
                  been consumed, so -C - 1 elements remain.  */
4568           charbuf += -c - 1;
4569           continue;
4570         }
4571
4572       /* Now encode the character C.  */
4573       if (c < 0x20 || c == 0x7F)
4574         {
               /* Control character.  A newline (or a CR when the EOL
                  type is Mac) ends the line and may reset the state.  */
4575           if (c == '\n'
4576               || (c == '\r' && EQ (eol_type, Qmac)))
4577             {
4578               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4579                 ENCODE_RESET_PLANE_AND_REGISTER ();
4580               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4581                 {
4582                   int i;
4583
                       /* Restore the initial designation of all four
                          graphic registers.  */
4584                   for (i = 0; i < 4; i++)
4585                     CODING_ISO_DESIGNATION (coding, i)
4586                       = CODING_ISO_INITIAL (coding, i);
4587                 }
                   /* The next character starts a line.  */
4588               bol_designation
4589                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4590             }
4591           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4592             ENCODE_RESET_PLANE_AND_REGISTER ();
4593           EMIT_ONE_ASCII_BYTE (c);
4594         }
4595       else if (ASCII_CHAR_P (c))
4596         {
4597           if (ascii_compatible)
4598             EMIT_ONE_ASCII_BYTE (c);
4599           else
4600             {
4601               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4602               ENCODE_ISO_CHARACTER (charset, c);
4603             }
4604         }
4605       else if (CHAR_BYTE8_P (c))
4606         {
               /* A raw 8-bit byte is emitted as that byte.  */
4607           c = CHAR_TO_BYTE8 (c);
4608           EMIT_ONE_BYTE (c);
4609         }
4610       else
4611         {
4612           struct charset *charset;
4613
               /* Prefer the annotated charset when it contains C,
                  otherwise search the charset list.  */
4614           if (preferred_charset_id >= 0)
4615             {
4616               charset = CHARSET_FROM_ID (preferred_charset_id);
4617               if (! CHAR_CHARSET_P (c, charset))
4618                 charset = char_charset (c, charset_list, NULL);
4619             }
4620           else
4621             charset = char_charset (c, charset_list, NULL);
4622           if (!charset)
4623             {
                   /* No charset was found for C: substitute either the
                      inhibit-substitution marker (safe-encoding mode) or
                      the coding system's default character.  */
4624               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4625                 {
4626                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4627                   charset = CHARSET_FROM_ID (charset_ascii);
4628                 }
4629               else
4630                 {
4631                   c = coding->default_char;
4632                   charset = char_charset (c, charset_list, NULL);
4633                 }
4634             }
4635           ENCODE_ISO_CHARACTER (charset, c);
4636         }
4637     }
4638
       /* At the end of the last block, reset the state if the coding
          system resets at end of line.  */
4639   if (coding->mode & CODING_MODE_LAST_BLOCK
4640       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4641     {
4642       ASSURE_DESTINATION (safe_room);
4643       ENCODE_RESET_PLANE_AND_REGISTER ();
4644     }
4645   record_conversion_result (coding, CODING_RESULT_SUCCESS);
       /* Remember for the next call whether we stopped at a line start.  */
4646   CODING_ISO_BOL (coding) = bol_designation;
4647   coding->produced_char += produced_chars;
4648   coding->produced = dst - coding->destination;
4649   return 0;
4650 }
4651
4652 \f
4653 /*** 8. SJIS and BIG5 handlers ***/
4654
4655 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4656    quite widely.  So, for the moment, Emacs supports them in the bare
4657    C code.  But, in the future, they may be supported only by CCL.  */
4658
4659 /* SJIS is a coding system encoding three character sets: ASCII, right
4660    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4661    as is.  A character of charset katakana-jisx0201 is encoded by
4662    "position-code + 0x80".  A character of charset japanese-jisx0208
4663    is encoded in two bytes, with its two position-codes divided and
4664    shifted so that they fit in the ranges below.
4665
4666    --- CODE RANGE of SJIS ---
4667    (character set)      (range)
4668    ASCII                0x00 .. 0x7F
4669    KATAKANA-JISX0201    0xA0 .. 0xDF
4670    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4671             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4672    -------------------------------
4673
4674 */
4675
4676 /* BIG5 is a coding system encoding two character sets: ASCII and
4677    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4678    character set and is encoded in two bytes.
4679
4680    --- CODE RANGE of BIG5 ---
4681    (character set)      (range)
4682    ASCII                0x00 .. 0x7F
4683    Big5 (1st byte)      0xA1 .. 0xFE
4684         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4685    --------------------------
4686
4687   */
4688
4689 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4690    Check if a text is encoded in SJIS.  If the source is exhausted
        without meeting a byte sequence that is invalid for SJIS, add the
        evidence seen so far (CATEGORY_MASK_SJIS if at least one
        non-ASCII SJIS sequence occurred, otherwise nothing) to
        DETECT_INFO->found and return 1.  Otherwise set
4691    CATEGORY_MASK_SJIS in DETECT_INFO->rejected and return 0.  */
4692
4693 static int
4694 detect_coding_sjis (coding, detect_info)
4695      struct coding_system *coding;
4696      struct coding_detection_info *detect_info;
4697 {
4698   const unsigned char *src = coding->source, *src_base;
4699   const unsigned char *src_end = coding->source + coding->src_bytes;
4700   int multibytep = coding->src_multibyte;
4701   int consumed_chars = 0;
4702   int found = 0;
4703   int c;
4704   Lisp_Object attrs, charset_list;
4705   int max_first_byte_of_2_byte_code;
4706
4707   CODING_GET_INFO (coding, attrs, charset_list);
       /* With more than three charsets in the list, lead bytes up to
          0xFC are accepted; otherwise only up to 0xEF.  */
4708   max_first_byte_of_2_byte_code
4709     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4710
4711   detect_info->checked |= CATEGORY_MASK_SJIS;
4712   /* A coding system of this category is always ASCII compatible.  */
4713   src += coding->head_ascii;
4714
4715   while (1)
4716     {
           /* SRC_BASE marks the start of the sequence being examined.
              ONE_MORE_BYTE presumably jumps to `no_more_source' when the
              source is exhausted (TODO confirm against the macro).  */
4717       src_base = src;
4718       ONE_MORE_BYTE (c);
4719       if (c < 0x80)
4720         continue;
4721       if ((c >= 0x81 && c <= 0x9F)
4722           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4723         {
               /* Lead byte of a two-byte code; validate the trail byte.  */
4724           ONE_MORE_BYTE (c);
4725           if (c < 0x40 || c == 0x7F || c > 0xFC)
4726             break;
4727           found = CATEGORY_MASK_SJIS;
4728         }
4729       else if (c >= 0xA0 && c < 0xE0)
             /* Single byte in the range 0xA0..0xDF.  */
4730         found = CATEGORY_MASK_SJIS;
4731       else
4732         break;
4733     }
       /* Reached only by `break': an invalid sequence was seen.  */
4734   detect_info->rejected |= CATEGORY_MASK_SJIS;
4735   return 0;
4736
4737  no_more_source:
       /* SRC_BASE < SRC means the source ended in the middle of a
          sequence; that is an error only in the last block.  */
4738   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4739     {
4740       detect_info->rejected |= CATEGORY_MASK_SJIS;
4741       return 0;
4742     }
4743   detect_info->found |= found;
4744   return 1;
4745 }
4746
4747 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4748    Check if a text is encoded in BIG5.  If the source is exhausted
        without meeting a byte sequence that is invalid for BIG5, add the
        evidence seen so far (CATEGORY_MASK_BIG5 if at least one two-byte
        sequence occurred, otherwise nothing) to DETECT_INFO->found and
4749    return 1.  Otherwise return 0.  */
4750
4751 static int
4752 detect_coding_big5 (coding, detect_info)
4753      struct coding_system *coding;
4754      struct coding_detection_info *detect_info;
4755 {
4756   const unsigned char *src = coding->source, *src_base;
4757   const unsigned char *src_end = coding->source + coding->src_bytes;
4758   int multibytep = coding->src_multibyte;
4759   int consumed_chars = 0;
4760   int found = 0;
4761   int c;
4762
4763   detect_info->checked |= CATEGORY_MASK_BIG5;
4764   /* A coding system of this category is always ASCII compatible.  */
4765   src += coding->head_ascii;
4766
4767   while (1)
4768     {
           /* SRC_BASE marks the start of the sequence being examined.
              ONE_MORE_BYTE presumably jumps to `no_more_source' when the
              source is exhausted (TODO confirm against the macro).  */
4769       src_base = src;
4770       ONE_MORE_BYTE (c);
4771       if (c < 0x80)
4772         continue;
4773       if (c >= 0xA1)
4774         {
               /* Lead byte; validate the trail byte.  */
4775           ONE_MORE_BYTE (c);
               /* NOTE(review): this path returns 0 without setting
                  CATEGORY_MASK_BIG5 in DETECT_INFO->rejected, unlike the
                  `break' paths here and in detect_coding_sjis -- confirm
                  whether that is intentional.  */
4776           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4777             return 0;
4778           found = CATEGORY_MASK_BIG5;
4779         }
4780       else
4781         break;
4782     }
       /* Reached only by `break': a byte in 0x80..0xA0 was seen where a
          lead byte was expected.  */
4783   detect_info->rejected |= CATEGORY_MASK_BIG5;
4784   return 0;
4785
4786  no_more_source:
       /* SRC_BASE < SRC means the source ended in the middle of a
          sequence; that is an error only in the last block.  */
4787   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4788     {
4789       detect_info->rejected |= CATEGORY_MASK_BIG5;
4790       return 0;
4791     }
4792   detect_info->found |= found;
4793   return 1;
4794 }
4795
4796 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4797    Decode SJIS text from CODING->source (starting at CODING->consumed)
        into CODING->charbuf, then update CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used.  The BIG5
        counterpart is decode_coding_big5.  */
4798
4799 static void
4800 decode_coding_sjis (coding)
4801      struct coding_system *coding;
4802 {
4803   const unsigned char *src = coding->source + coding->consumed;
4804   const unsigned char *src_end = coding->source + coding->src_bytes;
4805   const unsigned char *src_base;
4806   int *charbuf = coding->charbuf + coding->charbuf_used;
4807   /* We may produce one charset annotation in one loop and one more at
4808      the end.  */
4809   int *charbuf_end
4810     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4811   int consumed_chars = 0, consumed_chars_base;
4812   int multibytep = coding->src_multibyte;
4813   struct charset *charset_roman, *charset_kanji, *charset_kana;
4814   struct charset *charset_kanji2;
4815   Lisp_Object attrs, charset_list, val;
4816   int char_offset = coding->produced_char;
       /* LAST_ID is the charset of the current run of non-ASCII
          characters and LAST_OFFSET the character offset at which that
          run started.  */
4817   int last_offset = char_offset;
4818   int last_id = charset_ascii;
4819   int eol_crlf =
4820     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_CRLF, or -1 if none.  */
4821   int byte_after_cr = -1;
4822
4823   CODING_GET_INFO (coding, attrs, charset_list);
4824
       /* The charset list holds, in order, the roman, kana and kanji
          charsets, optionally followed by a second kanji charset.  */
4825   val = charset_list;
4826   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4827   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4828   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4829   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4830
4831   while (1)
4832     {
4833       int c, c1;
4834       struct charset *charset;
4835
           /* Remember where this character starts so that an invalid or
              incomplete sequence can be rolled back.  */
4836       src_base = src;
4837       consumed_chars_base = consumed_chars;
4838
4839       if (charbuf >= charbuf_end)
4840         {
               /* Buffer full: give back the byte read ahead after a CR
                  so that it is not reported as consumed.  */
4841           if (byte_after_cr >= 0)
4842             src_base--;
4843           break;
4844         }
4845
4846       if (byte_after_cr >= 0)
4847         c = byte_after_cr, byte_after_cr = -1;
4848       else
4849         ONE_MORE_BYTE (c);
4850       if (c < 0)
4851         goto invalid_code;
4852       if (c < 0x80)
4853         {
               /* Read the byte following a CR now.  If the source ends
                  here, ONE_MORE_BYTE presumably jumps to
                  `no_more_source' with SRC_BASE still at the CR, leaving
                  the CR unconsumed (TODO confirm against the macro).  */
4854           if (eol_crlf && c == '\r')
4855             ONE_MORE_BYTE (byte_after_cr);
4856           charset = charset_roman;
4857         }
4858       else if (c == 0x80 || c == 0xA0)
4859         goto invalid_code;
4860       else if (c >= 0xA1 && c <= 0xDF)
4861         {
4862           /* SJIS -> JISX0201-Kana */
4863           c &= 0x7F;
4864           charset = charset_kana;
4865         }
4866       else if (c <= 0xEF)
4867         {
4868           /* SJIS -> JISX0208 */
4869           ONE_MORE_BYTE (c1);
4870           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4871             goto invalid_code;
4872           c = (c << 8) | c1;
4873           SJIS_TO_JIS (c);
4874           charset = charset_kanji;
4875         }
4876       else if (c <= 0xFC && charset_kanji2)
4877         {
4878           /* SJIS -> JISX0213-2 */
4879           ONE_MORE_BYTE (c1);
4880           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4881             goto invalid_code;
4882           c = (c << 8) | c1;
4883           SJIS_TO_JIS2 (c);
4884           charset = charset_kanji2;
4885         }
4886       else
4887         goto invalid_code;
           /* On a change to a different non-ASCII charset, close the
              previous non-ASCII run (ADD_CHARSET_DATA presumably stores
              a charset annotation for it) and start a new one.  */
4888       if (charset->id != charset_ascii
4889           && last_id != charset->id)
4890         {
4891           if (last_id != charset_ascii)
4892             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4893           last_id = charset->id;
4894           last_offset = char_offset;
4895         }
4896       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4897       *charbuf++ = c;
4898       char_offset++;
4899       continue;
4900
4901     invalid_code:
           /* Rewind to the start of the bad sequence and store only its
              first byte, as a raw-byte character.  */
4902       src = src_base;
4903       consumed_chars = consumed_chars_base;
4904       ONE_MORE_BYTE (c);
4905       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4906       char_offset++;
4907       coding->errors++;
4908     }
4909
4910  no_more_source:
       /* Close the last non-ASCII run, then report progress up to the
          start of the last complete character.  */
4911   if (last_id != charset_ascii)
4912     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4913   coding->consumed_char += consumed_chars_base;
4914   coding->consumed = src_base - coding->source;
4915   coding->charbuf_used = charbuf - coding->charbuf;
4916 }
4917
     /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode BIG5 text from CODING->source (starting at CODING->consumed)
        into CODING->charbuf, then update CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used.  */

4918 static void
4919 decode_coding_big5 (coding)
4920      struct coding_system *coding;
4921 {
4922   const unsigned char *src = coding->source + coding->consumed;
4923   const unsigned char *src_end = coding->source + coding->src_bytes;
4924   const unsigned char *src_base;
4925   int *charbuf = coding->charbuf + coding->charbuf_used;
4926   /* We may produce one charset annotation in one loop and one more at
4927      the end.  */
4928   int *charbuf_end
4929     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4930   int consumed_chars = 0, consumed_chars_base;
4931   int multibytep = coding->src_multibyte;
4932   struct charset *charset_roman, *charset_big5;
4933   Lisp_Object attrs, charset_list, val;
4934   int char_offset = coding->produced_char;
       /* LAST_ID is the charset of the current run of non-ASCII
          characters and LAST_OFFSET the character offset at which that
          run started.  */
4935   int last_offset = char_offset;
4936   int last_id = charset_ascii;
4937   int eol_crlf =
4938     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_CRLF, or -1 if none.  */
4939   int byte_after_cr = -1;
4940
4941   CODING_GET_INFO (coding, attrs, charset_list);
       /* The charset list holds the roman charset followed by the Big5
          charset.  */
4942   val = charset_list;
4943   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4944   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4945
4946   while (1)
4947     {
4948       int c, c1;
4949       struct charset *charset;
4950
           /* Remember where this character starts so that an invalid or
              incomplete sequence can be rolled back.  */
4951       src_base = src;
4952       consumed_chars_base = consumed_chars;
4953
4954       if (charbuf >= charbuf_end)
4955         {
               /* Buffer full: give back the byte read ahead after a CR
                  so that it is not reported as consumed.  */
4956           if (byte_after_cr >= 0)
4957             src_base--;
4958           break;
4959         }
4960
4961       if (byte_after_cr >= 0)
4962         c = byte_after_cr, byte_after_cr = -1;
4963       else
4964         ONE_MORE_BYTE (c);
4965
4966       if (c < 0)
4967         goto invalid_code;
4968       if (c < 0x80)
4969         {
               /* Read the byte following a CR now.  If the source ends
                  here, ONE_MORE_BYTE presumably jumps to
                  `no_more_source' with SRC_BASE still at the CR, leaving
                  the CR unconsumed (TODO confirm against the macro).  */
4970           if (eol_crlf && c == '\r')
4971             ONE_MORE_BYTE (byte_after_cr);
4972           charset = charset_roman;
4973         }
4974       else
4975         {
4976           /* BIG5 -> Big5 */
4977           if (c < 0xA1 || c > 0xFE)
4978             goto invalid_code;
4979           ONE_MORE_BYTE (c1);
               /* The trail byte must be in 0x40..0x7E or 0xA1..0xFE.  */
4980           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4981             goto invalid_code;
4982           c = c << 8 | c1;
4983           charset = charset_big5;
4984         }
           /* On a change to a different non-ASCII charset, close the
              previous non-ASCII run (ADD_CHARSET_DATA presumably stores
              a charset annotation for it) and start a new one.  */
4985       if (charset->id != charset_ascii
4986           && last_id != charset->id)
4987         {
4988           if (last_id != charset_ascii)
4989             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4990           last_id = charset->id;
4991           last_offset = char_offset;
4992         }
4993       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4994       *charbuf++ = c;
4995       char_offset++;
4996       continue;
4997
4998     invalid_code:
           /* Rewind to the start of the bad sequence and store only its
              first byte, as a raw-byte character.  */
4999       src = src_base;
5000       consumed_chars = consumed_chars_base;
5001       ONE_MORE_BYTE (c);
5002       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
5003       char_offset++;
5004       coding->errors++;
5005     }
5006
5007  no_more_source:
       /* Close the last non-ASCII run, then report progress up to the
          start of the last complete character.  */
5008   if (last_id != charset_ascii)
5009     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5010   coding->consumed_char += consumed_chars_base;
5011   coding->consumed = src_base - coding->source;
5012   coding->charbuf_used = charbuf - coding->charbuf;
5013 }
5014
5015 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
5016    Encode the characters in CODING->charbuf as SJIS, appending the
5017    bytes to CODING->destination.  The charsets handled are the ones
5018    in the coding system's charset list: roman, kana, kanji and an
5019    optional second kanji charset.  A character belonging to none of
5020    them is replaced (see below).  When the loop runs to completion,
5021    record CODING_RESULT_SUCCESS and return 0.  */
5022
5023 static int
5024 encode_coding_sjis (coding)
5025      struct coding_system *coding;
5026 {
5027   int multibytep = coding->dst_multibyte;
5028   int *charbuf = coding->charbuf;
5029   int *charbuf_end = charbuf + coding->charbuf_used;
5030   unsigned char *dst = coding->destination + coding->produced;
5031   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each element.  */
5032   int safe_room = 4;
5033   int produced_chars = 0;
5034   Lisp_Object attrs, charset_list, val;
5035   int ascii_compatible;
       /* CHARSET_ROMAN is assigned below but not read again in this
          function.  */
5036   struct charset *charset_roman, *charset_kanji, *charset_kana;
5037   struct charset *charset_kanji2;
5038   int c;
5039
5040   CODING_GET_INFO (coding, attrs, charset_list);
5041   val = charset_list;
5042   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5043   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5044   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5045   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5046
5047   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5048
5049   while (charbuf < charbuf_end)
5050     {
5051       ASSURE_DESTINATION (safe_room);
5052       c = *charbuf++;
5053       /* Now encode the character C.  */
5054       if (ASCII_CHAR_P (c) && ascii_compatible)
5055         EMIT_ONE_ASCII_BYTE (c);
5056       else if (CHAR_BYTE8_P (c))
5057         {
               /* A raw 8-bit byte is emitted as that byte.  */
5058           c = CHAR_TO_BYTE8 (c);
5059           EMIT_ONE_BYTE (c);
5060         }
5061       else
5062         {
5063           unsigned code;
5064           struct charset *charset = char_charset (c, charset_list, &code);
5065
5066           if (!charset)
5067             {
                   /* No charset was found for C: substitute either the
                      inhibit-substitution marker (safe-encoding mode) or
                      the coding system's default character.  */
5068               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5069                 {
5070                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5071                   charset = CHARSET_FROM_ID (charset_ascii);
5072                 }
5073               else
5074                 {
5075                   c = coding->default_char;
5076                   charset = char_charset (c, charset_list, &code);
5077                 }
5078             }
5079           if (code == CHARSET_INVALID_CODE (charset))
5080             abort ();
5081           if (charset == charset_kanji)
5082             {
5083               int c1, c2;
5084               JIS_TO_SJIS (code);
5085               c1 = code >> 8, c2 = code & 0xFF;
5086               EMIT_TWO_BYTES (c1, c2);
5087             }
5088           else if (charset == charset_kana)
5089             EMIT_ONE_BYTE (code | 0x80);
5090           else if (charset_kanji2 && charset == charset_kanji2)
5091             {
5092               int c1, c2;
5093
                   /* Only codes whose high byte is one of the values
                      tested here are converted with JIS_TO_SJIS2; any
                      other code is emitted as a single 7-bit byte.  */
5094               c1 = code >> 8;
5095               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5096                   || c1 == 0x28
5097                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5098                 {
5099                   JIS_TO_SJIS2 (code);
5100                   c1 = code >> 8, c2 = code & 0xFF;
5101                   EMIT_TWO_BYTES (c1, c2);
5102                 }
5103               else
5104                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5105             }
5106           else
                 /* Any other charset: emit the low 7 bits of the code.  */
5107             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5108         }
5109     }
5110   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5111   coding->produced_char += produced_chars;
5112   coding->produced = dst - coding->destination;
5113   return 0;
5114 }
5115
     /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the characters in CODING->charbuf as BIG5, appending the
        bytes to CODING->destination.  When the loop runs to completion,
        record CODING_RESULT_SUCCESS and return 0.  */

5116 static int
5117 encode_coding_big5 (coding)
5118      struct coding_system *coding;
5119 {
5120   int multibytep = coding->dst_multibyte;
5121   int *charbuf = coding->charbuf;
5122   int *charbuf_end = charbuf + coding->charbuf_used;
5123   unsigned char *dst = coding->destination + coding->produced;
5124   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each element.  */
5125   int safe_room = 4;
5126   int produced_chars = 0;
5127   Lisp_Object attrs, charset_list, val;
5128   int ascii_compatible;
       /* CHARSET_ROMAN is assigned below but not read again in this
          function.  */
5129   struct charset *charset_roman, *charset_big5;
5130   int c;
5131
5132   CODING_GET_INFO (coding, attrs, charset_list);
5133   val = charset_list;
5134   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5135   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5136   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5137
5138   while (charbuf < charbuf_end)
5139     {
5140       ASSURE_DESTINATION (safe_room);
5141       c = *charbuf++;
5142       /* Now encode the character C.  */
5143       if (ASCII_CHAR_P (c) && ascii_compatible)
5144         EMIT_ONE_ASCII_BYTE (c);
5145       else if (CHAR_BYTE8_P (c))
5146         {
               /* A raw 8-bit byte is emitted as that byte.  */
5147           c = CHAR_TO_BYTE8 (c);
5148           EMIT_ONE_BYTE (c);
5149         }
5150       else
5151         {
5152           unsigned code;
5153           struct charset *charset = char_charset (c, charset_list, &code);
5154
5155           if (! charset)
5156             {
                   /* No charset was found for C: substitute either the
                      inhibit-substitution marker (safe-encoding mode) or
                      the coding system's default character.  */
5157               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5158                 {
5159                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5160                   charset = CHARSET_FROM_ID (charset_ascii);
5161                 }
5162               else
5163                 {
5164                   c = coding->default_char;
5165                   charset = char_charset (c, charset_list, &code);
5166                 }
5167             }
5168           if (code == CHARSET_INVALID_CODE (charset))
5169             abort ();
5170           if (charset == charset_big5)
5171             {
5172               int c1, c2;
5173
                   /* The two-byte code is emitted high byte first.  */
5174               c1 = code >> 8, c2 = code & 0xFF;
5175               EMIT_TWO_BYTES (c1, c2);
5176             }
5177           else
                 /* Any other charset: emit the low 7 bits of the code.  */
5178             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5179         }
5180     }
5181   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5182   coding->produced_char += produced_chars;
5183   coding->produced = dst - coding->destination;
5184   return 0;
5185 }
5186
5187 \f
5188 /*** 9. CCL handlers ***/
5189
5190 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5191    Check if a text is encoded in a coding system of which
5192    encoder/decoder are written in CCL program.  If every source byte
        is marked valid, add the evidence seen so far to
        DETECT_INFO->found and return 1.  Otherwise set CATEGORY_MASK_CCL
5193    in DETECT_INFO->rejected and return 0.  */
5194
5195 static int
5196 detect_coding_ccl (coding, detect_info)
5197      struct coding_system *coding;
5198      struct coding_detection_info *detect_info;
5199 {
5200   const unsigned char *src = coding->source, *src_base;
5201   const unsigned char *src_end = coding->source + coding->src_bytes;
5202   int multibytep = coding->src_multibyte;
5203   int consumed_chars = 0;
5204   int found = 0;
5205   unsigned char *valids;
       /* Saved now because CODING is redirected below.  */
5206   int head_ascii = coding->head_ascii;
5207   Lisp_Object attrs;
5208
5209   detect_info->checked |= CATEGORY_MASK_CCL;
5210
       /* From here on CODING refers to the coding system registered for
          the CCL category, not to the caller's object.  */
5211   coding = &coding_categories[coding_category_ccl];
5212   valids = CODING_CCL_VALIDS (coding);
5213   attrs = CODING_ID_ATTRS (coding->id);
       /* Skip the leading ASCII part only for an ASCII compatible
          coding system.  */
5214   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5215     src += head_ascii;
5216
5217   while (1)
5218     {
5219       int c;
5220
           /* ONE_MORE_BYTE presumably jumps to `no_more_source' when the
              source is exhausted (TODO confirm against the macro).  */
5221       src_base = src;
5222       ONE_MORE_BYTE (c);
           /* VALIDS[C] == 0 marks an invalid byte; a value greater than
              1 counts as positive evidence for this category.  */
5223       if (c < 0 || ! valids[c])
5224         break;
5225       if ((valids[c] > 1))
5226         found = CATEGORY_MASK_CCL;
5227     }
5228   detect_info->rejected |= CATEGORY_MASK_CCL;
5229   return 0;
5230
5231  no_more_source:
5232   detect_info->found |= found;
5233   return 1;
5234 }
5235
     /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode the source of CODING by feeding it, at most 1024
        characters at a time, to the CCL program of the coding system.
        The decoded characters are appended to CODING->charbuf; the
        conversion result is derived from the final CCL status.  */

5236 static void
5237 decode_coding_ccl (coding)
5238      struct coding_system *coding;
5239 {
       struct ccl_program *ccl = &coding->spec.ccl->ccl;
       const unsigned char *src = coding->source + coding->consumed;
       const unsigned char *src_end = coding->source + coding->src_bytes;
       /* Read position within the current chunk of source.  */
       const unsigned char *scan;
       int *charbuf = coding->charbuf + coding->charbuf_used;
       int *charbuf_end = coding->charbuf + coding->charbuf_size;
       int multibytep = coding->src_multibyte;
       int consumed_chars = 0;
       /* Number of elements stored in SOURCE_CHARBUF.  */
       int n_src;
       /* Input handed to one call of ccl_driver.  */
       int source_charbuf[1024];
       /* For multibyte source, SOURCE_BYTEIDX[K] is the byte offset from
          SRC of the Kth character; one extra slot holds the offset just
          after the last character.  */
       int source_byteidx[1025];
       Lisp_Object attrs, charset_list;

       CODING_GET_INFO (coding, attrs, charset_list);

       do
         {
           /* Collect the next chunk of source characters (or bytes).  */
           scan = src;
           n_src = 0;
           while (n_src < 1024 && scan < src_end)
             {
               if (multibytep)
                 {
                   source_byteidx[n_src] = scan - src;
                   source_charbuf[n_src] = STRING_CHAR_ADVANCE (scan);
                 }
               else
                 source_charbuf[n_src] = *scan++;
               n_src++;
             }
           if (multibytep)
             source_byteidx[n_src] = scan - src;

           /* Tell the CCL program when it is seeing the very end of the
              input.  */
           if (scan == src_end && (coding->mode & CODING_MODE_LAST_BLOCK))
             ccl->last_block = 1;

           ccl_driver (ccl, source_charbuf, charbuf, n_src,
                       charbuf_end - charbuf, charset_list);

           /* Advance by what the CCL program actually produced and
              consumed.  */
           charbuf += ccl->produced;
           src += multibytep ? source_byteidx[ccl->consumed] : ccl->consumed;
           consumed_chars += ccl->consumed;
         }
       /* Go around again only if source remains and the program merely
          ran out of the chunk it was given.  */
       while (scan != src_end && ccl->status == CCL_STAT_SUSPEND_BY_SRC);

       if (ccl->status == CCL_STAT_SUSPEND_BY_SRC)
         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
       else if (ccl->status == CCL_STAT_SUSPEND_BY_DST)
         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
       else if (ccl->status == CCL_STAT_QUIT
                || ccl->status == CCL_STAT_INVALID_CMD)
         record_conversion_result (coding, CODING_RESULT_INTERRUPT);
       else
         record_conversion_result (coding, CODING_RESULT_SUCCESS);

5301   coding->consumed_char += consumed_chars;
5302   coding->consumed = src - coding->source;
5303   coding->charbuf_used = charbuf - coding->charbuf;
5304 }
5305
     /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the characters in CODING->charbuf by running the CCL
        program of the coding system, appending the low 8 bits of each
        value it produces to CODING->destination.  The conversion result
        is derived from the final CCL status.  Return 0 when the end of
        the function is reached.  */

5306 static int
5307 encode_coding_ccl (coding)
5308      struct coding_system *coding;
5309 {
5310   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5311   int multibytep = coding->dst_multibyte;
5312   int *charbuf = coding->charbuf;
5313   int *charbuf_end = charbuf + coding->charbuf_used;
5314   unsigned char *dst = coding->destination + coding->produced;
5315   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Output of one call of ccl_driver.  */
5316   int destination_charbuf[1024];
5317   int i, produced_chars = 0;
5318   Lisp_Object attrs, charset_list;
5319
5320   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters have been
          consumed and this is the last block.  */
5321   if (coding->consumed_char == coding->src_chars
5322       && coding->mode & CODING_MODE_LAST_BLOCK)
5323     ccl->last_block = 1;
5324
5325   while (charbuf < charbuf_end)
5326     {
5327       ccl_driver (ccl, charbuf, destination_charbuf,
5328                   charbuf_end - charbuf, 1024, charset_list);
5329       if (multibytep)
5330         {
               /* Twice the produced count is requested; presumably a
                  byte may take two bytes in a multibyte destination
                  (TODO confirm against EMIT_ONE_BYTE).  */
5331           ASSURE_DESTINATION (ccl->produced * 2);
5332           for (i = 0; i < ccl->produced; i++)
5333             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5334         }
5335       else
5336         {
5337           ASSURE_DESTINATION (ccl->produced);
5338           for (i = 0; i < ccl->produced; i++)
5339             *dst++ = destination_charbuf[i] & 0xFF;
5340           produced_chars += ccl->produced;
5341         }
5342       charbuf += ccl->consumed;
           /* Stop early only when the program quit or hit an invalid
              command.  */
5343       if (ccl->status == CCL_STAT_QUIT
5344           || ccl->status == CCL_STAT_INVALID_CMD)
5345         break;
5346     }
5347
       /* Map the final CCL status to a conversion result.  */
5348   switch (ccl->status)
5349     {
5350     case CCL_STAT_SUSPEND_BY_SRC:
5351       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5352       break;
5353     case CCL_STAT_SUSPEND_BY_DST:
5354       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5355       break;
5356     case CCL_STAT_QUIT:
5357     case CCL_STAT_INVALID_CMD:
5358       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5359       break;
5360     default:
5361       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5362       break;
5363     }
5364
5365   coding->produced_char += produced_chars;
5366   coding->produced = dst - coding->destination;
5367   return 0;
5368 }
5369
5370
5371 \f
5372 /*** 10, 11. no-conversion handlers ***/
5373
5374 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Raw text needs no character decoding: mark the characters as
        staying at the source (CODING->chars_at_source) and report the
        whole source as consumed.  The one exception is a source that
        ends with a CR while CRLF end-of-line conversion is in effect;
        that CR is left unconsumed and CODING_RESULT_INSUFFICIENT_SRC is
        recorded.  */
5375
5376 static void
5377 decode_coding_raw_text (coding)
5378      struct coding_system *coding;
5379 {
5380   int eol_crlf =
5381     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5382
5383   coding->chars_at_source = 1;
5384   coding->consumed_char = coding->src_chars;
5385   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only if there is one; with an
          empty source, index SRC_BYTES - 1 would be outside the source
          text.  */
5386   if (eol_crlf
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5387     {
5388       coding->consumed_char--;
5389       coding->consumed--;
5390       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5391     }
5392   else
5393     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5394 }
5395
     /* Encode the CODING->charbuf_used elements of CODING->charbuf without
        character-code conversion and append the bytes to
        CODING->destination, starting at offset CODING->produced.

        Four cases are distinguished by CODING->dst_multibyte and
        CODING->src_multibyte; see the comments in the body.  On return
        CODING_RESULT_SUCCESS has been recorded and CODING->produced and
        CODING->produced_char are updated.  Always returns 0.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
        elsewhere in this file and presumably refer to the locals
        `coding', `dst', `dst_end', `charbuf', `charbuf_end',
        `multibytep' and `produced_chars' by name -- confirm before
        renaming any of them.  */
5396 static int
5397 encode_coding_raw_text (coding)
5398      struct coding_system *coding;
5399 {
5400   int multibytep = coding->dst_multibyte;
5401   int *charbuf = coding->charbuf;
5402   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5403   unsigned char *dst = coding->destination + coding->produced;
5404   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5405   int produced_chars = 0;
5406   int c;
5407
5408   if (multibytep)
5409     {
5410       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5411
           /* Multibyte destination, multibyte source.  */
5412       if (coding->src_multibyte)
5413         while (charbuf < charbuf_end)
5414           {
5415             ASSURE_DESTINATION (safe_room);
5416             c = *charbuf++;
5417             if (ASCII_CHAR_P (c))
5418               EMIT_ONE_ASCII_BYTE (c);
5419             else if (CHAR_BYTE8_P (c))
5420               {
                     /* An eight-bit character is emitted as the raw byte
                        that CHAR_TO_BYTE8 gives for it.  */
5421                 c = CHAR_TO_BYTE8 (c);
5422                 EMIT_ONE_BYTE (c);
5423               }
5424             else
5425               {
                     /* Any other character: build its multibyte form in
                        STR and emit that form byte by byte.  */
5426                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5427
5428                 CHAR_STRING_ADVANCE (c, p1);
5429                 while (p0 < p1)
5430                   {
5431                     EMIT_ONE_BYTE (*p0);
5432                     p0++;
5433                   }
5434               }
5435           }
5436       else
             /* Multibyte destination, unibyte source: every charbuf
                element is emitted with EMIT_ONE_BYTE.  */
5437         while (charbuf < charbuf_end)
5438           {
5439             ASSURE_DESTINATION (safe_room);
5440             c = *charbuf++;
5441             EMIT_ONE_BYTE (c);
5442           }
5443     }
5444   else
5445     {
           /* Unibyte destination.  DST is written directly in both cases
              below, so PRODUCED_CHARS is derived from the pointer
              difference afterwards.  */
5446       if (coding->src_multibyte)
5447         {
5448           int safe_room = MAX_MULTIBYTE_LENGTH;
5449
5450           while (charbuf < charbuf_end)
5451             {
5452               ASSURE_DESTINATION (safe_room);
5453               c = *charbuf++;
5454               if (ASCII_CHAR_P (c))
5455                 *dst++ = c;
5456               else if (CHAR_BYTE8_P (c))
5457                 *dst++ = CHAR_TO_BYTE8 (c);
5458               else
5459                 CHAR_STRING_ADVANCE (c, dst);
5460             }
5461         }
5462       else
5463         {
               /* Unibyte to unibyte: room for all remaining elements is
                  requested once, then they are copied one per byte.  */
5464           ASSURE_DESTINATION (charbuf_end - charbuf);
5465           while (charbuf < charbuf_end && dst < dst_end)
5466             *dst++ = *charbuf++;
5467         }
           /* Number of bytes written by this call.  */
5468       produced_chars = dst - (coding->destination + coding->produced);
5469     }
5470   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5471   coding->produced_char += produced_chars;
5472   coding->produced = dst - coding->destination;
5473   return 0;
5474 }
5475
5476 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5477    Check if a text is encoded in a charset-based coding system.  If it
5478    is, return 1, else return 0.

        Only the source text of CODING is examined (CODING->source,
        src_bytes, src_multibyte and head_ascii); the tables consulted are
        those of the coding system registered for
        coding_category_charset.  CATEGORY_MASK_CHARSET is always added
        to DETECT_INFO->checked.  On rejection it is added to
        DETECT_INFO->rejected; on success DETECT_INFO->found gets it only
        if at least one byte >= 0x80 was accepted.

        For a coding system whose name starts with "iso-8859-" or
        "iso-latin-", a byte in the range 0x80..0x9F is accepted only if
        Vlatin_extra_code_table is a vector with a non-nil entry for it.  */
5479
5480 static int
5481 detect_coding_charset (coding, detect_info)
5482      struct coding_system *coding;
5483      struct coding_detection_info *detect_info;
5484 {
5485   const unsigned char *src = coding->source, *src_base;
5486   const unsigned char *src_end = coding->source + coding->src_bytes;
5487   int multibytep = coding->src_multibyte;
5488   int consumed_chars = 0;
5489   Lisp_Object attrs, valids, name;
5490   int found = 0;
5491   int head_ascii = coding->head_ascii;
5492   int check_latin_extra = 0;
5493
5494   detect_info->checked |= CATEGORY_MASK_CHARSET;
5495
       /* From here on CODING is the coding system of the charset
          category, not the one passed by the caller.  */
5496   coding = &coding_categories[coding_category_charset];
5497   attrs = CODING_ID_ATTRS (coding->id);
5498   valids = AREF (attrs, coding_attr_charset_valids);
5499   name = CODING_ID_NAME (coding->id);
5500   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5501                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5502       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5503                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5504     check_latin_extra = 1;
5505
       /* For an ASCII compatible coding system, skip the first
          HEAD_ASCII bytes of the source.  */
5506   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5507     src += head_ascii;
5508
5509   while (1)
5510     {
5511       int c;
5512       Lisp_Object val;
5513       struct charset *charset;
5514       int dim, idx;
5515
5516       src_base = src;
5517       ONE_MORE_BYTE (c);
5518       if (c < 0)
5519         continue;
           /* VAL is nil for a byte that cannot start a character, an
              integer for a single candidate charset ID, or otherwise a
              list of candidate charset IDs.  */
5520       val = AREF (valids, c);
5521       if (NILP (val))
5522         break;
5523       if (c >= 0x80)
5524         {
5525           if (c < 0xA0
5526               && check_latin_extra
5527               && (!VECTORP (Vlatin_extra_code_table)
5528                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5529             break;
5530           found = CATEGORY_MASK_CHARSET;
5531         }
5532       if (INTEGERP (val))
5533         {
5534           charset = CHARSET_FROM_ID (XFASTINT (val));
5535           dim = CHARSET_DIMENSION (charset);
5536           for (idx = 1; idx < dim; idx++)
5537             {
5538               if (src == src_end)
5539                 goto too_short;
5540               ONE_MORE_BYTE (c);
                   /* code_space has four slots per dimension (the first
                      two are the minimum and maximum byte), so the
                      stride is 4 here just as in the list case below.
                      A stride of 2 tests the wrong slots for charsets
                      of dimension 3 or more.  */
5541               if (c < charset->code_space[(dim - 1 - idx) * 4]
5542                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5543                 break;
5544             }
5545           if (idx < dim)
5546             break;
5547         }
5548       else
5549         {
               /* Try the candidate charsets in list order.  IDX is not
                  reset between candidates, so bytes already read and
                  accepted are not read again.  */
5550           idx = 1;
5551           for (; CONSP (val); val = XCDR (val))
5552             {
5553               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5554               dim = CHARSET_DIMENSION (charset);
5555               while (idx < dim)
5556                 {
5557                   if (src == src_end)
5558                     goto too_short;
5559                   ONE_MORE_BYTE (c);
5560                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5561                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5562                     break;
5563                   idx++;
5564                 }
5565               if (idx == dim)
5566                 {
                       /* This candidate matched; nil marks success for
                          the CONSP test after the loop.  */
5567                   val = Qnil;
5568                   break;
5569                 }
5570             }
5571           if (CONSP (val))
5572             break;
5573         }
5574     }
       /* Reached by `goto too_short' and also by breaking out of the
          loop above on an invalid byte.  */
5575  too_short:
5576   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5577   return 0;
5578
       /* NOTE(review): no jump to this label is visible in this
          function; presumably ONE_MORE_BYTE jumps here when the source
          is exhausted -- confirm.  */
5579  no_more_source:
5580   detect_info->found |= found;
5581   return 1;
5582 }
5583
     /* Decode the source of CODING, starting at offset CODING->consumed, as
        text of a charset-based coding system, storing the resulting
        characters (and charset annotations) in CODING->charbuf.

        The first byte of each character indexes the table VALIDS (the
        coding_attr_charset_valids attribute).  An integer entry is the ID
        of the only candidate charset; a cons entry is a list of candidate
        charset IDs, tried in order until CODING_DECODE_CHAR yields a
        non-negative character; any other entry makes the byte invalid.
        An invalid sequence is undone back to SRC_BASE, a single byte is
        stored instead, and CODING->errors is incremented.

        On exit CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated from SRC_BASE,
        CONSUMED_CHARS_BASE and CHARBUF.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
        jumps to `no_more_source' when the source is exhausted and uses
        the locals `src', `src_end', `multibytep' and `consumed_chars' --
        confirm.  */
5584 static void
5585 decode_coding_charset (coding)
5586      struct coding_system *coding;
5587 {
5588   const unsigned char *src = coding->source + coding->consumed;
5589   const unsigned char *src_end = coding->source + coding->src_bytes;
5590   const unsigned char *src_base;
5591   int *charbuf = coding->charbuf + coding->charbuf_used;
5592   /* We may produce one charset annotation in one loop and one more at
5593      the end.  */
5594   int *charbuf_end
5595     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5596   int consumed_chars = 0, consumed_chars_base;
5597   int multibytep = coding->src_multibyte;
5598   Lisp_Object attrs, charset_list, valids;
5599   int char_offset = coding->produced_char;
5600   int last_offset = char_offset;
5601   int last_id = charset_ascii;
5602   int eol_crlf =
5603     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* A byte already read from the source after a CR, or -1 if there
          is none pending.  */
5604   int byte_after_cr = -1;
5605
5606   CODING_GET_INFO (coding, attrs, charset_list);
5607   valids = AREF (attrs, coding_attr_charset_valids);
5608
5609   while (1)
5610     {
5611       int c;
5612       Lisp_Object val;
5613       struct charset *charset;
5614       int dim;
5615       int len = 1;
5616       unsigned code;
5617
           /* Remember where this character starts so that it can be
              undone (invalid_code) or left unconsumed (exit paths).  */
5618       src_base = src;
5619       consumed_chars_base = consumed_chars;
5620
5621       if (charbuf >= charbuf_end)
5622         {
               /* Output buffer full.  A pending byte after CR has already
                  been read from SRC, so step back to leave it
                  unconsumed.  */
5623           if (byte_after_cr >= 0)
5624             src_base--;
5625           break;
5626         }
5627
5628       if (byte_after_cr >= 0)
5629         {
5630           c = byte_after_cr;
5631           byte_after_cr = -1;
5632         }
5633       else
5634         {
5635           ONE_MORE_BYTE (c);
               /* With DOS eol decoding, fetch the byte following a CR in
                  the same iteration; it is processed in the next one.  */
5636           if (eol_crlf && c == '\r')
5637             ONE_MORE_BYTE (byte_after_cr);
5638         }
5639       if (c < 0)
5640         goto invalid_code;
5641       code = c;
5642
5643       val = AREF (valids, c);
5644       if (! INTEGERP (val) && ! CONSP (val))
5645         goto invalid_code;
5646       if (INTEGERP (val))
5647         {
5648           charset = CHARSET_FROM_ID (XFASTINT (val));
5649           dim = CHARSET_DIMENSION (charset);
               /* Accumulate the remaining bytes of the code point, most
                  significant byte first.  */
5650           while (len < dim)
5651             {
5652               ONE_MORE_BYTE (c);
5653               code = (code << 8) | c;
5654               len++;
5655             }
5656           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5657                               charset, code, c);
5658         }
5659       else
5660         {
5661           /* VAL is a list of charset IDs.  It is assured that the
5662              list is sorted by charset dimensions (smaller one
5663              comes first).  */
5664           while (CONSP (val))
5665             {
5666               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5667               dim = CHARSET_DIMENSION (charset);
5668               while (len < dim)
5669                 {
5670                   ONE_MORE_BYTE (c);
5671                   code = (code << 8) | c;
5672                   len++;
5673                 }
5674               CODING_DECODE_CHAR (coding, src, src_base,
5675                                   src_end, charset, code, c);
5676               if (c >= 0)
5677                 break;
5678               val = XCDR (val);
5679             }
5680         }
5681       if (c < 0)
5682         goto invalid_code;
           /* When the charset changes, close the run of the previous
              non-ASCII charset with an annotation and start a new run.  */
5683       if (charset->id != charset_ascii
5684           && last_id != charset->id)
5685         {
5686           if (last_id != charset_ascii)
5687             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5688           last_id = charset->id;
5689           last_offset = char_offset;
5690         }
5691
5692       *charbuf++ = c;
5693       char_offset++;
5694       continue;
5695
5696     invalid_code:
           /* Rewind to the start of the character, re-read one byte and
              store it: negated if negative, as is if ASCII, otherwise
              converted with BYTE8_TO_CHAR.  */
5697       src = src_base;
5698       consumed_chars = consumed_chars_base;
5699       ONE_MORE_BYTE (c);
5700       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5701       char_offset++;
5702       coding->errors++;
5703     }
5704
5705  no_more_source:
       /* Flush the annotation for the last run, then report progress up
          to the start of the character that was not completed.  */
5706   if (last_id != charset_ascii)
5707     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5708   coding->consumed_char += consumed_chars_base;
5709   coding->consumed = src_base - coding->source;
5710   coding->charbuf_used = charbuf - coding->charbuf;
5711 }
5712
     /* Encode the characters in CODING->charbuf with the charsets of
        CODING's charset list and append the bytes to CODING->destination.

        For each character: ASCII is emitted as is when the coding system
        is ASCII compatible; an eight-bit character is emitted as the byte
        CHAR_TO_BYTE8 gives for it; otherwise char_charset is asked for a
        charset from CHARSET_LIST and the code point in it, and the code
        is emitted most significant byte first, one byte per charset
        dimension.  A character for which char_charset returns no charset
        is replaced by CODING_INHIBIT_CHARACTER_SUBSTITUTION in
        safe-encoding mode, else by CODING->default_char.

        Records CODING_RESULT_SUCCESS, updates CODING->produced and
        CODING->produced_char, and returns 0.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
        elsewhere and presumably use the locals `dst', `dst_end',
        `multibytep' and `produced_chars' by name -- confirm before
        renaming.  */
5713 static int
5714 encode_coding_charset (coding)
5715      struct coding_system *coding;
5716 {
5717   int multibytep = coding->dst_multibyte;
5718   int *charbuf = coding->charbuf;
5719   int *charbuf_end = charbuf + coding->charbuf_used;
5720   unsigned char *dst = coding->destination + coding->produced;
5721   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5722   int safe_room = MAX_MULTIBYTE_LENGTH;
5723   int produced_chars = 0;
5724   Lisp_Object attrs, charset_list;
5725   int ascii_compatible;
5726   int c;
5727
5728   CODING_GET_INFO (coding, attrs, charset_list);
5729   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5730
5731   while (charbuf < charbuf_end)
5732     {
5733       struct charset *charset;
5734       unsigned code;
5735
5736       ASSURE_DESTINATION (safe_room);
5737       c = *charbuf++;
5738       if (ascii_compatible && ASCII_CHAR_P (c))
5739         EMIT_ONE_ASCII_BYTE (c);
5740       else if (CHAR_BYTE8_P (c))
5741         {
5742           c = CHAR_TO_BYTE8 (c);
5743           EMIT_ONE_BYTE (c);
5744         }
5745       else
5746         {
               /* CODE receives the code point of C in the returned
                  charset; a null CHARSET means none was found.  */
5747           charset = char_charset (c, charset_list, &code);
5748           if (charset)
5749             {
                   /* Emit CODE with as many bytes as the charset has
                      dimensions, most significant byte first.  */
5750               if (CHARSET_DIMENSION (charset) == 1)
5751                 EMIT_ONE_BYTE (code);
5752               else if (CHARSET_DIMENSION (charset) == 2)
5753                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5754               else if (CHARSET_DIMENSION (charset) == 3)
5755                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5756               else
5757                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5758                                  (code >> 8) & 0xFF, code & 0xFF);
5759             }
5760           else
5761             {
                   /* No charset for C: emit a substitute as one byte.  */
5762               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5763                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5764               else
5765                 c = coding->default_char;
5766               EMIT_ONE_BYTE (c);
5767             }
5768         }
5769     }
5770
5771   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5772   coding->produced_char += produced_chars;
5773   coding->produced = dst - coding->destination;
5774   return 0;
5775 }
5776
5777 \f
5778 /*** 10. C library functions ***/
5779
5780 /* Setup coding context CODING from information about CODING_SYSTEM.
5781    If CODING_SYSTEM is nil, `undecided' is assumed.  If
5782    CODING_SYSTEM is invalid, signal an error.

        The function fills in CODING->id, mode, head_ascii, common_flags,
        max_charset_id, safe_charsets, default_char and carryover_bytes,
        and then, depending on the type attribute of the coding system,
        the detector, decoder and encoder functions plus any
        type-specific state.  */
5783
5784 void
5785 setup_coding_system (coding_system, coding)
5786      Lisp_Object coding_system;
5787      struct coding_system *coding;
5788 {
5789   Lisp_Object attrs;
5790   Lisp_Object eol_type;
5791   Lisp_Object coding_type;
5792   Lisp_Object val;
5793
5794   if (NILP (coding_system))
5795     coding_system = Qundecided;
5796
5797   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5798
5799   attrs = CODING_ID_ATTRS (coding->id);
       /* With inhibit_eol_conversion set, the eol type is treated as
          unix regardless of the coding system.  */
5800   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5801
5802   coding->mode = 0;
5803   coding->head_ascii = -1;
       /* Initial flags from the eol type: a vector requires decoding and
          detection, a non-unix symbol requires decoding and encoding,
          and unix requires nothing by itself.  */
5804   if (VECTORP (eol_type))
5805     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5806                             | CODING_REQUIRE_DETECTION_MASK);
5807   else if (! EQ (eol_type, Qunix))
5808     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5809                             | CODING_REQUIRE_ENCODING_MASK);
5810   else
5811     coding->common_flags = 0;
5812   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5813     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5814   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5815     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5816   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5817     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5818
5819   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5820   coding->max_charset_id = SCHARS (val) - 1;
5821   coding->safe_charsets = SDATA (val);
5822   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5823   coding->carryover_bytes = 0;
5824
       /* Type-specific setup.  */
5825   coding_type = CODING_ATTR_TYPE (attrs);
5826   if (EQ (coding_type, Qundecided))
5827     {
           /* No detector of its own; the raw-text decoder and encoder
              are installed and detection is requested.  */
5828       coding->detector = NULL;
5829       coding->decoder = decode_coding_raw_text;
5830       coding->encoder = encode_coding_raw_text;
5831       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5832     }
5833   else if (EQ (coding_type, Qiso_2022))
5834     {
5835       int i;
5836       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5837
5838       /* Invoke graphic register 0 to plane 0.  */
5839       CODING_ISO_INVOCATION (coding, 0) = 0;
5840       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5841       CODING_ISO_INVOCATION (coding, 1)
5842         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5843       /* Setup the initial status of designation.  */
5844       for (i = 0; i < 4; i++)
5845         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5846       /* Not single shifting initially.  */
5847       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5848       /* Beginning of buffer should also be regarded as bol. */
5849       CODING_ISO_BOL (coding) = 1;
5850       coding->detector = detect_coding_iso_2022;
5851       coding->decoder = decode_coding_iso_2022;
5852       coding->encoder = encode_coding_iso_2022;
5853       if (flags & CODING_ISO_FLAG_SAFE)
5854         coding->mode |= CODING_MODE_SAFE_ENCODING;
5855       coding->common_flags
5856         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5857             | CODING_REQUIRE_FLUSHING_MASK);
5858       if (flags & CODING_ISO_FLAG_COMPOSITION)
5859         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5860       if (flags & CODING_ISO_FLAG_DESIGNATION)
5861         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5862       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5863         {
               /* The safe-charsets attribute is re-read after
                  setup_iso_safe_charsets, replacing the values stored
                  from it above.  */
5864           setup_iso_safe_charsets (attrs);
5865           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5866           coding->max_charset_id = SCHARS (val) - 1;
5867           coding->safe_charsets = SDATA (val);
5868         }
5869       CODING_ISO_FLAGS (coding) = flags;
5870       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5871       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5872       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5873       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5874     }
5875   else if (EQ (coding_type, Qcharset))
5876     {
5877       coding->detector = detect_coding_charset;
5878       coding->decoder = decode_coding_charset;
5879       coding->encoder = encode_coding_charset;
5880       coding->common_flags
5881         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5882     }
5883   else if (EQ (coding_type, Qutf_8))
5884     {
           /* BOM handling: a cons attribute means detect, t means with
              BOM, anything else means without BOM.  */
5885       val = AREF (attrs, coding_attr_utf_bom);
5886       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5887                                    : EQ (val, Qt) ? utf_with_bom
5888                                    : utf_without_bom);
5889       coding->detector = detect_coding_utf_8;
5890       coding->decoder = decode_coding_utf_8;
5891       coding->encoder = encode_coding_utf_8;
5892       coding->common_flags
5893         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5894       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5895         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5896     }
5897   else if (EQ (coding_type, Qutf_16))
5898     {
5899       val = AREF (attrs, coding_attr_utf_bom);
5900       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5901                                     : EQ (val, Qt) ? utf_with_bom
5902                                     : utf_without_bom);
           /* Any endian attribute other than `big' selects little
              endian.  */
5903       val = AREF (attrs, coding_attr_utf_16_endian);
5904       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5905                                        : utf_16_little_endian);
5906       CODING_UTF_16_SURROGATE (coding) = 0;
5907       coding->detector = detect_coding_utf_16;
5908       coding->decoder = decode_coding_utf_16;
5909       coding->encoder = encode_coding_utf_16;
5910       coding->common_flags
5911         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5912       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5913         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5914     }
5915   else if (EQ (coding_type, Qccl))
5916     {
5917       coding->detector = detect_coding_ccl;
5918       coding->decoder = decode_coding_ccl;
5919       coding->encoder = encode_coding_ccl;
5920       coding->common_flags
5921         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5922             | CODING_REQUIRE_FLUSHING_MASK);
5923     }
5924   else if (EQ (coding_type, Qemacs_mule))
5925     {
5926       coding->detector = detect_coding_emacs_mule;
5927       coding->decoder = decode_coding_emacs_mule;
5928       coding->encoder = encode_coding_emacs_mule;
5929       coding->common_flags
5930         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5931       coding->spec.emacs_mule.full_support = 1;
5932       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5933           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5934         {
5935           Lisp_Object tail, safe_charsets;
5936           int max_charset_id = 0;
5937
               /* Build a fresh safe-charsets string sized by the largest
                  ID in Vemacs_mule_charset_list: every byte is 255
                  except those of the listed charsets, which are 0.  */
5938           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5939                tail = XCDR (tail))
5940             if (max_charset_id < XFASTINT (XCAR (tail)))
5941               max_charset_id = XFASTINT (XCAR (tail));
5942           safe_charsets = make_uninit_string (max_charset_id + 1);
5943           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5944           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5945                tail = XCDR (tail))
5946             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5947           coding->max_charset_id = max_charset_id;
5948           coding->safe_charsets = SDATA (safe_charsets);
               /* NOTE(review): full_support was already set to 1 before
                  this `if', so this assignment changes nothing --
                  confirm which of the two is intended.  */
5949           coding->spec.emacs_mule.full_support = 1;
5950         }
5951       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5952       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5953     }
5954   else if (EQ (coding_type, Qshift_jis))
5955     {
5956       coding->detector = detect_coding_sjis;
5957       coding->decoder = decode_coding_sjis;
5958       coding->encoder = encode_coding_sjis;
5959       coding->common_flags
5960         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5961     }
5962   else if (EQ (coding_type, Qbig5))
5963     {
5964       coding->detector = detect_coding_big5;
5965       coding->decoder = decode_coding_big5;
5966       coding->encoder = encode_coding_big5;
5967       coding->common_flags
5968         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5969     }
5970   else                          /* EQ (coding_type, Qraw_text) */
5971     {
5972       coding->detector = NULL;
5973       coding->decoder = decode_coding_raw_text;
5974       coding->encoder = encode_coding_raw_text;
           /* Conversion is required only for eol handling: decoding for
              any non-unix eol type, encoding only when the eol type is
              a decided (non-vector) one.  */
5975       if (! EQ (eol_type, Qunix))
5976         {
5977           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5978           if (! VECTORP (eol_type))
5979             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5980         }
5981
5982     }
5983
5984   return;
5985 }
5986
5987 /* Return the list of charsets that CODING supports.  An emacs-mule
5988    coding, and an ISO-2022 coding with the full-support flag, use the
5989    global lists; otherwise the list from CODING_GET_INFO is returned.  */
5990
5991 Lisp_Object
5992 coding_charset_list (coding)
5993      struct coding_system *coding;
5994 {
5995   Lisp_Object spec_attrs, charsets, type;
5996
5997   CODING_GET_INFO (coding, spec_attrs, charsets);
5998   type = CODING_ATTR_TYPE (spec_attrs);
5999
6000   if (EQ (type, Qemacs_mule))
6001     return Vemacs_mule_charset_list;
6002
6003   if (EQ (type, Qiso_2022)
6004       && (XINT (AREF (spec_attrs, coding_attr_iso_flags))
6005           & CODING_ISO_FLAG_FULL_SUPPORT))
6006     return Viso_2022_charset_list;
6007   return charsets;
6008 }
6009
6010
6011 /* Return the list of charsets supported by CODING-SYSTEM, which is
6012    checked and converted to an ID by CHECK_CODING_SYSTEM_GET_ID.
6013
6014    The result is chosen as follows:
6015
6016      - emacs-mule type: Vemacs_mule_charset_list;
6017      - iso-2022 type with CODING_ISO_FLAG_FULL_SUPPORT set:
6018        Viso_2022_charset_list;
6019      - anything else: the charset list recorded in the attributes.  */
6020
6021 Lisp_Object
6022 coding_system_charset_list (coding_system)
6023      Lisp_Object coding_system;
6024 {
6025   Lisp_Object spec_attrs, type;
6026   int coding_id;
6027
6028   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding_id);
6029   spec_attrs = CODING_ID_ATTRS (coding_id);
6030   type = CODING_ATTR_TYPE (spec_attrs);
6031
6032   if (EQ (type, Qemacs_mule))
6033     return Vemacs_mule_charset_list;
6034
6035   if (EQ (type, Qiso_2022)
6036       && (XINT (AREF (spec_attrs, coding_attr_iso_flags))
6037           & CODING_ISO_FLAG_FULL_SUPPORT))
6038     return Viso_2022_charset_list;
6039
6040   return CODING_ATTR_CHARSET_LIST (spec_attrs);
6041 }
6042
6043
6044 /* Map CODING-SYSTEM to the raw-text variant with the same eol_type.
6045    Nil, or an eol_type that is still a vector, gives plain raw-text;
6046    a coding system of raw-text type is returned unchanged.  */
6047
6048 Lisp_Object
6049 raw_text_coding_system (coding_system)
6050      Lisp_Object coding_system;
6051 {
6052   Lisp_Object cs_spec, cs_eol, raw_spec;
6053   int idx;
6054
6055   if (NILP (coding_system))
6056     return Qraw_text;
6057
6058   cs_spec = CODING_SYSTEM_SPEC (coding_system);
6059   if (EQ (CODING_ATTR_TYPE (AREF (cs_spec, 0)), Qraw_text))
6060     return coding_system;
6061
6062   cs_eol = AREF (cs_spec, 2);
6063   if (VECTORP (cs_eol))
6064     return Qraw_text;
6065
6066   /* Subsidiary index: 0 for unix, 1 for dos, 2 for anything else.  */
6067   idx = EQ (cs_eol, Qunix) ? 0 : EQ (cs_eol, Qdos) ? 1 : 2;
6068   raw_spec = CODING_SYSTEM_SPEC (Qraw_text);
6069   return AREF (AREF (raw_spec, 2), idx);
6070 }
6071
6072
6073 /* Resolve the end-of-line format of CODING_SYSTEM.
6074
6075    When CODING_SYSTEM (nil stands for raw-text) already specifies an
6076    end-of-line format, return it as is.  Otherwise its eol_type is a
6077    vector of subsidiaries, and the one matching the eol format of
6078    PARENT is returned; with a nil PARENT the system's setting
6079    (system_eol_type) is used.  If that format is none of unix, dos
6080    and mac, CODING_SYSTEM itself is returned.  */
6081
6082 Lisp_Object
6083 coding_inherit_eol_type (coding_system, parent)
6084      Lisp_Object coding_system, parent;
6085 {
6086   Lisp_Object spec, subsidiaries, wanted_eol;
6087
6088   if (NILP (coding_system))
6089     coding_system = Qraw_text;
6090   spec = CODING_SYSTEM_SPEC (coding_system);
6091   subsidiaries = AREF (spec, 2);
6092
6093   /* A non-vector eol_type is already decided; nothing to inherit.  */
6094   if (! VECTORP (subsidiaries))
6095     return coding_system;
6096
6097   if (NILP (parent))
6098     wanted_eol = system_eol_type;
6099   else
6100     {
6101       spec = CODING_SYSTEM_SPEC (parent);
6102       wanted_eol = AREF (spec, 2);
6103     }
6104
6105   if (EQ (wanted_eol, Qunix))
6106     return AREF (subsidiaries, 0);
6107   if (EQ (wanted_eol, Qdos))
6108     return AREF (subsidiaries, 1);
6109   return EQ (wanted_eol, Qmac) ? AREF (subsidiaries, 2) : coding_system;
6110 }
6111
6112 /* Emacs has a mechanism to automatically detect a coding system if it
6113    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6114    it's impossible to distinguish some coding systems accurately
6115    because they use the same range of codes.  So, at first, coding
6116    systems are categorized into the following categories:
6117
6118    o coding-category-emacs-mule
6119
6120         The category for a coding system which has the same code range
6121         as Emacs' internal format.  Assigned the coding-system (Lisp
6122         symbol) `emacs-mule' by default.
6123
6124    o coding-category-sjis
6125
6126         The category for a coding system which has the same code range
6127         as SJIS.  Assigned the coding-system (Lisp
6128         symbol) `japanese-shift-jis' by default.
6129
6130    o coding-category-iso-7
6131
6132         The category for a coding system which has the same code range
6133         as ISO2022 of 7-bit environment.  This doesn't use any locking
6134         shift and single shift functions.  This can encode/decode all
6135         charsets.  Assigned the coding-system (Lisp symbol)
6136         `iso-2022-7bit' by default.
6137
6138    o coding-category-iso-7-tight
6139
6140         Same as coding-category-iso-7 except that this can
6141         encode/decode only the specified charsets.
6142
6143    o coding-category-iso-8-1
6144
6145         The category for a coding system which has the same code range
6146         as ISO2022 of 8-bit environment and graphic plane 1 used only
6147         for DIMENSION1 charset.  This doesn't use any locking shift
6148         and single shift functions.  Assigned the coding-system (Lisp
6149         symbol) `iso-latin-1' by default.
6150
6151    o coding-category-iso-8-2
6152
6153         The category for a coding system which has the same code range
6154         as ISO2022 of 8-bit environment and graphic plane 1 used only
6155         for DIMENSION2 charset.  This doesn't use any locking shift
6156         and single shift functions.  Assigned the coding-system (Lisp
6157         symbol) `japanese-iso-8bit' by default.
6158
6159    o coding-category-iso-7-else
6160
6161         The category for a coding system which has the same code range
6162         as ISO2022 of 7-bit environment but uses locking shift or
6163         single shift functions.  Assigned the coding-system (Lisp
6164         symbol) `iso-2022-7bit-lock' by default.
6165
6166    o coding-category-iso-8-else
6167
6168         The category for a coding system which has the same code range
6169         as ISO2022 of 8-bit environment but uses locking shift or
6170         single shift functions.  Assigned the coding-system (Lisp
6171         symbol) `iso-2022-8bit-ss2' by default.
6172
6173    o coding-category-big5
6174
6175         The category for a coding system which has the same code range
6176         as BIG5.  Assigned the coding-system (Lisp symbol)
6177         `cn-big5' by default.
6178
6179    o coding-category-utf-8
6180
6181         The category for a coding system which has the same code range
6182         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6183         symbol) `utf-8' by default.
6184
6185    o coding-category-utf-16-be
6186
6187         The category for a coding system in which a text has an
6188         Unicode signature (cf. Unicode Standard) in the order of BIG
6189         endian at the head.  Assigned the coding-system (Lisp symbol)
6190         `utf-16-be' by default.
6191
6192    o coding-category-utf-16-le
6193
6194         The category for a coding system in which a text has an
6195         Unicode signature (cf. Unicode Standard) in the order of
6196         LITTLE endian at the head.  Assigned the coding-system (Lisp
6197         symbol) `utf-16-le' by default.
6198
6199    o coding-category-ccl
6200
6201         The category for a coding system of which encoder/decoder is
6202         written in CCL programs.  The default value is nil, i.e., no
6203         coding system is assigned.
6204
6205    o coding-category-binary
6206
6207         The category for a coding system not categorized in any of the
6208         above.  Assigned the coding-system (Lisp symbol)
6209         `no-conversion' by default.
6210
6211    Each of them is a Lisp symbol and the value is an actual
6212    `coding-system's (this is also a Lisp symbol) assigned by a user.
6213    What Emacs does actually is to detect a category of coding system.
6214    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6215    decide only one possible category, it selects a category of the
6216    highest priority.  Priorities of categories are also specified by a
6217    user in a Lisp variable `coding-category-list'.
6218
6219 */
6220
6221 #define EOL_SEEN_NONE   0       /* No end-of-line has been seen.  */
6222 #define EOL_SEEN_LF     1       /* LF.  */
6223 #define EOL_SEEN_CR     2       /* CR not followed by LF.  */
6224 #define EOL_SEEN_CRLF   4       /* CR immediately followed by LF.  */
6225
6226 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6227    SOURCE is encoded.  If CATEGORY is one of
6228    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6229    two-byte, else they are encoded by one-byte.
6230
6231    Return one of EOL_SEEN_XXX.  */
6232
6233 #define MAX_EOL_CHECK_COUNT 3   /* detect_eol stops scanning once it has counted this many end-of-lines.  */
6234
6235 static int
6236 detect_eol (source, src_bytes, category)
6237      const unsigned char *source;
6238      EMACS_INT src_bytes;
6239      enum coding_category category;
6240 {
6241   const unsigned char *src = source, *src_end = src + src_bytes;
6242   unsigned char c;
6243   int total  = 0;
6244   int eol_seen = EOL_SEEN_NONE;
6245
6246   if ((1 << category) & CATEGORY_MASK_UTF_16)
6247     {
6248       int msb, lsb;
6249
6250       msb = category == (coding_category_utf_16_le
6251                          | coding_category_utf_16_le_nosig);
6252       lsb = 1 - msb;
6253
6254       while (src + 1 < src_end)
6255         {
6256           c = src[lsb];
6257           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6258             {
6259               int this_eol;
6260
6261               if (c == '\n')
6262                 this_eol = EOL_SEEN_LF;
6263               else if (src + 3 >= src_end
6264                        || src[msb + 2] != 0
6265                        || src[lsb + 2] != '\n')
6266                 this_eol = EOL_SEEN_CR;
6267               else
6268                 {
6269                   this_eol = EOL_SEEN_CRLF;
6270                   src += 2;
6271                 }
6272
6273               if (eol_seen == EOL_SEEN_NONE)
6274                 /* This is the first end-of-line.  */
6275                 eol_seen = this_eol;
6276               else if (eol_seen != this_eol)
6277                 {
6278                   /* The found type is different from what found before.
6279                      Allow for stray ^M characters in DOS EOL files.  */
6280                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6281                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6282                     eol_seen = EOL_SEEN_CRLF;
6283                   else
6284                     {
6285                       eol_seen = EOL_SEEN_LF;
6286                       break;
6287                     }
6288                 }
6289               if (++total == MAX_EOL_CHECK_COUNT)
6290                 break;
6291             }
6292           src += 2;
6293         }
6294     }
6295   else
6296     {
6297       while (src < src_end)
6298         {
6299           c = *src++;
6300           if (c == '\n' || c == '\r')
6301             {
6302               int this_eol;
6303
6304               if (c == '\n')
6305                 this_eol = EOL_SEEN_LF;
6306               else if (src >= src_end || *src != '\n')
6307                 this_eol = EOL_SEEN_CR;
6308               else
6309                 this_eol = EOL_SEEN_CRLF, src++;
6310
6311               if (eol_seen == EOL_SEEN_NONE)
6312                 /* This is the first end-of-line.  */
6313                 eol_seen = this_eol;
6314               else if (eol_seen != this_eol)
6315                 {
6316                   /* The found type is different from what found before.
6317                      Allow for stray ^M characters in DOS EOL files.  */
6318                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6319                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6320                     eol_seen = EOL_SEEN_CRLF;
6321                   else
6322                     {
6323                       eol_seen = EOL_SEEN_LF;
6324                       break;
6325                     }
6326                 }
6327               if (++total == MAX_EOL_CHECK_COUNT)
6328                 break;
6329             }
6330         }
6331     }
6332   return eol_seen;
6333 }
6334
6335
6336 static Lisp_Object
6337 adjust_coding_eol_type (coding, eol_seen)
6338      struct coding_system *coding;
6339      int eol_seen;
6340 {
6341   Lisp_Object eol_type;
6342
6343   eol_type = CODING_ID_EOL_TYPE (coding->id);
6344   if (eol_seen & EOL_SEEN_LF)
6345     {
6346       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6347       eol_type = Qunix;
6348     }
6349   else if (eol_seen & EOL_SEEN_CRLF)
6350     {
6351       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6352       eol_type = Qdos;
6353     }
6354   else if (eol_seen & EOL_SEEN_CR)
6355     {
6356       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6357       eol_type = Qmac;
6358     }
6359   return eol_type;
6360 }
6361
6362 /* Detect how a text specified in CODING is encoded.  If a coding
6363    system is detected, update fields of CODING by the detected coding
6364    system.  */
6365
6366 void
6367 detect_coding (coding)
6368      struct coding_system *coding;
6369 {
6370   const unsigned char *src, *src_end;
6371   int saved_mode = coding->mode;
6372
6373   coding->consumed = coding->consumed_char = 0;
6374   coding->produced = coding->produced_char = 0;
6375   coding_set_source (coding);
6376
6377   src_end = coding->source + coding->src_bytes;
6378   coding->head_ascii = 0;
6379
6380   /* If we have not yet decided the text encoding type, detect it
6381      now.  */
6382   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6383     {
6384       int c, i;
6385       struct coding_detection_info detect_info;
6386       int null_byte_found = 0, eight_bit_found = 0;
6387
6388       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6389       for (src = coding->source; src < src_end; src++)
6390         {
6391           c = *src;
6392           if (c & 0x80)
6393             {
6394               eight_bit_found = 1;
6395               if (null_byte_found)
6396                 break;
6397             }
6398           else if (c < 0x20)
6399             {
6400               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6401                   && ! inhibit_iso_escape_detection
6402                   && ! detect_info.checked)
6403                 {
6404                   if (detect_coding_iso_2022 (coding, &detect_info))
6405                     {
6406                       /* We have scanned the whole data.  */
6407                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6408                         {
6409                           /* We didn't find an 8-bit code.  We may
6410                              have found a null-byte, but it's very
6411                              rare that a binary file confirm to
6412                              ISO-2022.  */
6413                           src = src_end;
6414                           coding->head_ascii = src - coding->source;
6415                         }
6416                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6417                       break;
6418                     }
6419                 }
6420               else if (! c && !inhibit_null_byte_detection)
6421                 {
6422                   null_byte_found = 1;
6423                   if (eight_bit_found)
6424                     break;
6425                 }
6426               if (! eight_bit_found)
6427                 coding->head_ascii++;
6428             }
6429           else if (! eight_bit_found)
6430             coding->head_ascii++;
6431         }
6432
6433       if (null_byte_found || eight_bit_found
6434           || coding->head_ascii < coding->src_bytes
6435           || detect_info.found)
6436         {
6437           enum coding_category category;
6438           struct coding_system *this;
6439
6440           if (coding->head_ascii == coding->src_bytes)
6441             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6442             for (i = 0; i < coding_category_raw_text; i++)
6443               {
6444                 category = coding_priorities[i];
6445                 this = coding_categories + category;
6446                 if (detect_info.found & (1 << category))
6447                   break;
6448               }
6449           else
6450             {
6451               if (null_byte_found)
6452                 {
6453                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6454                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6455                 }
6456               for (i = 0; i < coding_category_raw_text; i++)
6457                 {
6458                   category = coding_priorities[i];
6459                   this = coding_categories + category;
6460                   if (this->id < 0)
6461                     {
6462                       /* No coding system of this category is defined.  */
6463                       detect_info.rejected |= (1 << category);
6464                     }
6465                   else if (category >= coding_category_raw_text)
6466                     continue;
6467                   else if (detect_info.checked & (1 << category))
6468                     {
6469                       if (detect_info.found & (1 << category))
6470                         break;
6471                     }
6472                   else if ((*(this->detector)) (coding, &detect_info)
6473                            && detect_info.found & (1 << category))
6474                     {
6475                       if (category == coding_category_utf_16_auto)
6476                         {
6477                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6478                             category = coding_category_utf_16_le;
6479                           else
6480                             category = coding_category_utf_16_be;
6481                         }
6482                       break;
6483                     }
6484                 }
6485             }
6486
6487           if (i < coding_category_raw_text)
6488             setup_coding_system (CODING_ID_NAME (this->id), coding);
6489           else if (null_byte_found)
6490             setup_coding_system (Qno_conversion, coding);
6491           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6492                    == CATEGORY_MASK_ANY)
6493             setup_coding_system (Qraw_text, coding);
6494           else if (detect_info.rejected)
6495             for (i = 0; i < coding_category_raw_text; i++)
6496               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6497                 {
6498                   this = coding_categories + coding_priorities[i];
6499                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6500                   break;
6501                 }
6502         }
6503     }
6504   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6505            == coding_category_utf_8_auto)
6506     {
6507       Lisp_Object coding_systems;
6508       struct coding_detection_info detect_info;
6509
6510       coding_systems
6511         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6512       detect_info.found = detect_info.rejected = 0;
6513       coding->head_ascii = 0;
6514       if (CONSP (coding_systems)
6515           && detect_coding_utf_8 (coding, &detect_info))
6516         {
6517           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6518             setup_coding_system (XCAR (coding_systems), coding);
6519           else
6520             setup_coding_system (XCDR (coding_systems), coding);
6521         }
6522     }
6523   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6524            == coding_category_utf_16_auto)
6525     {
6526       Lisp_Object coding_systems;
6527       struct coding_detection_info detect_info;
6528
6529       coding_systems
6530         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6531       detect_info.found = detect_info.rejected = 0;
6532       coding->head_ascii = 0;
6533       if (CONSP (coding_systems)
6534           && detect_coding_utf_16 (coding, &detect_info))
6535         {
6536           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6537             setup_coding_system (XCAR (coding_systems), coding);
6538           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6539             setup_coding_system (XCDR (coding_systems), coding);
6540         }
6541     }
6542   coding->mode = saved_mode;
6543 }
6544
6545
6546 static void
6547 decode_eol (coding)
6548      struct coding_system *coding;
6549 {
6550   Lisp_Object eol_type;
6551   unsigned char *p, *pbeg, *pend;
6552
6553   eol_type = CODING_ID_EOL_TYPE (coding->id);
6554   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6555     return;
6556
6557   if (NILP (coding->dst_object))
6558     pbeg = coding->destination;
6559   else
6560     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6561   pend = pbeg + coding->produced;
6562
6563   if (VECTORP (eol_type))
6564     {
6565       int eol_seen = EOL_SEEN_NONE;
6566
6567       for (p = pbeg; p < pend; p++)
6568         {
6569           if (*p == '\n')
6570             eol_seen |= EOL_SEEN_LF;
6571           else if (*p == '\r')
6572             {
6573               if (p + 1 < pend && *(p + 1) == '\n')
6574                 {
6575                   eol_seen |= EOL_SEEN_CRLF;
6576                   p++;
6577                 }
6578               else
6579                 eol_seen |= EOL_SEEN_CR;
6580             }
6581         }
6582       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6583       if ((eol_seen & EOL_SEEN_CRLF) != 0
6584           && (eol_seen & EOL_SEEN_CR) != 0
6585           && (eol_seen & EOL_SEEN_LF) == 0)
6586         eol_seen = EOL_SEEN_CRLF;
6587       else if (eol_seen != EOL_SEEN_NONE
6588           && eol_seen != EOL_SEEN_LF
6589           && eol_seen != EOL_SEEN_CRLF
6590           && eol_seen != EOL_SEEN_CR)
6591         eol_seen = EOL_SEEN_LF;
6592       if (eol_seen != EOL_SEEN_NONE)
6593         eol_type = adjust_coding_eol_type (coding, eol_seen);
6594     }
6595
6596   if (EQ (eol_type, Qmac))
6597     {
6598       for (p = pbeg; p < pend; p++)
6599         if (*p == '\r')
6600           *p = '\n';
6601     }
6602   else if (EQ (eol_type, Qdos))
6603     {
6604       int n = 0;
6605
6606       if (NILP (coding->dst_object))
6607         {
6608           /* Start deleting '\r' from the tail to minimize the memory
6609              movement.  */
6610           for (p = pend - 2; p >= pbeg; p--)
6611             if (*p == '\r')
6612               {
6613                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6614                 n++;
6615               }
6616         }
6617       else
6618         {
6619           int pos_byte = coding->dst_pos_byte;
6620           int pos = coding->dst_pos;
6621           int pos_end = pos + coding->produced_char - 1;
6622
6623           while (pos < pos_end)
6624             {
6625               p = BYTE_POS_ADDR (pos_byte);
6626               if (*p == '\r' && p[1] == '\n')
6627                 {
6628                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6629                   n++;
6630                   pos_end--;
6631                 }
6632               pos++;
6633               if (coding->dst_multibyte)
6634                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6635               else
6636                 pos_byte++;
6637             }
6638         }
6639       coding->produced -= n;
6640       coding->produced_char -= n;
6641     }
6642 }
6643
6644
6645 /* Return a translation table (or list of them) from coding system
6646    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6647    decoding (ENCODEP is zero). */
6648
6649 static Lisp_Object
6650 get_translation_table (attrs, encodep, max_lookup)
6651      Lisp_Object attrs;
6652      int encodep, *max_lookup;
6653 {
6654   Lisp_Object standard, translation_table;
6655   Lisp_Object val;
6656
6657   if (NILP (Venable_character_translation))
6658     {
6659       if (max_lookup)
6660         *max_lookup = 0;
6661       return Qnil;
6662     }
6663   if (encodep)
6664     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6665       standard = Vstandard_translation_table_for_encode;
6666   else
6667     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6668       standard = Vstandard_translation_table_for_decode;
6669   if (NILP (translation_table))
6670     translation_table = standard;
6671   else
6672     {
6673       if (SYMBOLP (translation_table))
6674         translation_table = Fget (translation_table, Qtranslation_table);
6675       else if (CONSP (translation_table))
6676         {
6677           translation_table = Fcopy_sequence (translation_table);
6678           for (val = translation_table; CONSP (val); val = XCDR (val))
6679             if (SYMBOLP (XCAR (val)))
6680               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6681         }
6682       if (CHAR_TABLE_P (standard))
6683         {
6684           if (CONSP (translation_table))
6685             translation_table = nconc2 (translation_table,
6686                                         Fcons (standard, Qnil));
6687           else
6688             translation_table = Fcons (translation_table,
6689                                        Fcons (standard, Qnil));
6690         }
6691     }
6692
6693   if (max_lookup)
6694     {
6695       *max_lookup = 1;
6696       if (CHAR_TABLE_P (translation_table)
6697           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6698         {
6699           val = XCHAR_TABLE (translation_table)->extras[1];
6700           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6701             *max_lookup = XFASTINT (val);
6702         }
6703       else if (CONSP (translation_table))
6704         {
6705           Lisp_Object tail, val;
6706
6707           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6708             if (CHAR_TABLE_P (XCAR (tail))
6709                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6710               {
6711                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6712                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6713                   *max_lookup = XFASTINT (val);
6714               }
6715         }
6716     }
6717   return translation_table;
6718 }
6719
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and leave the outcome in C and TRANS, both of which
   must be lvalues.

   Whenever a table maps C to a character, C is replaced by that
   character and TRANS is set to Qnil.  With a list, the tables are
   tried in order, each with the possibly already replaced C, and the
   search stops at the first table that yields a non-nil,
   non-character value, which is left in TRANS.  If TABLE is neither a
   char-table nor a list, TRANS is Qnil and C is unchanged.

   TABLE, C and TRANS are evaluated more than once, so they must be
   free of side effects.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    (trans) = Qnil;                                             \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        (trans) = CHAR_TABLE_REF ((table), (c));                \
        if (CHARACTERP (trans))                                 \
          (c) = XFASTINT (trans), (trans) = Qnil;               \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object ltt_tail;                                   \
                                                                \
        for (ltt_tail = (table); CONSP (ltt_tail);              \
             ltt_tail = XCDR (ltt_tail))                        \
          if (CHAR_TABLE_P (XCAR (ltt_tail)))                   \
            {                                                   \
              (trans) = CHAR_TABLE_REF (XCAR (ltt_tail), (c));  \
              if (CHARACTERP (trans))                           \
                (c) = XFASTINT (trans), (trans) = Qnil;         \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
6744
6745
6746 /* Return a translation of character(s) at BUF according to TRANS.
6747    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6748    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6749    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6750    translation is found, and Qnil if not found..
6751    If BUF is too short to lookup characters in FROM, return Qt.  */
6752
6753 static Lisp_Object
6754 get_translation (trans, buf, buf_end)
6755      Lisp_Object trans;
6756      int *buf, *buf_end;
6757 {
6758
6759   if (INTEGERP (trans))
6760     return trans;
6761   for (; CONSP (trans); trans = XCDR (trans))
6762     {
6763       Lisp_Object val = XCAR (trans);
6764       Lisp_Object from = XCAR (val);
6765       int len = ASIZE (from);
6766       int i;
6767
6768       for (i = 0; i < len; i++)
6769         {
6770           if (buf + i == buf_end)
6771             return Qt;
6772           if (XINT (AREF (from, i)) != buf[i])
6773             break;
6774         }
6775       if (i == len)
6776         return val;
6777     }
6778   return Qnil;
6779 }
6780
6781
6782 static int
6783 produce_chars (coding, translation_table, last_block)
6784      struct coding_system *coding;
6785      Lisp_Object translation_table;
6786      int last_block;
6787 {
6788   unsigned char *dst = coding->destination + coding->produced;
6789   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6790   EMACS_INT produced;
6791   EMACS_INT produced_chars = 0;
6792   int carryover = 0;
6793
6794   if (! coding->chars_at_source)
6795     {
6796       /* Source characters are in coding->charbuf.  */
6797       int *buf = coding->charbuf;
6798       int *buf_end = buf + coding->charbuf_used;
6799
6800       if (EQ (coding->src_object, coding->dst_object))
6801         {
6802           coding_set_source (coding);
6803           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6804         }
6805
6806       while (buf < buf_end)
6807         {
6808           int c = *buf, i;
6809
6810           if (c >= 0)
6811             {
6812               int from_nchars = 1, to_nchars = 1;
6813               Lisp_Object trans = Qnil;
6814
6815               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6816               if (! NILP (trans))
6817                 {
6818                   trans = get_translation (trans, buf, buf_end);
6819                   if (INTEGERP (trans))
6820                     c = XINT (trans);
6821                   else if (CONSP (trans))
6822                     {
6823                       from_nchars = ASIZE (XCAR (trans));
6824                       trans = XCDR (trans);
6825                       if (INTEGERP (trans))
6826                         c = XINT (trans);
6827                       else
6828                         {
6829                           to_nchars = ASIZE (trans);
6830                           c = XINT (AREF (trans, 0));
6831                         }
6832                     }
6833                   else if (EQ (trans, Qt) && ! last_block)
6834                     break;
6835                 }
6836
6837               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6838                 {
6839                   dst = alloc_destination (coding,
6840                                            buf_end - buf
6841                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6842                                            dst);
6843                   if (EQ (coding->src_object, coding->dst_object))
6844                     {
6845                       coding_set_source (coding);
6846                       dst_end = (((unsigned char *) coding->source)
6847                                  + coding->consumed);
6848                     }
6849                   else
6850                     dst_end = coding->destination + coding->dst_bytes;
6851                 }
6852
6853               for (i = 0; i < to_nchars; i++)
6854                 {
6855                   if (i > 0)
6856                     c = XINT (AREF (trans, i));
6857                   if (coding->dst_multibyte
6858                       || ! CHAR_BYTE8_P (c))
6859                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6860                   else
6861                     *dst++ = CHAR_TO_BYTE8 (c);
6862                 }
6863               produced_chars += to_nchars;
6864               buf += from_nchars;
6865             }
6866           else
6867             /* This is an annotation datum.  (-C) is the length.  */
6868             buf += -c;
6869         }
6870       carryover = buf_end - buf;
6871     }
6872   else
6873     {
6874       /* Source characters are at coding->source.  */
6875       const unsigned char *src = coding->source;
6876       const unsigned char *src_end = src + coding->consumed;
6877
6878       if (EQ (coding->dst_object, coding->src_object))
6879         dst_end = (unsigned char *) src;
6880       if (coding->src_multibyte != coding->dst_multibyte)
6881         {
6882           if (coding->src_multibyte)
6883             {
6884               int multibytep = 1;
6885               EMACS_INT consumed_chars = 0;
6886
6887               while (1)
6888                 {
6889                   const unsigned char *src_base = src;
6890                   int c;
6891
6892                   ONE_MORE_BYTE (c);
6893                   if (dst == dst_end)
6894                     {
6895                       if (EQ (coding->src_object, coding->dst_object))
6896                         dst_end = (unsigned char *) src;
6897                       if (dst == dst_end)
6898                         {
6899                           EMACS_INT offset = src - coding->source;
6900
6901                           dst = alloc_destination (coding, src_end - src + 1,
6902                                                    dst);
6903                           dst_end = coding->destination + coding->dst_bytes;
6904                           coding_set_source (coding);
6905                           src = coding->source + offset;
6906                           src_end = coding->source + coding->src_bytes;
6907                           if (EQ (coding->src_object, coding->dst_object))
6908                             dst_end = (unsigned char *) src;
6909                         }
6910                     }
6911                   *dst++ = c;
6912                   produced_chars++;
6913                 }
6914             no_more_source:
6915               ;
6916             }
6917           else
6918             while (src < src_end)
6919               {
6920                 int multibytep = 1;
6921                 int c = *src++;
6922
6923                 if (dst >= dst_end - 1)
6924                   {
6925                     if (EQ (coding->src_object, coding->dst_object))
6926                       dst_end = (unsigned char *) src;
6927                     if (dst >= dst_end - 1)
6928                       {
6929                         EMACS_INT offset = src - coding->source;
6930                         EMACS_INT more_bytes;
6931
6932                         if (EQ (coding->src_object, coding->dst_object))
6933                           more_bytes = ((src_end - src) / 2) + 2;
6934                         else
6935                           more_bytes = src_end - src + 2;
6936                         dst = alloc_destination (coding, more_bytes, dst);
6937                         dst_end = coding->destination + coding->dst_bytes;
6938                         coding_set_source (coding);
6939                         src = coding->source + offset;
6940                         src_end = coding->source + coding->src_bytes;
6941                         if (EQ (coding->src_object, coding->dst_object))
6942                           dst_end = (unsigned char *) src;
6943                       }
6944                   }
6945                 EMIT_ONE_BYTE (c);
6946               }
6947         }
6948       else
6949         {
6950           if (!EQ (coding->src_object, coding->dst_object))
6951             {
6952               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6953
6954               if (require > 0)
6955                 {
6956                   EMACS_INT offset = src - coding->source;
6957
6958                   dst = alloc_destination (coding, require, dst);
6959                   coding_set_source (coding);
6960                   src = coding->source + offset;
6961                   src_end = coding->source + coding->src_bytes;
6962                 }
6963             }
6964           produced_chars = coding->consumed_char;
6965           while (src < src_end)
6966             *dst++ = *src++;
6967         }
6968     }
6969
6970   produced = dst - (coding->destination + coding->produced);
6971   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6972     insert_from_gap (produced_chars, produced);
6973   coding->produced += produced;
6974   coding->produced_char += produced_chars;
6975   return carryover;
6976 }
6977
6978 /* Compose text in CODING->object according to the annotation data at
6979    CHARBUF.  CHARBUF is an array:
6980      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6981  */
6982
6983 static INLINE void
6984 produce_composition (coding, charbuf, pos)
6985      struct coding_system *coding;
6986      int *charbuf;
6987      EMACS_INT pos;
6988 {
6989   int len;
6990   EMACS_INT to;
6991   enum composition_method method;
6992   Lisp_Object components;
6993
6994   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6995   to = pos + charbuf[2];
6996   method = (enum composition_method) (charbuf[4]);
6997
6998   if (method == COMPOSITION_RELATIVE)
6999     components = Qnil;
7000   else
7001     {
7002       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7003       int i, j;
7004
7005       if (method == COMPOSITION_WITH_RULE)
7006         len = charbuf[2] * 3 - 2;
7007       charbuf += MAX_ANNOTATION_LENGTH;
7008       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7009       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7010         {
7011           if (charbuf[i] >= 0)
7012             args[j] = make_number (charbuf[i]);
7013           else
7014             {
7015               i++;
7016               args[j] = make_number (charbuf[i] % 0x100);
7017             }
7018         }
7019       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7020     }
7021   compose_text (pos, to, components, Qnil, coding->dst_object);
7022 }
7023
7024
7025 /* Put `charset' property on text in CODING->object according to
7026    the annotation data at CHARBUF.  CHARBUF is an array:
7027      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7028  */
7029
7030 static INLINE void
7031 produce_charset (coding, charbuf, pos)
7032      struct coding_system *coding;
7033      int *charbuf;
7034      EMACS_INT pos;
7035 {
7036   EMACS_INT from = pos - charbuf[2];
7037   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7038
7039   Fput_text_property (make_number (from), make_number (pos),
7040                       Qcharset, CHARSET_NAME (charset),
7041                       coding->dst_object);
7042 }
7043
7044
/* Number of elements first tried for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its length in
   CODING->charbuf_size.  The attempt starts at CHARBUF_SIZE elements
   and halves the size, while it is above 1024, each time alloca
   yields NULL.  If no buffer could be obtained, record
   CODING_RESULT_INSUFFICIENT_MEM and *return CODING->result from the
   calling function*.

   Because of the alloca and the embedded return, use this macro only
   at statement level in a function whose return type accepts
   CODING->result; the buffer is valid until that function returns.
   CODING is evaluated several times.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
7066
7067
7068 static void
7069 produce_annotation (coding, pos)
7070      struct coding_system *coding;
7071      EMACS_INT pos;
7072 {
7073   int *charbuf = coding->charbuf;
7074   int *charbuf_end = charbuf + coding->charbuf_used;
7075
7076   if (NILP (coding->dst_object))
7077     return;
7078
7079   while (charbuf < charbuf_end)
7080     {
7081       if (*charbuf >= 0)
7082         pos++, charbuf++;
7083       else
7084         {
7085           int len = -*charbuf;
7086
7087           if (len > 2)
7088             switch (charbuf[1])
7089               {
7090               case CODING_ANNOTATE_COMPOSITION_MASK:
7091                 produce_composition (coding, charbuf, pos);
7092                 break;
7093               case CODING_ANNOTATE_CHARSET_MASK:
7094                 produce_charset (coding, charbuf, pos);
7095                 break;
7096               }
7097           charbuf += len;
7098         }
7099     }
7100 }
7101
7102 /* Decode the data at CODING->src_object into CODING->dst_object.
7103    CODING->src_object is a buffer, a string, or nil.
7104    CODING->dst_object is a buffer.
7105
7106    If CODING->src_object is a buffer, it must be the current buffer.
7107    In this case, if CODING->src_pos is positive, it is a position of
7108    the source text in the buffer, otherwise, the source text is in the
7109    gap area of the buffer, and CODING->src_pos specifies the offset of
7110    the text from GPT (which must be the same as PT).  If this is the
7111    same buffer as CODING->dst_object, CODING->src_pos must be
7112    negative.
7113
7114    If CODING->src_object is a string, CODING->src_pos is an index to
7115    that string.
7116
7117    If CODING->src_object is nil, CODING->source must already point to
7118    the non-relocatable memory area.  In this case, CODING->src_pos is
7119    an offset from CODING->source.
7120
7121    The decoded data is inserted at the current point of the buffer
7122    CODING->dst_object.
7123 */
7124
7125 static int
7126 decode_coding (coding)
7127      struct coding_system *coding;
7128 {
7129   Lisp_Object attrs;
7130   Lisp_Object undo_list;
7131   Lisp_Object translation_table;
7132   struct ccl_spec cclspec;
7133   int carryover;
7134   int i;
7135
7136   if (BUFFERP (coding->src_object)
7137       && coding->src_pos > 0
7138       && coding->src_pos < GPT
7139       && coding->src_pos + coding->src_chars > GPT)
7140     move_gap_both (coding->src_pos, coding->src_pos_byte);
7141
7142   undo_list = Qt;
7143   if (BUFFERP (coding->dst_object))
7144     {
7145       if (current_buffer != XBUFFER (coding->dst_object))
7146         set_buffer_internal (XBUFFER (coding->dst_object));
7147       if (GPT != PT)
7148         move_gap_both (PT, PT_BYTE);
7149       undo_list = current_buffer->undo_list;
7150       current_buffer->undo_list = Qt;
7151     }
7152
7153   coding->consumed = coding->consumed_char = 0;
7154   coding->produced = coding->produced_char = 0;
7155   coding->chars_at_source = 0;
7156   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7157   coding->errors = 0;
7158
7159   ALLOC_CONVERSION_WORK_AREA (coding);
7160
7161   attrs = CODING_ID_ATTRS (coding->id);
7162   translation_table = get_translation_table (attrs, 0, NULL);
7163
7164   carryover = 0;
7165   if (coding->decoder == decode_coding_ccl)
7166     {
7167       coding->spec.ccl = &cclspec;
7168       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7169     }
7170   do
7171     {
7172       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7173
7174       coding_set_source (coding);
7175       coding->annotated = 0;
7176       coding->charbuf_used = carryover;
7177       (*(coding->decoder)) (coding);
7178       coding_set_destination (coding);
7179       carryover = produce_chars (coding, translation_table, 0);
7180       if (coding->annotated)
7181         produce_annotation (coding, pos);
7182       for (i = 0; i < carryover; i++)
7183         coding->charbuf[i]
7184           = coding->charbuf[coding->charbuf_used - carryover + i];
7185     }
7186   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7187          || (coding->consumed < coding->src_bytes
7188              && (coding->result == CODING_RESULT_SUCCESS
7189                  || coding->result == CODING_RESULT_INVALID_SRC)));
7190
7191   if (carryover > 0)
7192     {
7193       coding_set_destination (coding);
7194       coding->charbuf_used = carryover;
7195       produce_chars (coding, translation_table, 1);
7196     }
7197
7198   coding->carryover_bytes = 0;
7199   if (coding->consumed < coding->src_bytes)
7200     {
7201       int nbytes = coding->src_bytes - coding->consumed;
7202       const unsigned char *src;
7203
7204       coding_set_source (coding);
7205       coding_set_destination (coding);
7206       src = coding->source + coding->consumed;
7207
7208       if (coding->mode & CODING_MODE_LAST_BLOCK)
7209         {
7210           /* Flush out unprocessed data as binary chars.  We are sure
7211              that the number of data is less than the size of
7212              coding->charbuf.  */
7213           coding->charbuf_used = 0;
7214           coding->chars_at_source = 0;
7215
7216           while (nbytes-- > 0)
7217             {
7218               int c = *src++;
7219
7220               if (c & 0x80)
7221                 c = BYTE8_TO_CHAR (c);
7222               coding->charbuf[coding->charbuf_used++] = c;
7223             }
7224           produce_chars (coding, Qnil, 1);
7225         }
7226       else
7227         {
7228           /* Record unprocessed bytes in coding->carryover.  We are
7229              sure that the number of data is less than the size of
7230              coding->carryover.  */
7231           unsigned char *p = coding->carryover;
7232
7233           if (nbytes > sizeof coding->carryover)
7234             nbytes = sizeof coding->carryover;
7235           coding->carryover_bytes = nbytes;
7236           while (nbytes-- > 0)
7237             *p++ = *src++;
7238         }
7239       coding->consumed = coding->src_bytes;
7240     }
7241
7242   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7243       && !inhibit_eol_conversion)
7244     decode_eol (coding);
7245   if (BUFFERP (coding->dst_object))
7246     {
7247       current_buffer->undo_list = undo_list;
7248       record_insert (coding->dst_pos, coding->produced_char);
7249     }
7250   return coding->result;
7251 }
7252
7253
7254 /* Extract an annotation datum from a composition starting at POS and
7255    ending before LIMIT of CODING->src_object (buffer or string), store
7256    the data in BUF, set *STOP to a starting position of the next
7257    composition (if any) or to LIMIT, and return the address of the
7258    next element of BUF.
7259
7260    If such an annotation is not found, set *STOP to a starting
7261    position of a composition after POS (if any) or to LIMIT, and
7262    return BUF.  */
7263
7264 static INLINE int *
7265 handle_composition_annotation (pos, limit, coding, buf, stop)
7266      EMACS_INT pos, limit;
7267      struct coding_system *coding;
7268      int *buf;
7269      EMACS_INT *stop;
7270 {
7271   EMACS_INT start, end;
7272   Lisp_Object prop;
7273
7274   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7275       || end > limit)
7276     *stop = limit;
7277   else if (start > pos)
7278     *stop = start;
7279   else
7280     {
7281       if (start == pos)
7282         {
7283           /* We found a composition.  Store the corresponding
7284              annotation data in BUF.  */
7285           int *head = buf;
7286           enum composition_method method = COMPOSITION_METHOD (prop);
7287           int nchars = COMPOSITION_LENGTH (prop);
7288
7289           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7290           if (method != COMPOSITION_RELATIVE)
7291             {
7292               Lisp_Object components;
7293               int len, i, i_byte;
7294
7295               components = COMPOSITION_COMPONENTS (prop);
7296               if (VECTORP (components))
7297                 {
7298                   len = XVECTOR (components)->size;
7299                   for (i = 0; i < len; i++)
7300                     *buf++ = XINT (AREF (components, i));
7301                 }
7302               else if (STRINGP (components))
7303                 {
7304                   len = SCHARS (components);
7305                   i = i_byte = 0;
7306                   while (i < len)
7307                     {
7308                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7309                       buf++;
7310                     }
7311                 }
7312               else if (INTEGERP (components))
7313                 {
7314                   len = 1;
7315                   *buf++ = XINT (components);
7316                 }
7317               else if (CONSP (components))
7318                 {
7319                   for (len = 0; CONSP (components);
7320                        len++, components = XCDR (components))
7321                     *buf++ = XINT (XCAR (components));
7322                 }
7323               else
7324                 abort ();
7325               *head -= len;
7326             }
7327         }
7328
7329       if (find_composition (end, limit, &start, &end, &prop,
7330                             coding->src_object)
7331           && end <= limit)
7332         *stop = start;
7333       else
7334         *stop = limit;
7335     }
7336   return buf;
7337 }
7338
7339
7340 /* Extract an annotation datum from a text property `charset' at POS of
7341    CODING->src_object (buffer of string), store the data in BUF, set
7342    *STOP to the position where the value of `charset' property changes
7343    (limiting by LIMIT), and return the address of the next element of
7344    BUF.
7345
7346    If the property value is nil, set *STOP to the position where the
7347    property value is non-nil (limiting by LIMIT), and return BUF.  */
7348
7349 static INLINE int *
7350 handle_charset_annotation (pos, limit, coding, buf, stop)
7351      EMACS_INT pos, limit;
7352      struct coding_system *coding;
7353      int *buf;
7354      EMACS_INT *stop;
7355 {
7356   Lisp_Object val, next;
7357   int id;
7358
7359   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7360   if (! NILP (val) && CHARSETP (val))
7361     id = XINT (CHARSET_SYMBOL_ID (val));
7362   else
7363     id = -1;
7364   ADD_CHARSET_DATA (buf, 0, id);
7365   next = Fnext_single_property_change (make_number (pos), Qcharset,
7366                                        coding->src_object,
7367                                        make_number (limit));
7368   *stop = XINT (next);
7369   return buf;
7370 }
7371
7372
7373 static void
7374 consume_chars (coding, translation_table, max_lookup)
7375      struct coding_system *coding;
7376      Lisp_Object translation_table;
7377      int max_lookup;
7378 {
7379   int *buf = coding->charbuf;
7380   int *buf_end = coding->charbuf + coding->charbuf_size;
7381   const unsigned char *src = coding->source + coding->consumed;
7382   const unsigned char *src_end = coding->source + coding->src_bytes;
7383   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7384   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7385   int multibytep = coding->src_multibyte;
7386   Lisp_Object eol_type;
7387   int c;
7388   EMACS_INT stop, stop_composition, stop_charset;
7389   int *lookup_buf = NULL;
7390
7391   if (! NILP (translation_table))
7392     lookup_buf = alloca (sizeof (int) * max_lookup);
7393
7394   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7395   if (VECTORP (eol_type))
7396     eol_type = Qunix;
7397
7398   /* Note: composition handling is not yet implemented.  */
7399   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7400
7401   if (NILP (coding->src_object))
7402     stop = stop_composition = stop_charset = end_pos;
7403   else
7404     {
7405       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7406         stop = stop_composition = pos;
7407       else
7408         stop = stop_composition = end_pos;
7409       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7410         stop = stop_charset = pos;
7411       else
7412         stop_charset = end_pos;
7413     }
7414
7415   /* Compensate for CRLF and conversion.  */
7416   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7417   while (buf < buf_end)
7418     {
7419       Lisp_Object trans;
7420
7421       if (pos == stop)
7422         {
7423           if (pos == end_pos)
7424             break;
7425           if (pos == stop_composition)
7426             buf = handle_composition_annotation (pos, end_pos, coding,
7427                                                  buf, &stop_composition);
7428           if (pos == stop_charset)
7429             buf = handle_charset_annotation (pos, end_pos, coding,
7430                                              buf, &stop_charset);
7431           stop = (stop_composition < stop_charset
7432                   ? stop_composition : stop_charset);
7433         }
7434
7435       if (! multibytep)
7436         {
7437           EMACS_INT bytes;
7438
7439           if (coding->encoder == encode_coding_raw_text
7440               || coding->encoder == encode_coding_ccl)
7441             c = *src++, pos++;
7442           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7443             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7444           else
7445             c = BYTE8_TO_CHAR (*src), src++, pos++;
7446         }
7447       else
7448         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7449       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7450         c = '\n';
7451       if (! EQ (eol_type, Qunix))
7452         {
7453           if (c == '\n')
7454             {
7455               if (EQ (eol_type, Qdos))
7456                 *buf++ = '\r';
7457               else
7458                 c = '\r';
7459             }
7460         }
7461
7462       trans = Qnil;
7463       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7464       if (NILP (trans))
7465         *buf++ = c;
7466       else
7467         {
7468           int from_nchars = 1, to_nchars = 1;
7469           int *lookup_buf_end;
7470           const unsigned char *p = src;
7471           int i;
7472
7473           lookup_buf[0] = c;
7474           for (i = 1; i < max_lookup && p < src_end; i++)
7475             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7476           lookup_buf_end = lookup_buf + i;
7477           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7478           if (INTEGERP (trans))
7479             c = XINT (trans);
7480           else if (CONSP (trans))
7481             {
7482               from_nchars = ASIZE (XCAR (trans));
7483               trans = XCDR (trans);
7484               if (INTEGERP (trans))
7485                 c = XINT (trans);
7486               else
7487                 {
7488                   to_nchars = ASIZE (trans);
7489                   if (buf + to_nchars > buf_end)
7490                     break;
7491                   c = XINT (AREF (trans, 0));
7492                 }
7493             }
7494           else
7495             break;
7496           *buf++ = c;
7497           for (i = 1; i < to_nchars; i++)
7498             *buf++ = XINT (AREF (trans, i));
7499           for (i = 1; i < from_nchars; i++, pos++)
7500             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7501         }
7502     }
7503
7504   coding->consumed = src - coding->source;
7505   coding->consumed_char = pos - coding->src_pos;
7506   coding->charbuf_used = buf - coding->charbuf;
7507   coding->chars_at_source = 0;
7508 }
7509
7510
7511 /* Encode the text at CODING->src_object into CODING->dst_object.
7512    CODING->src_object is a buffer or a string.
7513    CODING->dst_object is a buffer or nil.
7514
7515    If CODING->src_object is a buffer, it must be the current buffer.
7516    In this case, if CODING->src_pos is positive, it is a position of
7517    the source text in the buffer, otherwise. the source text is in the
7518    gap area of the buffer, and coding->src_pos specifies the offset of
7519    the text from GPT (which must be the same as PT).  If this is the
7520    same buffer as CODING->dst_object, CODING->src_pos must be
7521    negative and CODING should not have `pre-write-conversion'.
7522
7523    If CODING->src_object is a string, CODING should not have
7524    `pre-write-conversion'.
7525
7526    If CODING->dst_object is a buffer, the encoded data is inserted at
7527    the current point of that buffer.
7528
7529    If CODING->dst_object is nil, the encoded data is placed at the
7530    memory area specified by CODING->destination.  */
7531
7532 static int
7533 encode_coding (coding)
7534      struct coding_system *coding;
7535 {
7536   Lisp_Object attrs;
7537   Lisp_Object translation_table;
7538   int max_lookup;
7539   struct ccl_spec cclspec;
7540
7541   attrs = CODING_ID_ATTRS (coding->id);
7542   if (coding->encoder == encode_coding_raw_text)
7543     translation_table = Qnil, max_lookup = 0;
7544   else
7545     translation_table = get_translation_table (attrs, 1, &max_lookup);
7546
7547   if (BUFFERP (coding->dst_object))
7548     {
7549       set_buffer_internal (XBUFFER (coding->dst_object));
7550       coding->dst_multibyte
7551         = ! NILP (current_buffer->enable_multibyte_characters);
7552     }
7553
7554   coding->consumed = coding->consumed_char = 0;
7555   coding->produced = coding->produced_char = 0;
7556   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7557   coding->errors = 0;
7558
7559   ALLOC_CONVERSION_WORK_AREA (coding);
7560
7561   if (coding->encoder == encode_coding_ccl)
7562     {
7563       coding->spec.ccl = &cclspec;
7564       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7565     }
7566   do {
7567     coding_set_source (coding);
7568     consume_chars (coding, translation_table, max_lookup);
7569     coding_set_destination (coding);
7570     (*(coding->encoder)) (coding);
7571   } while (coding->consumed_char < coding->src_chars);
7572
7573   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7574     insert_from_gap (coding->produced_char, coding->produced);
7575
7576   return (coding->result);
7577 }
7578
7579
/* Name (or base name) of the work buffer used for code conversion.
   make_conversion_work_buffer creates the reused buffer under this
   exact name and derives the names of additional work buffers from it
   with Fgenerate_new_buffer_name.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every request, and
   code_conversion_restore resets it to 0 when the reused buffer is
   released.  */
static int reused_workbuf_in_use;
7592
7593
7594 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7595    multibyteness of returning buffer.  */
7596
7597 static Lisp_Object
7598 make_conversion_work_buffer (multibyte)
7599      int multibyte;
7600 {
7601   Lisp_Object name, workbuf;
7602   struct buffer *current;
7603
7604   if (reused_workbuf_in_use++)
7605     {
7606       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7607       workbuf = Fget_buffer_create (name);
7608     }
7609   else
7610     {
7611       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7612         Vcode_conversion_reused_workbuf
7613           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7614       workbuf = Vcode_conversion_reused_workbuf;
7615     }
7616   current = current_buffer;
7617   set_buffer_internal (XBUFFER (workbuf));
7618   /* We can't allow modification hooks to run in the work buffer.  For
7619      instance, directory_files_internal assumes that file decoding
7620      doesn't compile new regexps.  */
7621   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7622   Ferase_buffer ();
7623   current_buffer->undo_list = Qt;
7624   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7625   set_buffer_internal (current);
7626   return workbuf;
7627 }
7628
7629
7630 static Lisp_Object
7631 code_conversion_restore (arg)
7632      Lisp_Object arg;
7633 {
7634   Lisp_Object current, workbuf;
7635   struct gcpro gcpro1;
7636
7637   GCPRO1 (arg);
7638   current = XCAR (arg);
7639   workbuf = XCDR (arg);
7640   if (! NILP (workbuf))
7641     {
7642       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7643         reused_workbuf_in_use = 0;
7644       else if (! NILP (Fbuffer_live_p (workbuf)))
7645         Fkill_buffer (workbuf);
7646     }
7647   set_buffer_internal (XBUFFER (current));
7648   UNGCPRO;
7649   return Qnil;
7650 }
7651
7652 Lisp_Object
7653 code_conversion_save (with_work_buf, multibyte)
7654      int with_work_buf, multibyte;
7655 {
7656   Lisp_Object workbuf = Qnil;
7657
7658   if (with_work_buf)
7659     workbuf = make_conversion_work_buffer (multibyte);
7660   record_unwind_protect (code_conversion_restore,
7661                          Fcons (Fcurrent_buffer (), workbuf));
7662   return workbuf;
7663 }
7664
7665 int
7666 decode_coding_gap (coding, chars, bytes)
7667      struct coding_system *coding;
7668      EMACS_INT chars, bytes;
7669 {
7670   int count = specpdl_ptr - specpdl;
7671   Lisp_Object attrs;
7672
7673   code_conversion_save (0, 0);
7674
7675   coding->src_object = Fcurrent_buffer ();
7676   coding->src_chars = chars;
7677   coding->src_bytes = bytes;
7678   coding->src_pos = -chars;
7679   coding->src_pos_byte = -bytes;
7680   coding->src_multibyte = chars < bytes;
7681   coding->dst_object = coding->src_object;
7682   coding->dst_pos = PT;
7683   coding->dst_pos_byte = PT_BYTE;
7684   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7685
7686   if (CODING_REQUIRE_DETECTION (coding))
7687     detect_coding (coding);
7688
7689   coding->mode |= CODING_MODE_LAST_BLOCK;
7690   current_buffer->text->inhibit_shrinking = 1;
7691   decode_coding (coding);
7692   current_buffer->text->inhibit_shrinking = 0;
7693
7694   attrs = CODING_ID_ATTRS (coding->id);
7695   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7696     {
7697       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7698       Lisp_Object val;
7699
7700       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7701       val = call1 (CODING_ATTR_POST_READ (attrs),
7702                    make_number (coding->produced_char));
7703       CHECK_NATNUM (val);
7704       coding->produced_char += Z - prev_Z;
7705       coding->produced += Z_BYTE - prev_Z_BYTE;
7706     }
7707
7708   unbind_to (count, Qnil);
7709   return coding->result;
7710 }
7711
7712 int
7713 encode_coding_gap (coding, chars, bytes)
7714      struct coding_system *coding;
7715      EMACS_INT chars, bytes;
7716 {
7717   int count = specpdl_ptr - specpdl;
7718
7719   code_conversion_save (0, 0);
7720
7721   coding->src_object = Fcurrent_buffer ();
7722   coding->src_chars = chars;
7723   coding->src_bytes = bytes;
7724   coding->src_pos = -chars;
7725   coding->src_pos_byte = -bytes;
7726   coding->src_multibyte = chars < bytes;
7727   coding->dst_object = coding->src_object;
7728   coding->dst_pos = PT;
7729   coding->dst_pos_byte = PT_BYTE;
7730
7731   encode_coding (coding);
7732
7733   unbind_to (count, Qnil);
7734   return coding->result;
7735 }
7736
7737
7738 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7739    SRC_OBJECT into DST_OBJECT by coding context CODING.
7740
7741    SRC_OBJECT is a buffer, a string, or Qnil.
7742
7743    If it is a buffer, the text is at point of the buffer.  FROM and TO
7744    are positions in the buffer.
7745
7746    If it is a string, the text is at the beginning of the string.
7747    FROM and TO are indices to the string.
7748
7749    If it is nil, the text is at coding->source.  FROM and TO are
7750    indices to coding->source.
7751
7752    DST_OBJECT is a buffer, Qt, or Qnil.
7753
7754    If it is a buffer, the decoded text is inserted at point of the
7755    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7756    is deleted.
7757
7758    If it is Qt, a string is made from the decoded text, and
7759    set in CODING->dst_object.
7760
7761    If it is Qnil, the decoded text is stored at CODING->destination.
7762    The caller must allocate CODING->dst_bytes bytes at
7763    CODING->destination by xmalloc.  If the decoded text is longer than
7764    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7765  */
7766
7767 void
7768 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7769                       dst_object)
7770      struct coding_system *coding;
7771      Lisp_Object src_object;
7772      EMACS_INT from, from_byte, to, to_byte;
7773      Lisp_Object dst_object;
7774 {
7775   int count = specpdl_ptr - specpdl;
7776   unsigned char *destination;
7777   EMACS_INT dst_bytes;
7778   EMACS_INT chars = to - from;
7779   EMACS_INT bytes = to_byte - from_byte;
7780   Lisp_Object attrs;
7781   int saved_pt = -1, saved_pt_byte;
7782   int need_marker_adjustment = 0;
7783   Lisp_Object old_deactivate_mark;
7784
7785   old_deactivate_mark = Vdeactivate_mark;
7786
7787   if (NILP (dst_object))
7788     {
7789       destination = coding->destination;
7790       dst_bytes = coding->dst_bytes;
7791     }
7792
7793   coding->src_object = src_object;
7794   coding->src_chars = chars;
7795   coding->src_bytes = bytes;
7796   coding->src_multibyte = chars < bytes;
7797
7798   if (STRINGP (src_object))
7799     {
7800       coding->src_pos = from;
7801       coding->src_pos_byte = from_byte;
7802     }
7803   else if (BUFFERP (src_object))
7804     {
7805       set_buffer_internal (XBUFFER (src_object));
7806       if (from != GPT)
7807         move_gap_both (from, from_byte);
7808       if (EQ (src_object, dst_object))
7809         {
7810           struct Lisp_Marker *tail;
7811
7812           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7813             {
7814               tail->need_adjustment
7815                 = tail->charpos == (tail->insertion_type ? from : to);
7816               need_marker_adjustment |= tail->need_adjustment;
7817             }
7818           saved_pt = PT, saved_pt_byte = PT_BYTE;
7819           TEMP_SET_PT_BOTH (from, from_byte);
7820           current_buffer->text->inhibit_shrinking = 1;
7821           del_range_both (from, from_byte, to, to_byte, 1);
7822           coding->src_pos = -chars;
7823           coding->src_pos_byte = -bytes;
7824         }
7825       else
7826         {
7827           coding->src_pos = from;
7828           coding->src_pos_byte = from_byte;
7829         }
7830     }
7831
7832   if (CODING_REQUIRE_DETECTION (coding))
7833     detect_coding (coding);
7834   attrs = CODING_ID_ATTRS (coding->id);
7835
7836   if (EQ (dst_object, Qt)
7837       || (! NILP (CODING_ATTR_POST_READ (attrs))
7838           && NILP (dst_object)))
7839     {
7840       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7841       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7842       coding->dst_pos = BEG;
7843       coding->dst_pos_byte = BEG_BYTE;
7844     }
7845   else if (BUFFERP (dst_object))
7846     {
7847       code_conversion_save (0, 0);
7848       coding->dst_object = dst_object;
7849       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7850       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7851       coding->dst_multibyte
7852         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7853     }
7854   else
7855     {
7856       code_conversion_save (0, 0);
7857       coding->dst_object = Qnil;
7858       /* Most callers presume this will return a multibyte result, and they
7859          won't use `binary' or `raw-text' anyway, so let's not worry about
7860          CODING_FOR_UNIBYTE.  */
7861       coding->dst_multibyte = 1;
7862     }
7863
7864   decode_coding (coding);
7865
7866   if (BUFFERP (coding->dst_object))
7867     set_buffer_internal (XBUFFER (coding->dst_object));
7868
7869   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7870     {
7871       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7872       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7873       Lisp_Object val;
7874
7875       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7876       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7877               old_deactivate_mark);
7878       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7879                         make_number (coding->produced_char));
7880       UNGCPRO;
7881       CHECK_NATNUM (val);
7882       coding->produced_char += Z - prev_Z;
7883       coding->produced += Z_BYTE - prev_Z_BYTE;
7884     }
7885
7886   if (EQ (dst_object, Qt))
7887     {
7888       coding->dst_object = Fbuffer_string ();
7889     }
7890   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7891     {
7892       set_buffer_internal (XBUFFER (coding->dst_object));
7893       if (dst_bytes < coding->produced)
7894         {
7895           destination = xrealloc (destination, coding->produced);
7896           if (! destination)
7897             {
7898               record_conversion_result (coding,
7899                                         CODING_RESULT_INSUFFICIENT_MEM);
7900               unbind_to (count, Qnil);
7901               return;
7902             }
7903           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7904             move_gap_both (BEGV, BEGV_BYTE);
7905           bcopy (BEGV_ADDR, destination, coding->produced);
7906           coding->destination = destination;
7907         }
7908     }
7909
7910   if (saved_pt >= 0)
7911     {
7912       /* This is the case of:
7913          (BUFFERP (src_object) && EQ (src_object, dst_object))
7914          As we have moved PT while replacing the original buffer
7915          contents, we must recover it now.  */
7916       set_buffer_internal (XBUFFER (src_object));
7917       current_buffer->text->inhibit_shrinking = 0;
7918       if (saved_pt < from)
7919         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7920       else if (saved_pt < from + chars)
7921         TEMP_SET_PT_BOTH (from, from_byte);
7922       else if (! NILP (current_buffer->enable_multibyte_characters))
7923         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7924                           saved_pt_byte + (coding->produced - bytes));
7925       else
7926         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7927                           saved_pt_byte + (coding->produced - bytes));
7928
7929       if (need_marker_adjustment)
7930         {
7931           struct Lisp_Marker *tail;
7932
7933           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7934             if (tail->need_adjustment)
7935               {
7936                 tail->need_adjustment = 0;
7937                 if (tail->insertion_type)
7938                   {
7939                     tail->bytepos = from_byte;
7940                     tail->charpos = from;
7941                   }
7942                 else
7943                   {
7944                     tail->bytepos = from_byte + coding->produced;
7945                     tail->charpos
7946                       = (NILP (current_buffer->enable_multibyte_characters)
7947                          ? tail->bytepos : from + coding->produced_char);
7948                   }
7949               }
7950         }
7951     }
7952
7953   Vdeactivate_mark = old_deactivate_mark;
7954   unbind_to (count, coding->dst_object);
7955 }
7956 /* Encode the text of SRC_OBJECT (normally a string or a buffer) from
7957    FROM/FROM_BYTE to TO/TO_BYTE using CODING.  If DST_OBJECT is a
7958    buffer, it is the destination; when it is also SRC_OBJECT, the
7959    source text is deleted and its encoding is targeted at FROM.  If
7960    DST_OBJECT is t, CODING->dst_object receives the result as a string.  */
7961 void
7962 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7963                       dst_object)
7964      struct coding_system *coding;
7965      Lisp_Object src_object;
7966      EMACS_INT from, from_byte, to, to_byte;
7967      Lisp_Object dst_object;
7968 {
7969   int count = SPECPDL_INDEX ();
7970   EMACS_INT chars = to - from;
7971   EMACS_INT bytes = to_byte - from_byte;
7972   Lisp_Object attrs;
7973   EMACS_INT saved_pt = -1, saved_pt_byte;  /* Positions may exceed int.  */
7974   int need_marker_adjustment = 0, kill_src_buffer = 0;
7975   Lisp_Object old_deactivate_mark = Vdeactivate_mark;
7976
7977   coding->src_object = src_object;
7978   coding->src_chars = chars;
7979   coding->src_bytes = bytes;
7980   coding->src_multibyte = chars < bytes;
7981
7982   attrs = CODING_ID_ATTRS (coding->id);
7983   /* For in-place encoding, flag the markers to reposition afterwards.  */
7984   if (EQ (src_object, dst_object))
7985     {
7986       struct Lisp_Marker *tail;
7987
7988       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7989         {
7990           tail->need_adjustment
7991             = tail->charpos == (tail->insertion_type ? from : to);
7992           need_marker_adjustment |= tail->need_adjustment;
7993         }
7994     }
7995   /* Let the pre-write function preprocess a copy of the source text.  */
7996   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7997     {
7998       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7999       set_buffer_internal (XBUFFER (coding->src_object));
8000       if (STRINGP (src_object))
8001         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8002       else if (BUFFERP (src_object))
8003         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8004       else
8005         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
8006
8007       if (EQ (src_object, dst_object))
8008         {
8009           set_buffer_internal (XBUFFER (src_object));
8010           saved_pt = PT, saved_pt_byte = PT_BYTE;
8011           del_range_both (from, from_byte, to, to_byte, 1);
8012           set_buffer_internal (XBUFFER (coding->src_object));
8013         }
8014       /* Call the pre-write function on the whole copy (BEG to Z).  */
8015       {
8016         Lisp_Object args[3];
8017         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8018
8019         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8020                 old_deactivate_mark);
8021         args[0] = CODING_ATTR_PRE_WRITE (attrs);
8022         args[1] = make_number (BEG);
8023         args[2] = make_number (Z);
8024         safe_call (3, args);
8025         UNGCPRO;
8026       }
8027       if (XBUFFER (coding->src_object) != current_buffer)
8028         kill_src_buffer = 1;  /* The function left another buffer current.  */
8029       coding->src_object = Fcurrent_buffer ();
8030       if (BEG != GPT)
8031         move_gap_both (BEG, BEG_BYTE);
8032       coding->src_chars = Z - BEG;
8033       coding->src_bytes = Z_BYTE - BEG_BYTE;
8034       coding->src_pos = BEG;
8035       coding->src_pos_byte = BEG_BYTE;
8036       coding->src_multibyte = Z < Z_BYTE;
8037     }
8038   else if (STRINGP (src_object))
8039     {
8040       code_conversion_save (0, 0);
8041       coding->src_pos = from;
8042       coding->src_pos_byte = from_byte;
8043     }
8044   else if (BUFFERP (src_object))
8045     {
8046       code_conversion_save (0, 0);
8047       set_buffer_internal (XBUFFER (src_object));
8048       if (EQ (src_object, dst_object))
8049         {
8050           saved_pt = PT, saved_pt_byte = PT_BYTE;
8051           coding->src_object = del_range_1 (from, to, 1, 1);
8052           coding->src_pos = 0;
8053           coding->src_pos_byte = 0;
8054         }
8055       else
8056         {
8057           if (from < GPT && to >= GPT)
8058             move_gap_both (from, from_byte);
8059           coding->src_pos = from;
8060           coding->src_pos_byte = from_byte;
8061         }
8062     }
8063   else
8064     code_conversion_save (0, 0);
8065   /* Set up the destination fields of CODING.  */
8066   if (BUFFERP (dst_object))
8067     {
8068       coding->dst_object = dst_object;
8069       if (EQ (src_object, dst_object))
8070         {
8071           coding->dst_pos = from;
8072           coding->dst_pos_byte = from_byte;
8073         }
8074       else
8075         {
8076           struct buffer *current = current_buffer;
8077
8078           set_buffer_temp (XBUFFER (dst_object));
8079           coding->dst_pos = PT;
8080           coding->dst_pos_byte = PT_BYTE;
8081           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8082           set_buffer_temp (current);
8083         }
8084       coding->dst_multibyte
8085         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8086     }
8087   else if (EQ (dst_object, Qt))
8088     {
8089       coding->dst_object = Qnil;
8090       coding->dst_bytes = coding->src_chars;
8091       if (coding->dst_bytes == 0)
8092         coding->dst_bytes = 1;  /* Never request a zero-byte block.  */
8093       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8094       coding->dst_multibyte = 0;
8095     }
8096   else
8097     {
8098       coding->dst_object = Qnil;
8099       coding->dst_multibyte = 0;
8100     }
8101
8102   encode_coding (coding);
8103   /* For t, hand the result back as a string in CODING->dst_object.  */
8104   if (EQ (dst_object, Qt))
8105     {
8106       if (BUFFERP (coding->dst_object))
8107         coding->dst_object = Fbuffer_string ();
8108       else
8109         {
8110           coding->dst_object
8111             = make_unibyte_string ((char *) coding->destination,
8112                                    coding->produced);
8113           xfree (coding->destination);
8114         }
8115     }
8116
8117   if (saved_pt >= 0)
8118     {
8119       /* This is the case of:
8120          (BUFFERP (src_object) && EQ (src_object, dst_object))
8121          As we have moved PT while replacing the original buffer
8122          contents, we must recover it now.  */
8123       set_buffer_internal (XBUFFER (src_object));
8124       if (saved_pt < from)
8125         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8126       else if (saved_pt < from + chars)
8127         TEMP_SET_PT_BOTH (from, from_byte);
8128       else if (! NILP (current_buffer->enable_multibyte_characters))
8129         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8130                           saved_pt_byte + (coding->produced - bytes));
8131       else
8132         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8133                           saved_pt_byte + (coding->produced - bytes));
8134       /* Reposition the markers flagged before the text was replaced.  */
8135       if (need_marker_adjustment)
8136         {
8137           struct Lisp_Marker *tail;
8138
8139           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8140             if (tail->need_adjustment)
8141               {
8142                 tail->need_adjustment = 0;
8143                 if (tail->insertion_type)
8144                   {
8145                     tail->bytepos = from_byte;
8146                     tail->charpos = from;
8147                   }
8148                 else
8149                   {
8150                     tail->bytepos = from_byte + coding->produced;
8151                     tail->charpos
8152                       = (NILP (current_buffer->enable_multibyte_characters)
8153                          ? tail->bytepos : from + coding->produced_char);
8154                   }
8155               }
8156         }
8157     }
8158
8159   if (kill_src_buffer)
8160     Fkill_buffer (coding->src_object);
8161
8162   Vdeactivate_mark = old_deactivate_mark;
8163   unbind_to (count, Qnil);
8164 }
8165 /* Return the name of the coding system registered for the coding
8166    category that comes first in coding_priorities.  */
8167 Lisp_Object
8168 preferred_coding_system ()
8169 {
8170   enum coding_category top = coding_priorities[0];
8171
8172   return CODING_ID_NAME (coding_categories[top].id);
8173 }
8174
8175 \f
8176 #ifdef emacs
8177 /*** 11. Emacs Lisp library functions ***/
8178
8179 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8180        doc: /* Return t if OBJECT is nil or a coding-system.
8181 See the documentation of `define-coding-system' for information
8182 about coding-system objects.  */)
8183      (object)
8184      Lisp_Object object;
8185 {
8186   /* nil and anything with a valid coding system id qualify at once.  */
8187   if (NILP (object) || CODING_SYSTEM_ID (object) >= 0)
8188     return Qt;
8189   /* Otherwise only a symbol with a non-nil define-form property does.  */
8190   return ((SYMBOLP (object)
8191            && ! NILP (Fget (object, Qcoding_system_define_form)))
8192           ? Qt : Qnil);
8193 }
8194
8195 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8196        Sread_non_nil_coding_system, 1, 1, 0,
8197        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8198      (prompt)
8199      Lisp_Object prompt;
8200 {
8201   Lisp_Object input;
8202
8203   /* Prompt again for as long as the input is the empty string.  */
8204   do
8205     input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt,
8206                               Qnil, Qcoding_system_history, Qnil, Qnil);
8207   while (SCHARS (input) == 0);
8208   return Fintern (input, Qnil);
8209 }
8210
8211 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8212        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8213 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8214 Ignores case when completing coding systems (all Emacs coding systems
8215 are lower-case).  */)
8216      (prompt, default_coding_system)
8217      Lisp_Object prompt, default_coding_system;
8218 {
8219   int count = SPECPDL_INDEX ();
8220   Lisp_Object input, default_name = default_coding_system;
8221
8222   /* Hand the default to the reader as a name string, not a symbol.  */
8223   if (SYMBOLP (default_name))
8224     default_name = SYMBOL_NAME (default_name);
8225   specbind (Qcompletion_ignore_case, Qt);
8226   input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8227                             Qcoding_system_history, default_name, Qnil);
8228   unbind_to (count, Qnil);
8229   return SCHARS (input) != 0 ? Fintern (input, Qnil) : Qnil;
8230 }
8231
8232 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8233        1, 1, 0,
8234        doc: /* Check validity of CODING-SYSTEM.
8235 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8236 It is valid if it is nil or a symbol defined as a coding system by the
8237 function `define-coding-system'.  */)
8238   (coding_system)
8239      Lisp_Object coding_system;
8240 {
8241   Lisp_Object pending = Fget (coding_system, Qcoding_system_define_form);
8242
8243   /* Evaluate a pending definition form, clearing the property first.  */
8244   if (! NILP (pending))
8245     {
8246       Fput (coding_system, Qcoding_system_define_form, Qnil);
8247       safe_eval (pending);
8248     }
8249   if (NILP (Fcoding_system_p (coding_system)))
8250     xsignal1 (Qcoding_system_error, coding_system);
8251   return coding_system;
8252 }
8253
8254 \f
8255 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8256    HIGHEST is nonzero, return the coding system of the highest
8257    priority among the detected coding systems.  Otherwise return a
8258    list of detected coding systems sorted by their priorities.  If
8259    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8260    multibyte form but contain only ASCII and eight-bit chars.
8261    Otherwise, the bytes are raw bytes.
8262
8263    CODING-SYSTEM controls the detection as below:
8264
8265    If it is nil, detect both text-format and eol-format.  If the
8266    text-format part of CODING-SYSTEM is already specified
8267    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8268    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8269    detect only text-format.  */
8270
8271 Lisp_Object
8272 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8273                       coding_system)
8274      const unsigned char *src;
8275      EMACS_INT src_chars, src_bytes;
8276      int highest;
8277      int multibytep;
8278      Lisp_Object coding_system;
8279 {
8280   const unsigned char *src_end = src + src_bytes;
8281   Lisp_Object attrs, eol_type;
8282   Lisp_Object val = Qnil;
8283   struct coding_system coding;
8284   int id;
8285   struct coding_detection_info detect_info;
8286   enum coding_category base_category;
8287   int null_byte_found = 0, eight_bit_found = 0;
8288
8289   if (NILP (coding_system))
8290     coding_system = Qundecided;
8291   setup_coding_system (coding_system, &coding);
8292   attrs = CODING_ID_ATTRS (coding.id);
8293   eol_type = CODING_ID_EOL_TYPE (coding.id);
8294   coding_system = CODING_ATTR_BASE_NAME (attrs);
8295   /* Describe the input text to the detectors through CODING.  */
8296   coding.source = src;
8297   coding.src_chars = src_chars;
8298   coding.src_bytes = src_bytes;
8299   coding.src_multibyte = multibytep;
8300   coding.consumed = 0;
8301   coding.mode |= CODING_MODE_LAST_BLOCK;
8302   coding.head_ascii = 0;
8303
8304   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8305
8306   /* At first, detect text-format if necessary.  */
8307   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8308   if (base_category == coding_category_undecided)
8309     {
8310       enum coding_category category;
8311       struct coding_system *this;
8312       int c, i;
8313
8314       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8315       for (; src < src_end; src++)
8316         {
8317           c = *src;
8318           if (c & 0x80)
8319             {
8320               eight_bit_found = 1;
8321               if (null_byte_found)
8322                 break;
8323             }
8324           else if (c < 0x20)
8325             {
8326               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8327                   && ! inhibit_iso_escape_detection
8328                   && ! detect_info.checked)
8329                 {
8330                   if (detect_coding_iso_2022 (&coding, &detect_info))
8331                     {
8332                       /* We have scanned the whole data.  */
8333                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8334                         {
8335                           /* We didn't find an 8-bit code.  We may
8336                              have found a null-byte, but it's very
8337                              rare that a binary file conform to
8338                              ISO-2022.  */
8339                           src = src_end;
8340                           coding.head_ascii = src - coding.source;
8341                         }
8342                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8343                       break;
8344                     }
8345                 }
8346               else if (! c && !inhibit_null_byte_detection)
8347                 {
8348                   null_byte_found = 1;
8349                   if (eight_bit_found)
8350                     break;
8351                 }
8352               if (! eight_bit_found)
8353                 coding.head_ascii++;
8354             }
8355           else if (! eight_bit_found)
8356             coding.head_ascii++;
8357         }
8358       /* Consult the detectors unless all bytes were ASCII and nothing was found.  */
8359       if (null_byte_found || eight_bit_found
8360           || coding.head_ascii < coding.src_bytes
8361           || detect_info.found)
8362         {
8363           if (coding.head_ascii == coding.src_bytes)
8364             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8365             for (i = 0; i < coding_category_raw_text; i++)
8366               {
8367                 category = coding_priorities[i];
8368                 this = coding_categories + category;
8369                 if (detect_info.found & (1 << category))
8370                   break;
8371               }
8372           else
8373             {
8374               if (null_byte_found)
8375                 {
8376                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8377                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8378                 }
8379               for (i = 0; i < coding_category_raw_text; i++)
8380                 {
8381                   category = coding_priorities[i];
8382                   this = coding_categories + category;
8383
8384                   if (this->id < 0)
8385                     {
8386                       /* No coding system of this category is defined.  */
8387                       detect_info.rejected |= (1 << category);
8388                     }
8389                   else if (category >= coding_category_raw_text)
8390                     continue;
8391                   else if (detect_info.checked & (1 << category))
8392                     {
8393                       if (highest
8394                           && (detect_info.found & (1 << category)))
8395                         break;
8396                     }
8397                   else if ((*(this->detector)) (&coding, &detect_info)
8398                            && highest
8399                            && (detect_info.found & (1 << category)))
8400                     {
8401                       if (category == coding_category_utf_16_auto)
8402                         {
8403                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8404                             category = coding_category_utf_16_le;
8405                           else
8406                             category = coding_category_utf_16_be;
8407                         }
8408                       break;
8409                     }
8410                 }
8411             }
8412         }
8413       /* Turn the detection result into VAL, a list of coding system ids.  */
8414       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8415           || null_byte_found)
8416         {
8417           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8418           id = CODING_SYSTEM_ID (Qno_conversion);
8419           val = Fcons (make_number (id), Qnil);
8420         }
8421       else if (! detect_info.rejected && ! detect_info.found)
8422         {
8423           detect_info.found = CATEGORY_MASK_ANY;
8424           id = coding_categories[coding_category_undecided].id;
8425           val = Fcons (make_number (id), Qnil);
8426         }
8427       else if (highest)
8428         {
8429           if (detect_info.found)
8430             {
8431               detect_info.found = 1 << category;
8432               val = Fcons (make_number (this->id), Qnil);
8433             }
8434           else
8435             for (i = 0; i < coding_category_raw_text; i++)
8436               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8437                 {
8438                   detect_info.found = 1 << coding_priorities[i];
8439                   id = coding_categories[coding_priorities[i]].id;
8440                   val = Fcons (make_number (id), Qnil);
8441                   break;
8442                 }
8443         }
8444       else
8445         {
8446           int mask = detect_info.rejected | detect_info.found;
8447           int found = 0;
8448           /* Unrejected categories first, then found ones consed in front.  */
8449           for (i = coding_category_raw_text - 1; i >= 0; i--)
8450             {
8451               category = coding_priorities[i];
8452               if (! (mask & (1 << category)))
8453                 {
8454                   found |= 1 << category;
8455                   id = coding_categories[category].id;
8456                   if (id >= 0)
8457                     val = Fcons (make_number (id), val);
8458                 }
8459             }
8460           for (i = coding_category_raw_text - 1; i >= 0; i--)
8461             {
8462               category = coding_priorities[i];
8463               if (detect_info.found & (1 << category))
8464                 {
8465                   id = coding_categories[category].id;
8466                   val = Fcons (make_number (id), val);
8467                 }
8468             }
8469           detect_info.found |= found;
8470         }
8471     }
8472   else if (base_category == coding_category_utf_8_auto)
8473     {
8474       if (detect_coding_utf_8 (&coding, &detect_info))
8475         {
8476           struct coding_system *this;
8477
8478           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8479             this = coding_categories + coding_category_utf_8_sig;
8480           else
8481             this = coding_categories + coding_category_utf_8_nosig;
8482           val = Fcons (make_number (this->id), Qnil);
8483         }
8484     }
8485   else if (base_category == coding_category_utf_16_auto)
8486     {
8487       if (detect_coding_utf_16 (&coding, &detect_info))
8488         {
8489           struct coding_system *this;
8490
8491           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8492             this = coding_categories + coding_category_utf_16_le;
8493           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8494             this = coding_categories + coding_category_utf_16_be;
8495           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8496             this = coding_categories + coding_category_utf_16_be_nosig;
8497           else
8498             this = coding_categories + coding_category_utf_16_le_nosig;
8499           val = Fcons (make_number (this->id), Qnil);
8500         }
8501     }
8502   else
8503     {
8504       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8505       val = Fcons (make_number (coding.id), Qnil);
8506     }
8507
8508   /* Then, detect eol-format if necessary.  */
8509   {
8510     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8511     Lisp_Object tail;
8512
8513     if (VECTORP (eol_type))
8514       {
8515         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8516           {
8517             if (null_byte_found)
8518               normal_eol = EOL_SEEN_LF;
8519             else
8520               normal_eol = detect_eol (coding.source, src_bytes,
8521                                        coding_category_raw_text);
8522           }
8523         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8524                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8525           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8526                                       coding_category_utf_16_be);
8527         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8528                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8529           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8530                                       coding_category_utf_16_le);
8531       }
8532     else
8533       {
8534         if (EQ (eol_type, Qunix))
8535           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8536         else if (EQ (eol_type, Qdos))
8537           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8538         else
8539           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8540       }
8541     /* Replace each id in VAL by a name, eol-specific when possible.  */
8542     for (tail = val; CONSP (tail); tail = XCDR (tail))
8543       {
8544         enum coding_category category;
8545         int this_eol;
8546
8547         id = XINT (XCAR (tail));
8548         attrs = CODING_ID_ATTRS (id);
8549         category = XINT (CODING_ATTR_CATEGORY (attrs));
8550         eol_type = CODING_ID_EOL_TYPE (id);
8551         if (VECTORP (eol_type))
8552           {
8553             if (category == coding_category_utf_16_be
8554                 || category == coding_category_utf_16_be_nosig)
8555               this_eol = utf_16_be_eol;
8556             else if (category == coding_category_utf_16_le
8557                      || category == coding_category_utf_16_le_nosig)
8558               this_eol = utf_16_le_eol;
8559             else
8560               this_eol = normal_eol;
8561
8562             if (this_eol == EOL_SEEN_LF)
8563               XSETCAR (tail, AREF (eol_type, 0));
8564             else if (this_eol == EOL_SEEN_CRLF)
8565               XSETCAR (tail, AREF (eol_type, 1));
8566             else if (this_eol == EOL_SEEN_CR)
8567               XSETCAR (tail, AREF (eol_type, 2));
8568             else
8569               XSETCAR (tail, CODING_ID_NAME (id));
8570           }
8571         else
8572           XSETCAR (tail, CODING_ID_NAME (id));
8573       }
8574   }
8575
8576   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8577 }
8578
8579
8580 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8581        2, 3, 0,
8582        doc: /* Detect coding system of the text in the region between START and END.
8583 Return a list of possible coding systems ordered by priority.
8584 The coding systems to try and their priorities follows what
8585 the function `coding-system-priority-list' (which see) returns.
8586
8587 If only ASCII characters are found (except for such ISO-2022 control
8588 characters as ESC), it returns a list of single element `undecided'
8589 or its subsidiary coding system according to a detected end-of-line
8590 format.
8591
8592 If optional argument HIGHEST is non-nil, return the coding system of
8593 highest priority.  */)
8594      (start, end, highest)
8595      Lisp_Object start, end, highest;
8596 {
8597   /* EMACS_INT, not int: positions may exceed the range of int.  */
8598   EMACS_INT from, to, from_byte, to_byte;
8599
8600   CHECK_NUMBER_COERCE_MARKER (start);
8601   CHECK_NUMBER_COERCE_MARKER (end);
8602
8603   validate_region (&start, &end);
8604   from = XINT (start), to = XINT (end);
8605   from_byte = CHAR_TO_BYTE (from);
8606   to_byte = CHAR_TO_BYTE (to);
8607   /* Move the gap to TO when it lies within the region.  */
8608   if (from < GPT && to >= GPT)
8609     move_gap_both (to, to_byte);
8610
8611   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8612                                to - from, to_byte - from_byte,
8613                                !NILP (highest),
8614                                !NILP (current_buffer
8615                                       ->enable_multibyte_characters),
8616                                Qnil);
8617 }
8618
8619 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8620        1, 2, 0,
8621        doc: /* Detect coding system of the text in STRING.
8622 Return a list of possible coding systems ordered by priority.
8623 The coding systems to try and their priorities follows what
8624 the function `coding-system-priority-list' (which see) returns.
8625
8626 If only ASCII characters are found (except for such ISO-2022 control
8627 characters as ESC), it returns a list of single element `undecided'
8628 or its subsidiary coding system according to a detected end-of-line
8629 format.
8630
8631 If optional argument HIGHEST is non-nil, return the coding system of
8632 highest priority.  */)
8633      (string, highest)
8634      Lisp_Object string, highest;
8635 {
8636   int want_highest = ! NILP (highest);  /* HIGHEST as a C flag.  */
8637
8638   CHECK_STRING (string);
8639   return detect_coding_system (SDATA (string), SCHARS (string),
8640                                SBYTES (string), want_highest,
8641                                STRING_MULTIBYTE (string), Qnil);
8642 }
8643 /* Return nonzero if character C, after applying the translation table
8644    recorded in ATTRS (if any), belongs to a charset listed in ATTRS.  */
8645 static INLINE int
8646 char_encodable_p (c, attrs)
8647      int c;
8648      Lisp_Object attrs;
8649 {
8650   Lisp_Object rest;
8651   Lisp_Object table = CODING_ATTR_TRANS_TBL (attrs);
8652
8653   if (! NILP (table))
8654     c = translate_char (table, c);
8655   rest = CODING_ATTR_CHARSET_LIST (attrs);
8656   while (CONSP (rest))
8657     {
8658       struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (rest)));
8659
8660       if (CHAR_CHARSET_P (c, cs))
8661         return 1;
8662       rest = XCDR (rest);
8663     }
8664   return ! NILP (rest);  /* 0 once a nil-terminated list is used up.  */
8665 }
8666
8667
8668 /* Return a list of coding systems that safely encode the text between
8669    START and END, or the whole of START if START is a string.  If
8670    EXCLUDE is non-nil, it is a list of coding systems not to check, and
8671    the returned list contains none of them.  In any case, if the text
8672    contains only ASCII or is unibyte, return t.  */
8673
8674 DEFUN ("find-coding-systems-region-internal",
8675        Ffind_coding_systems_region_internal,
8676        Sfind_coding_systems_region_internal, 2, 3, 0,
8677        doc: /* Internal use only.  */)
8678      (start, end, exclude)
8679      Lisp_Object start, end, exclude;
8680 {
8681   Lisp_Object coding_attrs_list, safe_codings;
8682   EMACS_INT start_byte, end_byte;
8683   const unsigned char *p, *pbeg, *pend;
8684   int c;
8685   Lisp_Object tail, elt, work_table;
8686
8687   if (STRINGP (start))
8688     {
8689       if (!STRING_MULTIBYTE (start)
8690           || SCHARS (start) == SBYTES (start))
8691         return Qt;
8692       start_byte = 0;
8693       end_byte = SBYTES (start);
8694     }
8695   else
8696     {
8697       CHECK_NUMBER_COERCE_MARKER (start);
8698       CHECK_NUMBER_COERCE_MARKER (end);
8699       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8700         args_out_of_range (start, end);
8701       if (NILP (current_buffer->enable_multibyte_characters))
8702         return Qt;
8703       start_byte = CHAR_TO_BYTE (XINT (start));
8704       end_byte = CHAR_TO_BYTE (XINT (end));
8705       if (XINT (end) - XINT (start) == end_byte - start_byte)
8706         return Qt;
8707       /* Move the gap out of the region, to whichever end is nearer.  */
8708       if (XINT (start) < GPT && XINT (end) > GPT)
8709         {
8710           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8711             move_gap_both (XINT (start), start_byte);
8712           else
8713             move_gap_both (XINT (end), end_byte);
8714         }
8715     }
8716   /* Gather the attrs of every candidate base coding system.  */
8717   coding_attrs_list = Qnil;
8718   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8719     if (NILP (exclude)
8720         || NILP (Fmemq (XCAR (tail), exclude)))
8721       {
8722         Lisp_Object attrs;
8723
8724         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8725         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8726             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8727           {
8728             ASET (attrs, coding_attr_trans_tbl,
8729                   get_translation_table (attrs, 1, NULL));
8730             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8731           }
8732       }
8733
8734   if (STRINGP (start))
8735     p = pbeg = SDATA (start);
8736   else
8737     p = pbeg = BYTE_POS_ADDR (start_byte);
8738   pend = p + (end_byte - start_byte);
8739   /* Trim ASCII bytes from both ends of the text.  */
8740   while (p < pend && ASCII_BYTE_P (*p)) p++;
8741   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8742   /* WORK_TABLE records the characters that were already tested.  */
8743   work_table = Fmake_char_table (Qnil, Qnil);
8744   while (p < pend)
8745     {
8746       if (ASCII_BYTE_P (*p))
8747         p++;
8748       else
8749         {
8750           c = STRING_CHAR_ADVANCE (p);
8751           if (!NILP (char_table_ref (work_table, c)))
8752             /* This character was already checked.  Ignore it.  */
8753             continue;
8754           /* Drop every candidate that cannot encode C.  */
8755           charset_map_loaded = 0;
8756           for (tail = coding_attrs_list; CONSP (tail);)
8757             {
8758               elt = XCAR (tail);
8759               if (NILP (elt))
8760                 tail = XCDR (tail);
8761               else if (char_encodable_p (c, elt))
8762                 tail = XCDR (tail);
8763               else if (CONSP (XCDR (tail)))
8764                 {
8765                   XSETCAR (tail, XCAR (XCDR (tail)));
8766                   XSETCDR (tail, XCDR (XCDR (tail)));
8767                 }
8768               else
8769                 {
8770                   XSETCAR (tail, Qnil);
8771                   tail = XCDR (tail);
8772                 }
8773             }
8774           if (charset_map_loaded)
8775             {
8776               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8777               /* Presumably the text may have been relocated; rebase.  */
8778               if (STRINGP (start))
8779                 pbeg = SDATA (start);
8780               else
8781                 pbeg = BYTE_POS_ADDR (start_byte);
8782               p = pbeg + p_offset;
8783               pend = pbeg + pend_offset;
8784             }
8785           char_table_set (work_table, c, Qt);
8786         }
8787     }
8788   /* raw-text and no-conversion are always part of the result.  */
8789   safe_codings = list2 (Qraw_text, Qno_conversion);
8790   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8791     if (! NILP (XCAR (tail)))
8792       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8793
8794   return safe_codings;
8795 }
8796
8797
8798 DEFUN ("unencodable-char-position", Funencodable_char_position,
8799        Sunencodable_char_position, 3, 5, 0,
8800        doc: /*
8801 Return position of first un-encodable character in a region.
8802 START and END specify the region and CODING-SYSTEM specifies the
8803 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8804
8805 If optional 4th argument COUNT is non-nil, it specifies at most how
8806 many un-encodable characters to search.  In this case, the value is a
8807 list of positions.
8808
8809 If optional 5th argument STRING is non-nil, it is a string to search
8810 for un-encodable characters.  In that case, START and END are indexes
8811 to the string.  */)
8812      (start, end, coding_system, count, string)
8813      Lisp_Object start, end, coding_system, count, string;
8814 {
8815   int n;
8816   struct coding_system coding;
8817   Lisp_Object attrs, charset_list, translation_table;
8818   Lisp_Object positions;
8819   int from, to;
8820   const unsigned char *p, *stop, *pend;
8821   int ascii_compatible;
8822
8823   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8824   attrs = CODING_ID_ATTRS (coding.id);
8825   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8826     return Qnil;
8827   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8828   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8829   translation_table = get_translation_table (attrs, 1, NULL);
8830
8831   if (NILP (string))
8832     {
8833       validate_region (&start, &end);
8834       from = XINT (start);
8835       to = XINT (end);
8836       if (NILP (current_buffer->enable_multibyte_characters)
8837           || (ascii_compatible
8838               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8839         return Qnil;
8840       p = CHAR_POS_ADDR (from);
8841       pend = CHAR_POS_ADDR (to);
8842       if (from < GPT && to >= GPT)
8843         stop = GPT_ADDR;
8844       else
8845         stop = pend;
8846     }
8847   else
8848     {
8849       CHECK_STRING (string);
8850       CHECK_NATNUM (start);
8851       CHECK_NATNUM (end);
8852       from = XINT (start);
8853       to = XINT (end);
8854       if (from > to
8855           || to > SCHARS (string))
8856         args_out_of_range_3 (string, start, end);
8857       if (! STRING_MULTIBYTE (string))
8858         return Qnil;
8859       p = SDATA (string) + string_char_to_byte (string, from);
8860       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8861       if (ascii_compatible && (to - from) == (pend - p))
8862         return Qnil;
8863     }
8864
8865   if (NILP (count))
8866     n = 1;
8867   else
8868     {
8869       CHECK_NATNUM (count);
8870       n = XINT (count);
8871     }
8872
8873   positions = Qnil;
8874   while (1)
8875     {
8876       int c;
8877
8878       if (ascii_compatible)
8879         while (p < stop && ASCII_BYTE_P (*p))
8880           p++, from++;
8881       if (p >= stop)
8882         {
8883           if (p >= pend)
8884             break;
8885           stop = pend;
8886           p = GAP_END_ADDR;
8887         }
8888
8889       c = STRING_CHAR_ADVANCE (p);
8890       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8891           && ! char_charset (translate_char (translation_table, c),
8892                              charset_list, NULL))
8893         {
8894           positions = Fcons (make_number (from), positions);
8895           n--;
8896           if (n == 0)
8897             break;
8898         }
8899
8900       from++;
8901     }
8902
8903   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8904 }
8905
8906
8907 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8908        Scheck_coding_systems_region, 3, 3, 0,
8909        doc: /* Check if the region is encodable by coding systems.
8910
8911 START and END are buffer positions specifying the region.
8912 CODING-SYSTEM-LIST is a list of coding systems to check.
8913
8914 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8915 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8916 whole region, POS0, POS1, ... are buffer positions where non-encodable
8917 characters are found.
8918
8919 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8920 value is nil.
8921
8922 START may be a string.  In that case, check if the string is
8923 encodable, and the value contains indices to the string instead of
8924 buffer positions.  END is ignored.
8925
8926 If the current buffer (or START if it is a string) is unibyte, the value
8927 is nil.  */)
8928      (start, end, coding_system_list)
8929      Lisp_Object start, end, coding_system_list;
8930 {
8931   Lisp_Object list;
8932   EMACS_INT start_byte, end_byte;
8933   int pos;
8934   const unsigned char *p, *pbeg, *pend;
8935   int c;
8936   Lisp_Object tail, elt, attrs;
8937
8938   if (STRINGP (start))
8939     {
8940       if (!STRING_MULTIBYTE (start)
8941           || SCHARS (start) == SBYTES (start))
8942         return Qnil;
8943       start_byte = 0;
8944       end_byte = SBYTES (start);
8945       pos = 0;
8946     }
8947   else
8948     {
8949       CHECK_NUMBER_COERCE_MARKER (start);
8950       CHECK_NUMBER_COERCE_MARKER (end);
8951       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8952         args_out_of_range (start, end);
8953       if (NILP (current_buffer->enable_multibyte_characters))
8954         return Qnil;
8955       start_byte = CHAR_TO_BYTE (XINT (start));
8956       end_byte = CHAR_TO_BYTE (XINT (end));
8957       if (XINT (end) - XINT (start) == end_byte - start_byte)
8958         return Qnil;
8959
8960       if (XINT (start) < GPT && XINT (end) > GPT)
8961         {
8962           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8963             move_gap_both (XINT (start), start_byte);
8964           else
8965             move_gap_both (XINT (end), end_byte);
8966         }
8967       pos = XINT (start);
8968     }
8969
8970   list = Qnil;
8971   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8972     {
8973       elt = XCAR (tail);
8974       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8975       ASET (attrs, coding_attr_trans_tbl,
8976             get_translation_table (attrs, 1, NULL));
8977       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8978     }
8979
8980   if (STRINGP (start))
8981     p = pbeg = SDATA (start);
8982   else
8983     p = pbeg = BYTE_POS_ADDR (start_byte);
8984   pend = p + (end_byte - start_byte);
8985
8986   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8987   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8988
8989   while (p < pend)
8990     {
8991       if (ASCII_BYTE_P (*p))
8992         p++;
8993       else
8994         {
8995           c = STRING_CHAR_ADVANCE (p);
8996
8997           charset_map_loaded = 0;
8998           for (tail = list; CONSP (tail); tail = XCDR (tail))
8999             {
9000               elt = XCDR (XCAR (tail));
9001               if (! char_encodable_p (c, XCAR (elt)))
9002                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9003             }
9004           if (charset_map_loaded)
9005             {
9006               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
9007
9008               if (STRINGP (start))
9009                 pbeg = SDATA (start);
9010               else
9011                 pbeg = BYTE_POS_ADDR (start_byte);
9012               p = pbeg + p_offset;
9013               pend = pbeg + pend_offset;
9014             }
9015         }
9016       pos++;
9017     }
9018
9019   tail = list;
9020   list = Qnil;
9021   for (; CONSP (tail); tail = XCDR (tail))
9022     {
9023       elt = XCAR (tail);
9024       if (CONSP (XCDR (XCDR (elt))))
9025         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9026                       list);
9027     }
9028
9029   return list;
9030 }
9031
9032
9033 Lisp_Object
9034 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9035      Lisp_Object start, end, coding_system, dst_object;
9036      int encodep, norecord;
9037 {
9038   struct coding_system coding;
9039   EMACS_INT from, from_byte, to, to_byte;
9040   Lisp_Object src_object;
9041
9042   CHECK_NUMBER_COERCE_MARKER (start);
9043   CHECK_NUMBER_COERCE_MARKER (end);
9044   if (NILP (coding_system))
9045     coding_system = Qno_conversion;
9046   else
9047     CHECK_CODING_SYSTEM (coding_system);
9048   src_object = Fcurrent_buffer ();
9049   if (NILP (dst_object))
9050     dst_object = src_object;
9051   else if (! EQ (dst_object, Qt))
9052     CHECK_BUFFER (dst_object);
9053
9054   validate_region (&start, &end);
9055   from = XFASTINT (start);
9056   from_byte = CHAR_TO_BYTE (from);
9057   to = XFASTINT (end);
9058   to_byte = CHAR_TO_BYTE (to);
9059
9060   setup_coding_system (coding_system, &coding);
9061   coding.mode |= CODING_MODE_LAST_BLOCK;
9062
9063   if (encodep)
9064     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9065                           dst_object);
9066   else
9067     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9068                           dst_object);
9069   if (! norecord)
9070     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9071
9072   return (BUFFERP (dst_object)
9073           ? make_number (coding.produced_char)
9074           : coding.dst_object);
9075 }
9076
9077
9078 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9079        3, 4, "r\nzCoding system: ",
9080        doc: /* Decode the current region from the specified coding system.
9081 When called from a program, takes four arguments:
9082         START, END, CODING-SYSTEM, and DESTINATION.
9083 START and END are buffer positions.
9084
9085 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9086 If nil, the region between START and END is replaced by the decoded text.
9087 If buffer, the decoded text is inserted in that buffer after point (point
9088 does not move).
9089 In those cases, the length of the decoded text is returned.
9090 If DESTINATION is t, the decoded text is returned.
9091
9092 This function sets `last-coding-system-used' to the precise coding system
9093 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9094 not fully specified.)  */)
9095      (start, end, coding_system, destination)
9096      Lisp_Object start, end, coding_system, destination;
9097 {
9098   return code_convert_region (start, end, coding_system, destination, 0, 0);
9099 }
9100
9101 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9102        3, 4, "r\nzCoding system: ",
9103        doc: /* Encode the current region by specified coding system.
9104 When called from a program, takes four arguments:
9105         START, END, CODING-SYSTEM and DESTINATION.
9106 START and END are buffer positions.
9107
9108 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9109 If nil, the region between START and END is replace by the encoded text.
9110 If buffer, the encoded text is inserted in that buffer after point (point
9111 does not move).
9112 In those cases, the length of the encoded text is returned.
9113 If DESTINATION is t, the encoded text is returned.
9114
9115 This function sets `last-coding-system-used' to the precise coding system
9116 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9117 not fully specified.)  */)
9118   (start, end, coding_system, destination)
9119      Lisp_Object start, end, coding_system, destination;
9120 {
9121   return code_convert_region (start, end, coding_system, destination, 1, 0);
9122 }
9123
9124 Lisp_Object
9125 code_convert_string (string, coding_system, dst_object,
9126                      encodep, nocopy, norecord)
9127      Lisp_Object string, coding_system, dst_object;
9128      int encodep, nocopy, norecord;
9129 {
9130   struct coding_system coding;
9131   EMACS_INT chars, bytes;
9132
9133   CHECK_STRING (string);
9134   if (NILP (coding_system))
9135     {
9136       if (! norecord)
9137         Vlast_coding_system_used = Qno_conversion;
9138       if (NILP (dst_object))
9139         return (nocopy ? Fcopy_sequence (string) : string);
9140     }
9141
9142   if (NILP (coding_system))
9143     coding_system = Qno_conversion;
9144   else
9145     CHECK_CODING_SYSTEM (coding_system);
9146   if (NILP (dst_object))
9147     dst_object = Qt;
9148   else if (! EQ (dst_object, Qt))
9149     CHECK_BUFFER (dst_object);
9150
9151   setup_coding_system (coding_system, &coding);
9152   coding.mode |= CODING_MODE_LAST_BLOCK;
9153   chars = SCHARS (string);
9154   bytes = SBYTES (string);
9155   if (encodep)
9156     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9157   else
9158     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9159   if (! norecord)
9160     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9161
9162   return (BUFFERP (dst_object)
9163           ? make_number (coding.produced_char)
9164           : coding.dst_object);
9165 }
9166
9167
9168 /* Encode or decode STRING according to CODING_SYSTEM.
9169    Do not set Vlast_coding_system_used.
9170
9171    This function is called only from macros DECODE_FILE and
9172    ENCODE_FILE, thus we ignore character composition.  */
9173
9174 Lisp_Object
9175 code_convert_string_norecord (string, coding_system, encodep)
9176      Lisp_Object string, coding_system;
9177      int encodep;
9178 {
9179   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9180 }
9181
9182
9183 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9184        2, 4, 0,
9185        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9186
9187 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9188 if the decoding operation is trivial.
9189
9190 Optional fourth arg BUFFER non-nil means that the decoded text is
9191 inserted in that buffer after point (point does not move).  In this
9192 case, the return value is the length of the decoded text.
9193
9194 This function sets `last-coding-system-used' to the precise coding system
9195 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9196 not fully specified.)  */)
9197   (string, coding_system, nocopy, buffer)
9198      Lisp_Object string, coding_system, nocopy, buffer;
9199 {
9200   return code_convert_string (string, coding_system, buffer,
9201                               0, ! NILP (nocopy), 0);
9202 }
9203
9204 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9205        2, 4, 0,
9206        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9207
9208 Optional third arg NOCOPY non-nil means it is OK to return STRING
9209 itself if the encoding operation is trivial.
9210
9211 Optional fourth arg BUFFER non-nil means that the encoded text is
9212 inserted in that buffer after point (point does not move).  In this
9213 case, the return value is the length of the encoded text.
9214
9215 This function sets `last-coding-system-used' to the precise coding system
9216 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9217 not fully specified.)  */)
9218      (string, coding_system, nocopy, buffer)
9219      Lisp_Object string, coding_system, nocopy, buffer;
9220 {
9221   return code_convert_string (string, coding_system, buffer,
9222                               1, ! NILP (nocopy), 1);
9223 }
9224
9225 \f
9226 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9227        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9228 Return the corresponding character.  */)
9229      (code)
9230      Lisp_Object code;
9231 {
9232   Lisp_Object spec, attrs, val;
9233   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9234   int c;
9235
9236   CHECK_NATNUM (code);
9237   c = XFASTINT (code);
9238   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9239   attrs = AREF (spec, 0);
9240
9241   if (ASCII_BYTE_P (c)
9242       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9243     return code;
9244
9245   val = CODING_ATTR_CHARSET_LIST (attrs);
9246   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9247   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9248   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9249
9250   if (c <= 0x7F)
9251     charset = charset_roman;
9252   else if (c >= 0xA0 && c < 0xDF)
9253     {
9254       charset = charset_kana;
9255       c -= 0x80;
9256     }
9257   else
9258     {
9259       int s1 = c >> 8, s2 = c & 0xFF;
9260
9261       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9262           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9263         error ("Invalid code: %d", code);
9264       SJIS_TO_JIS (c);
9265       charset = charset_kanji;
9266     }
9267   c = DECODE_CHAR (charset, c);
9268   if (c < 0)
9269     error ("Invalid code: %d", code);
9270   return make_number (c);
9271 }
9272
9273
9274 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9275        doc: /* Encode a Japanese character CH to shift_jis encoding.
9276 Return the corresponding code in SJIS.  */)
9277      (ch)
9278     Lisp_Object ch;
9279 {
9280   Lisp_Object spec, attrs, charset_list;
9281   int c;
9282   struct charset *charset;
9283   unsigned code;
9284
9285   CHECK_CHARACTER (ch);
9286   c = XFASTINT (ch);
9287   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9288   attrs = AREF (spec, 0);
9289
9290   if (ASCII_CHAR_P (c)
9291       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9292     return ch;
9293
9294   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9295   charset = char_charset (c, charset_list, &code);
9296   if (code == CHARSET_INVALID_CODE (charset))
9297     error ("Can't encode by shift_jis encoding: %d", c);
9298   JIS_TO_SJIS (code);
9299
9300   return make_number (code);
9301 }
9302
9303 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9304        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9305 Return the corresponding character.  */)
9306      (code)
9307      Lisp_Object code;
9308 {
9309   Lisp_Object spec, attrs, val;
9310   struct charset *charset_roman, *charset_big5, *charset;
9311   int c;
9312
9313   CHECK_NATNUM (code);
9314   c = XFASTINT (code);
9315   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9316   attrs = AREF (spec, 0);
9317
9318   if (ASCII_BYTE_P (c)
9319       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9320     return code;
9321
9322   val = CODING_ATTR_CHARSET_LIST (attrs);
9323   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9324   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9325
9326   if (c <= 0x7F)
9327     charset = charset_roman;
9328   else
9329     {
9330       int b1 = c >> 8, b2 = c & 0x7F;
9331       if (b1 < 0xA1 || b1 > 0xFE
9332           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9333         error ("Invalid code: %d", code);
9334       charset = charset_big5;
9335     }
9336   c = DECODE_CHAR (charset, (unsigned )c);
9337   if (c < 0)
9338     error ("Invalid code: %d", code);
9339   return make_number (c);
9340 }
9341
9342 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9343        doc: /* Encode the Big5 character CH to BIG5 coding system.
9344 Return the corresponding character code in Big5.  */)
9345      (ch)
9346      Lisp_Object ch;
9347 {
9348   Lisp_Object spec, attrs, charset_list;
9349   struct charset *charset;
9350   int c;
9351   unsigned code;
9352
9353   CHECK_CHARACTER (ch);
9354   c = XFASTINT (ch);
9355   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9356   attrs = AREF (spec, 0);
9357   if (ASCII_CHAR_P (c)
9358       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9359     return ch;
9360
9361   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9362   charset = char_charset (c, charset_list, &code);
9363   if (code == CHARSET_INVALID_CODE (charset))
9364     error ("Can't encode by Big5 encoding: %d", c);
9365
9366   return make_number (code);
9367 }
9368
9369 \f
9370 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9371        Sset_terminal_coding_system_internal, 1, 2, 0,
9372        doc: /* Internal use only.  */)
9373      (coding_system, terminal)
9374      Lisp_Object coding_system;
9375      Lisp_Object terminal;
9376 {
9377   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9378   CHECK_SYMBOL (coding_system);
9379   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9380   /* We had better not send unsafe characters to terminal.  */
9381   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9382   /* Characer composition should be disabled.  */
9383   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9384   terminal_coding->src_multibyte = 1;
9385   terminal_coding->dst_multibyte = 0;
9386   return Qnil;
9387 }
9388
9389 DEFUN ("set-safe-terminal-coding-system-internal",
9390        Fset_safe_terminal_coding_system_internal,
9391        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9392        doc: /* Internal use only.  */)
9393      (coding_system)
9394      Lisp_Object coding_system;
9395 {
9396   CHECK_SYMBOL (coding_system);
9397   setup_coding_system (Fcheck_coding_system (coding_system),
9398                        &safe_terminal_coding);
9399   /* Characer composition should be disabled.  */
9400   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9401   safe_terminal_coding.src_multibyte = 1;
9402   safe_terminal_coding.dst_multibyte = 0;
9403   return Qnil;
9404 }
9405
9406 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9407        Sterminal_coding_system, 0, 1, 0,
9408        doc: /* Return coding system specified for terminal output on the given terminal.
9409 TERMINAL may be a terminal object, a frame, or nil for the selected
9410 frame's terminal device.  */)
9411      (terminal)
9412      Lisp_Object terminal;
9413 {
9414   struct coding_system *terminal_coding
9415     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9416   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9417
9418   /* For backward compatibility, return nil if it is `undecided'. */
9419   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9420 }
9421
9422 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9423        Sset_keyboard_coding_system_internal, 1, 2, 0,
9424        doc: /* Internal use only.  */)
9425      (coding_system, terminal)
9426      Lisp_Object coding_system;
9427      Lisp_Object terminal;
9428 {
9429   struct terminal *t = get_terminal (terminal, 1);
9430   CHECK_SYMBOL (coding_system);
9431   if (NILP (coding_system))
9432     coding_system = Qno_conversion;
9433   else
9434     Fcheck_coding_system (coding_system);
9435   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9436   /* Characer composition should be disabled.  */
9437   TERMINAL_KEYBOARD_CODING (t)->common_flags
9438     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9439   return Qnil;
9440 }
9441
9442 DEFUN ("keyboard-coding-system",
9443        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9444        doc: /* Return coding system specified for decoding keyboard input.  */)
9445      (terminal)
9446      Lisp_Object terminal;
9447 {
9448   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9449                          (get_terminal (terminal, 1))->id);
9450 }
9451
9452 \f
9453 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9454        Sfind_operation_coding_system,  1, MANY, 0,
9455        doc: /* Choose a coding system for an operation based on the target name.
9456 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9457 DECODING-SYSTEM is the coding system to use for decoding
9458 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9459 for encoding (in case OPERATION does encoding).
9460
9461 The first argument OPERATION specifies an I/O primitive:
9462   For file I/O, `insert-file-contents' or `write-region'.
9463   For process I/O, `call-process', `call-process-region', or `start-process'.
9464   For network I/O, `open-network-stream'.
9465
9466 The remaining arguments should be the same arguments that were passed
9467 to the primitive.  Depending on which primitive, one of those arguments
9468 is selected as the TARGET.  For example, if OPERATION does file I/O,
9469 whichever argument specifies the file name is TARGET.
9470
9471 TARGET has a meaning which depends on OPERATION:
9472   For file I/O, TARGET is a file name (except for the special case below).
9473   For process I/O, TARGET is a process name.
9474   For network I/O, TARGET is a service name or a port number.
9475
9476 This function looks up what is specified for TARGET in
9477 `file-coding-system-alist', `process-coding-system-alist',
9478 or `network-coding-system-alist' depending on OPERATION.
9479 They may specify a coding system, a cons of coding systems,
9480 or a function symbol to call.
9481 In the last case, we call the function with one argument,
9482 which is a list of all the arguments given to this function.
9483 If the function can't decide a coding system, it can return
9484 `undecided' so that the normal code-detection is performed.
9485
9486 If OPERATION is `insert-file-contents', the argument corresponding to
9487 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9488 file name to look up, and BUFFER is a buffer that contains the file's
9489 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9490 function to call for FILENAME, that function should examine the
9491 contents of BUFFER instead of reading the file.
9492
9493 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9494      (nargs, args)
9495      int nargs;
9496      Lisp_Object *args;
9497 {
9498   Lisp_Object operation, target_idx, target, val;
9499   register Lisp_Object chain;
9500
9501   if (nargs < 2)
9502     error ("Too few arguments");
9503   operation = args[0];
9504   if (!SYMBOLP (operation)
9505       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9506     error ("Invalid first argument");
9507   if (nargs < 1 + XINT (target_idx))
9508     error ("Too few arguments for operation: %s",
9509            SDATA (SYMBOL_NAME (operation)));
9510   target = args[XINT (target_idx) + 1];
9511   if (!(STRINGP (target)
9512         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9513             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9514         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9515     error ("Invalid %dth argument", XINT (target_idx) + 1);
9516   if (CONSP (target))
9517     target = XCAR (target);
9518
9519   chain = ((EQ (operation, Qinsert_file_contents)
9520             || EQ (operation, Qwrite_region))
9521            ? Vfile_coding_system_alist
9522            : (EQ (operation, Qopen_network_stream)
9523               ? Vnetwork_coding_system_alist
9524               : Vprocess_coding_system_alist));
9525   if (NILP (chain))
9526     return Qnil;
9527
9528   for (; CONSP (chain); chain = XCDR (chain))
9529     {
9530       Lisp_Object elt;
9531
9532       elt = XCAR (chain);
9533       if (CONSP (elt)
9534           && ((STRINGP (target)
9535                && STRINGP (XCAR (elt))
9536                && fast_string_match (XCAR (elt), target) >= 0)
9537               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9538         {
9539           val = XCDR (elt);
9540           /* Here, if VAL is both a valid coding system and a valid
9541              function symbol, we return VAL as a coding system.  */
9542           if (CONSP (val))
9543             return val;
9544           if (! SYMBOLP (val))
9545             return Qnil;
9546           if (! NILP (Fcoding_system_p (val)))
9547             return Fcons (val, val);
9548           if (! NILP (Ffboundp (val)))
9549             {
9550               /* We use call1 rather than safe_call1
9551                  so as to get bug reports about functions called here
9552                  which don't handle the current interface.  */
9553               val = call1 (val, Flist (nargs, args));
9554               if (CONSP (val))
9555                 return val;
9556               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9557                 return Fcons (val, val);
9558             }
9559           return Qnil;
9560         }
9561     }
9562   return Qnil;
9563 }
9564
DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
       Sset_coding_system_priority, 0, MANY, 0,
       doc: /* Assign higher priority to the coding systems given as arguments.
If multiple coding systems belong to the same category,
all but the first one are ignored.

usage: (set-coding-system-priority &rest coding-systems)  */)
     (nargs, args)
     int nargs;
     Lisp_Object *args;
{
  int i, j;
  /* changed[CATEGORY] is non-zero once an argument has claimed
     CATEGORY.  */
  int changed[coding_category_max];
  /* The new priority order, built here and then copied into
     coding_priorities.  */
  enum coding_category priorities[coding_category_max];

  bzero (changed, sizeof changed);

  /* First pass: the categories of the arguments, in argument order,
     become the top J priorities.  */
  for (i = j = 0; i < nargs; i++)
    {
      enum coding_category category;
      Lisp_Object spec, attrs;

      CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
      attrs = AREF (spec, 0);
      category = XINT (CODING_ATTR_CATEGORY (attrs));
      if (changed[category])
        /* Ignore this coding system because a coding system of the
           same category already had a higher priority.  */
        continue;
      changed[category] = 1;
      priorities[j++] = category;
      /* Re-setup the category's coding system only when it is already
         set up (id >= 0) with a different coding system.  */
      if (coding_categories[category].id >= 0
          && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
        setup_coding_system (args[i], &coding_categories[category]);
      /* Set the category's symbol in Vcoding_category_table to the
         coding system.  */
      Fset (AREF (Vcoding_category_table, category), args[i]);
    }

  /* Now we have decided top J priorities.  Reflect the order of the
     original priorities to the remaining priorities.  */

  /* From here on, I indexes the slot of PRIORITIES being filled and J
     walks the old coding_priorities, skipping categories already
     placed.  */
  for (i = j, j = 0; i < coding_category_max; i++, j++)
    {
      while (j < coding_category_max
             && changed[coding_priorities[j]])
        j++;
      /* Running out of old priorities before all slots are filled is
         treated as a fatal inconsistency.  */
      if (j == coding_category_max)
        abort ();
      priorities[i] = coding_priorities[j];
    }

  bcopy (priorities, coding_priorities, sizeof priorities);

  /* Update `coding-category-list'.  */
  /* Consing from the lowest priority upward leaves the highest
     priority category first in the list.  */
  Vcoding_category_list = Qnil;
  for (i = coding_category_max - 1; i >= 0; i--)
    Vcoding_category_list
      = Fcons (AREF (Vcoding_category_table, priorities[i]),
               Vcoding_category_list);

  return Qnil;
}
9626
9627 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9628        Scoding_system_priority_list, 0, 1, 0,
9629        doc: /* Return a list of coding systems ordered by their priorities.
9630 The list contains a subset of coding systems; i.e. coding systems
9631 assigned to each coding category (see `coding-category-list').
9632
9633 HIGHESTP non-nil means just return the highest priority one.  */)
9634      (highestp)
9635      Lisp_Object highestp;
9636 {
9637   int i;
9638   Lisp_Object val;
9639
9640   for (i = 0, val = Qnil; i < coding_category_max; i++)
9641     {
9642       enum coding_category category = coding_priorities[i];
9643       int id = coding_categories[category].id;
9644       Lisp_Object attrs;
9645
9646       if (id < 0)
9647         continue;
9648       attrs = CODING_ID_ATTRS (id);
9649       if (! NILP (highestp))
9650         return CODING_ATTR_BASE_NAME (attrs);
9651       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9652     }
9653   return Fnreverse (val);
9654 }
9655
9656 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9657
9658 static Lisp_Object
9659 make_subsidiaries (base)
9660      Lisp_Object base;
9661 {
9662   Lisp_Object subsidiaries;
9663   int base_name_len = SBYTES (SYMBOL_NAME (base));
9664   char *buf = (char *) alloca (base_name_len + 6);
9665   int i;
9666
9667   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9668   subsidiaries = Fmake_vector (make_number (3), Qnil);
9669   for (i = 0; i < 3; i++)
9670     {
9671       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9672       ASET (subsidiaries, i, intern (buf));
9673     }
9674   return subsidiaries;
9675 }
9676
9677
/* Define a coding system from the argument vector ARGS (NARGS elements).

   The arguments are positional and are read through the coding_arg_*
   indices.  The function builds an attribute vector ATTRS, fills in
   the attributes common to every coding type, then the attributes
   specific to the requested coding type (which also selects the
   coding category), and finally records the coding system (and, when
   no eol-type was given, its three subsidiaries) in
   Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.

   Returns nil.  Jumps to `short_args' and signals
   wrong-number-of-arguments when NARGS is smaller than the coding
   type requires; other invalid arguments are reported through the
   CHECK_* macros or `error'.  */
9678 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9679        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9680        doc: /* For internal use only.
9681 usage: (define-coding-system-internal ...)  */)
9682      (nargs, args)
9683      int nargs;
9684      Lisp_Object *args;
9685 {
9686   Lisp_Object name;
9687   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
9688   Lisp_Object attrs;            /* Vector of attributes.  */
9689   Lisp_Object eol_type;
9690   Lisp_Object aliases;
9691   Lisp_Object coding_type, charset_list, safe_charsets;
9692   enum coding_category category;
9693   Lisp_Object tail, val;
9694   int max_charset_id = 0;
9695   int i;
9696
9697   if (nargs < coding_arg_max)
9698     goto short_args;
9699
9700   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9701
       /* --- Attributes common to all coding types.  --- */
9702   name = args[coding_arg_name];
9703   CHECK_SYMBOL (name);
9704   CODING_ATTR_BASE_NAME (attrs) = name;
9705
       /* The mnemonic must be a string or a character.  */
9706   val = args[coding_arg_mnemonic];
9707   if (! STRINGP (val))
9708     CHECK_CHARACTER (val);
9709   CODING_ATTR_MNEMONIC (attrs) = val;
9710
9711   coding_type = args[coding_arg_coding_type];
9712   CHECK_SYMBOL (coding_type);
9713   CODING_ATTR_TYPE (attrs) = coding_type;
9714
       /* The charset list is either a symbol or a list of charsets.
          The symbols `iso-2022' and `emacs-mule' stand for the
          corresponding global charset lists and are accepted only when
          the coding type is the same symbol.  In both branches
          MAX_CHARSET_ID ends up as the largest charset ID in the list.  */
9715   charset_list = args[coding_arg_charset_list];
9716   if (SYMBOLP (charset_list))
9717     {
9718       if (EQ (charset_list, Qiso_2022))
9719         {
9720           if (! EQ (coding_type, Qiso_2022))
9721             error ("Invalid charset-list");
9722           charset_list = Viso_2022_charset_list;
9723         }
9724       else if (EQ (charset_list, Qemacs_mule))
9725         {
9726           if (! EQ (coding_type, Qemacs_mule))
9727             error ("Invalid charset-list");
9728           charset_list = Vemacs_mule_charset_list;
9729         }
9730       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9731         if (max_charset_id < XFASTINT (XCAR (tail)))
9732           max_charset_id = XFASTINT (XCAR (tail));
9733     }
9734   else
9735     {
           /* Work on a copy so that replacing each charset by its ID
              below does not modify the list object passed by the caller.  */
9736       charset_list = Fcopy_sequence (charset_list);
9737       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9738         {
9739           struct charset *charset;
9740
9741           val = XCAR (tail);
9742           CHECK_CHARSET_GET_CHARSET (val, charset);
               /* An iso-2022 coding system needs charsets with an ISO
                  final byte, an emacs-mule one needs charsets with an
                  emacs-mule ID; other coding types accept any charset.  */
9743           if (EQ (coding_type, Qiso_2022)
9744               ? CHARSET_ISO_FINAL (charset) < 0
9745               : EQ (coding_type, Qemacs_mule)
9746               ? CHARSET_EMACS_MULE_ID (charset) < 0
9747               : 0)
9748             error ("Can't handle charset `%s'",
9749                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9750
9751           XSETCAR (tail, make_number (charset->id));
9752           if (max_charset_id < charset->id)
9753             max_charset_id = charset->id;
9754         }
9755     }
9756   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9757
       /* SAFE_CHARSETS is a string indexed by charset ID: 0 for every
          charset in CHARSET_LIST, 255 for every other ID up to
          MAX_CHARSET_ID.  */
9758   safe_charsets = make_uninit_string (max_charset_id + 1);
9759   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9760   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9761     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9762   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9763
       /* Initial value only; several coding types below override it.  */
9764   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9765
       /* A translation table is a char-table, a cons, or a symbol.  */
9766   val = args[coding_arg_decode_translation_table];
9767   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9768     CHECK_SYMBOL (val);
9769   CODING_ATTR_DECODE_TBL (attrs) = val;
9770
9771   val = args[coding_arg_encode_translation_table];
9772   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9773     CHECK_SYMBOL (val);
9774   CODING_ATTR_ENCODE_TBL (attrs) = val;
9775
9776   val = args[coding_arg_post_read_conversion];
9777   CHECK_SYMBOL (val);
9778   CODING_ATTR_POST_READ (attrs) = val;
9779
9780   val = args[coding_arg_pre_write_conversion];
9781   CHECK_SYMBOL (val);
9782   CODING_ATTR_PRE_WRITE (attrs) = val;
9783
       /* The default character falls back to a space when nil.  */
9784   val = args[coding_arg_default_char];
9785   if (NILP (val))
9786     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9787   else
9788     {
9789       CHECK_CHARACTER (val);
9790       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9791     }
9792
       /* Normalize the flag to exactly nil or t.  */
9793   val = args[coding_arg_for_unibyte];
9794   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9795
9796   val = args[coding_arg_plist];
9797   CHECK_LIST (val);
9798   CODING_ATTR_PLIST (attrs) = val;
9799
       /* --- Attributes specific to the coding type.  Every branch
          below sets CATEGORY.  --- */
9800   if (EQ (coding_type, Qcharset))
9801     {
9802       /* Generate a lisp vector of 256 elements.  Each element is nil,
9803          integer, or a list of charset IDs.
9804
9805          If Nth element is nil, the byte code N is invalid in this
9806          coding system.
9807
9808          If Nth element is a number NUM, N is the first byte of a
9809          charset whose ID is NUM.
9810
9811          If Nth element is a list of charset IDs, N is the first byte
9812          of one of them.  The list is sorted by dimensions of the
9813          charsets.  A charset of smaller dimension comes first. */
9814       val = Fmake_vector (make_number (256), Qnil);
9815
9816       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9817         {
9818           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9819           int dim = CHARSET_DIMENSION (charset);
               /* Index into code_space of the entry pair used as the
                  first-byte range of a DIM-dimensional charset.  */
9820           int idx = (dim - 1) * 4;
9821
9822           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9823             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9824
9825           for (i = charset->code_space[idx];
9826                i <= charset->code_space[idx + 1]; i++)
9827             {
9828               Lisp_Object tmp, tmp2;
9829               int dim2;
9830
9831               tmp = AREF (val, i);
9832               if (NILP (tmp))
9833                 tmp = XCAR (tail);
9834               else if (NUMBERP (tmp))
9835                 {
                       /* A second charset claims this byte: turn the
                          slot into a two-element list, smaller
                          dimension first.  */
9836                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9837                   if (dim < dim2)
9838                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9839                   else
9840                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9841                 }
9842               else
9843                 {
                       /* Already a list: find the first element whose
                          dimension is larger than DIM.  */
9844                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9845                     {
9846                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9847                       if (dim < dim2)
9848                         break;
9849                     }
9850                   if (NILP (tmp2))
9851                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9852                   else
9853                     {
                           /* Insert before TMP2 in place: duplicate
                              TMP2's car into a new following cell,
                              then overwrite TMP2's car.  */
9854                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9855                       XSETCAR (tmp2, XCAR (tail));
9856                     }
9857                 }
9858               ASET (val, i, tmp);
9859             }
9860         }
9861       ASET (attrs, coding_attr_charset_valids, val);
9862       category = coding_category_charset;
9863     }
9864   else if (EQ (coding_type, Qccl))
9865     {
9866       Lisp_Object valids;
9867
9868       if (nargs < coding_arg_ccl_max)
9869         goto short_args;
9870
           /* CCL programs given as vectors are copied before being
              stored in ATTRS.  */
9871       val = args[coding_arg_ccl_decoder];
9872       CHECK_CCL_PROGRAM (val);
9873       if (VECTORP (val))
9874         val = Fcopy_sequence (val);
9875       ASET (attrs, coding_attr_ccl_decoder, val);
9876
9877       val = args[coding_arg_ccl_encoder];
9878       CHECK_CCL_PROGRAM (val);
9879       if (VECTORP (val))
9880         val = Fcopy_sequence (val);
9881       ASET (attrs, coding_attr_ccl_encoder, val);
9882
           /* VALIDS is a 256-byte string; byte N is set to 1 for each
              code N named by the argument, whose elements are either
              an integer or a cons (FROM . TO), all within 0..255.  */
9883       val = args[coding_arg_ccl_valids];
9884       valids = Fmake_string (make_number (256), make_number (0));
9885       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9886         {
9887           int from, to;
9888
9889           val = Fcar (tail);
9890           if (INTEGERP (val))
9891             {
9892               from = to = XINT (val);
9893               if (from < 0 || from > 255)
9894                 args_out_of_range_3 (val, make_number (0), make_number (255));
9895             }
9896           else
9897             {
9898               CHECK_CONS (val);
9899               CHECK_NATNUM_CAR (val);
9900               CHECK_NATNUM_CDR (val);
9901               from = XINT (XCAR (val));
9902               if (from > 255)
9903                 args_out_of_range_3 (XCAR (val),
9904                                      make_number (0), make_number (255));
9905               to = XINT (XCDR (val));
9906               if (to < from || to > 255)
9907                 args_out_of_range_3 (XCDR (val),
9908                                      XCAR (val), make_number (255));
9909             }
9910           for (i = from; i <= to; i++)
9911             SSET (valids, i, 1);
9912         }
9913       ASET (attrs, coding_attr_ccl_valids, valids);
9914
9915       category = coding_category_ccl;
9916     }
9917   else if (EQ (coding_type, Qutf_16))
9918     {
9919       Lisp_Object bom, endian;
9920
9921       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9922
9923       if (nargs < coding_arg_utf16_max)
9924         goto short_args;
9925
           /* BOM is nil, t, or a cons of two coding systems.  */
9926       bom = args[coding_arg_utf16_bom];
9927       if (! NILP (bom) && ! EQ (bom, Qt))
9928         {
9929           CHECK_CONS (bom);
9930           val = XCAR (bom);
9931           CHECK_CODING_SYSTEM (val);
9932           val = XCDR (bom);
9933           CHECK_CODING_SYSTEM (val);
9934         }
9935       ASET (attrs, coding_attr_utf_bom, bom);
9936
           /* ENDIAN is `big' or `little'; nil means `big'.  */
9937       endian = args[coding_arg_utf16_endian];
9938       CHECK_SYMBOL (endian);
9939       if (NILP (endian))
9940         endian = Qbig;
9941       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9942         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9943       ASET (attrs, coding_attr_utf_16_endian, endian);
9944
           /* A cons BOM selects the `auto' category; otherwise the
              category depends on the endianness and on whether BOM
              is nil (the `nosig' categories) or t.  */
9945       category = (CONSP (bom)
9946                   ? coding_category_utf_16_auto
9947                   : NILP (bom)
9948                   ? (EQ (endian, Qbig)
9949                      ? coding_category_utf_16_be_nosig
9950                      : coding_category_utf_16_le_nosig)
9951                   : (EQ (endian, Qbig)
9952                      ? coding_category_utf_16_be
9953                      : coding_category_utf_16_le));
9954     }
9955   else if (EQ (coding_type, Qiso_2022))
9956     {
9957       Lisp_Object initial, reg_usage, request, flags;
           /* NOTE(review): this declaration shadows the function-level `i'.  */
9958       int i;
9959
9960       if (nargs < coding_arg_iso2022_max)
9961         goto short_args;
9962
           /* INITIAL is a copy of the argument vector in which each of
              the first four elements is replaced by a charset ID, or
              by -1 where the element was nil.  */
9963       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9964       CHECK_VECTOR (initial);
9965       for (i = 0; i < 4; i++)
9966         {
9967           val = Faref (initial, make_number (i));
9968           if (! NILP (val))
9969             {
9970               struct charset *charset;
9971
9972               CHECK_CHARSET_GET_CHARSET (val, charset);
9973               ASET (initial, i, make_number (CHARSET_ID (charset)));
9974               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9975                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9976             }
9977           else
9978             ASET (initial, i, make_number (-1));
9979         }
9980
           /* REG_USAGE must be a cons of two numbers.  */
9981       reg_usage = args[coding_arg_iso2022_reg_usage];
9982       CHECK_CONS (reg_usage);
9983       CHECK_NUMBER_CAR (reg_usage);
9984       CHECK_NUMBER_CDR (reg_usage);
9985
           /* REQUEST is a list of (CHARSET . REGISTER) with REGISTER in
              0..3; each CHARSET is replaced by its ID.
              NOTE(review): Fcopy_sequence presumably copies only the
              list spine, so the XSETCAR below would modify the
              caller's cons cells -- confirm.  */
9986       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9987       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9988         {
9989           int id;
9990           Lisp_Object tmp;
9991
9992           val = Fcar (tail);
9993           CHECK_CONS (val);
9994           tmp = XCAR (val);
9995           CHECK_CHARSET_GET_ID (tmp, id);
9996           CHECK_NATNUM_CDR (val);
9997           if (XINT (XCDR (val)) >= 4)
9998             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9999           XSETCAR (val, make_number (id));
10000         }
10001
            /* I keeps the flags as given; the stored FLAGS additionally
               get CODING_ISO_FLAG_FULL_SUPPORT when the charset list
               argument was the symbol `iso-2022'.  */
10002       flags = args[coding_arg_iso2022_flags];
10003       CHECK_NATNUM (flags);
10004       i = XINT (flags);
10005       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10006         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
10007
10008       ASET (attrs, coding_attr_iso_initial, initial);
10009       ASET (attrs, coding_attr_iso_usage, reg_usage);
10010       ASET (attrs, coding_attr_iso_request, request);
10011       ASET (attrs, coding_attr_iso_flags, flags);
10012       setup_iso_safe_charsets (attrs);
10013
            /* Choose among the six ISO categories from the flags, from
               whether the charset list was `iso-2022', and (for 8-bit)
               from the dimension of the charset initially in register 1.  */
10014       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10015         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10016                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10017                     ? coding_category_iso_7_else
10018                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10019                     ? coding_category_iso_7
10020                     : coding_category_iso_7_tight);
10021       else
10022         {
10023           int id = XINT (AREF (initial, 1));
10024
10025           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10026                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10027                        || id < 0)
10028                       ? coding_category_iso_8_else
10029                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10030                       ? coding_category_iso_8_1
10031                       : coding_category_iso_8_2);
10032         }
            /* Only the iso-8-1 and iso-8-2 categories may keep the
               ASCII-compatible attribute.  */
10033       if (category != coding_category_iso_8_1
10034           && category != coding_category_iso_8_2)
10035         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10036     }
10037   else if (EQ (coding_type, Qemacs_mule))
10038     {
10039       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10040         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10041       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10042       category = coding_category_emacs_mule;
10043     }
10044   else if (EQ (coding_type, Qshift_jis))
10045     {
10046
10047       struct charset *charset;
10048
            /* Required charsets, in order: two of dimension 1, then one
               of dimension 2, optionally followed by another of
               dimension 2.  CHARSET_LIST is advanced while checking;
               ATTRS already holds the full list.  */
10049       if (XINT (Flength (charset_list)) != 3
10050           && XINT (Flength (charset_list)) != 4)
10051         error ("There should be three or four charsets");
10052
10053       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10054       if (CHARSET_DIMENSION (charset) != 1)
10055         error ("Dimension of charset %s is not one",
10056                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10057       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10058         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10059
10060       charset_list = XCDR (charset_list);
10061       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10062       if (CHARSET_DIMENSION (charset) != 1)
10063         error ("Dimension of charset %s is not one",
10064                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10065
10066       charset_list = XCDR (charset_list);
10067       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10068       if (CHARSET_DIMENSION (charset) != 2)
10069         error ("Dimension of charset %s is not two",
10070                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10071
10072       charset_list = XCDR (charset_list);
10073       if (! NILP (charset_list))
10074         {
10075           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10076           if (CHARSET_DIMENSION (charset) != 2)
10077             error ("Dimension of charset %s is not two",
10078                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10079         }
10080
10081       category = coding_category_sjis;
10082       Vsjis_coding_system = name;
10083     }
10084   else if (EQ (coding_type, Qbig5))
10085     {
10086       struct charset *charset;
10087
            /* Exactly two charsets: dimension 1, then dimension 2.  */
10088       if (XINT (Flength (charset_list)) != 2)
10089         error ("There should be just two charsets");
10090
10091       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10092       if (CHARSET_DIMENSION (charset) != 1)
10093         error ("Dimension of charset %s is not one",
10094                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10095       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10096         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10097
10098       charset_list = XCDR (charset_list);
10099       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10100       if (CHARSET_DIMENSION (charset) != 2)
10101         error ("Dimension of charset %s is not two",
10102                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10103
10104       category = coding_category_big5;
10105       Vbig5_coding_system = name;
10106     }
10107   else if (EQ (coding_type, Qraw_text))
10108     {
10109       category = coding_category_raw_text;
10110       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10111     }
10112   else if (EQ (coding_type, Qutf_8))
10113     {
10114       Lisp_Object bom;
10115
10116       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10117
10118       if (nargs < coding_arg_utf8_max)
10119         goto short_args;
10120
            /* BOM is nil, t, or a cons of two coding systems.  */
10121       bom = args[coding_arg_utf8_bom];
10122       if (! NILP (bom) && ! EQ (bom, Qt))
10123         {
10124           CHECK_CONS (bom);
10125           val = XCAR (bom);
10126           CHECK_CODING_SYSTEM (val);
10127           val = XCDR (bom);
10128           CHECK_CODING_SYSTEM (val);
10129         }
10130       ASET (attrs, coding_attr_utf_bom, bom);
10131
10132       category = (CONSP (bom) ? coding_category_utf_8_auto
10133                   : NILP (bom) ? coding_category_utf_8_nosig
10134                   : coding_category_utf_8_sig);
10135     }
10136   else if (EQ (coding_type, Qundecided))
10137     category = coding_category_undecided;
10138   else
10139     error ("Invalid coding system type: %s",
10140            SDATA (SYMBOL_NAME (coding_type)));
10141
        /* --- Record the category, and expose it and the final
           ASCII-compatibility value at the front of the plist.  --- */
10142   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10143   CODING_ATTR_PLIST (attrs)
10144     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10145                                 CODING_ATTR_PLIST (attrs)));
10146   CODING_ATTR_PLIST (attrs)
10147     = Fcons (QCascii_compatible_p,
10148              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10149                     CODING_ATTR_PLIST (attrs)));
10150
        /* The eol-type argument is nil, `unix', `dos' or `mac'.  */
10151   eol_type = args[coding_arg_eol_type];
10152   if (! NILP (eol_type)
10153       && ! EQ (eol_type, Qunix)
10154       && ! EQ (eol_type, Qdos)
10155       && ! EQ (eol_type, Qmac))
10156     error ("Invalid eol-type");
10157
10158   aliases = Fcons (name, Qnil);
10159
        /* With no eol-type given, EOL_TYPE becomes the vector of the
           three subsidiary names (NAME-unix, NAME-dos, NAME-mac), and
           each subsidiary is registered with the same ATTRS vector and
           a fixed eol-type of its own.  */
10160   if (NILP (eol_type))
10161     {
10162       eol_type = make_subsidiaries (name);
10163       for (i = 0; i < 3; i++)
10164         {
10165           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10166
10167           this_name = AREF (eol_type, i);
10168           this_aliases = Fcons (this_name, Qnil);
10169           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
                /* Creating the vector filled with ATTRS puts ATTRS in
                   slot 0; slots 1 and 2 are then overwritten.  */
10170           this_spec = Fmake_vector (make_number (3), attrs);
10171           ASET (this_spec, 1, this_aliases);
10172           ASET (this_spec, 2, this_eol_type);
10173           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10174           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10175           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10176           if (NILP (val))
10177             Vcoding_system_alist
10178               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10179                        Vcoding_system_alist);
10180         }
10181     }
10182
        /* Register the coding system itself: [ ATTRS ALIASES EOL_TYPE ].  */
10183   spec_vec = Fmake_vector (make_number (3), attrs);
10184   ASET (spec_vec, 1, aliases);
10185   ASET (spec_vec, 2, eol_type);
10186
10187   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10188   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10189   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10190   if (NILP (val))
10191     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10192                                   Vcoding_system_alist);
10193
        /* Set up the category's coding system from NAME when the
           category has none yet (negative id) or when its current one
           has this same name (i.e. it is being redefined).  */
10194   {
10195     int id = coding_categories[category].id;
10196
10197     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10198       setup_coding_system (name, &coding_categories[category]);
10199   }
10200
10201   return Qnil;
10202
10203  short_args:
10204   return Fsignal (Qwrong_number_of_arguments,
10205                   Fcons (intern ("define-coding-system-internal"),
10206                          make_number (nargs)));
10207 }
10208
10209
10210 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10211        3, 3, 0,
10212        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10213   (coding_system, prop, val)
10214      Lisp_Object coding_system, prop, val;
10215 {
10216   Lisp_Object spec, attrs;
10217
10218   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10219   attrs = AREF (spec, 0);
10220   if (EQ (prop, QCmnemonic))
10221     {
10222       if (! STRINGP (val))
10223         CHECK_CHARACTER (val);
10224       CODING_ATTR_MNEMONIC (attrs) = val;
10225     }
10226   else if (EQ (prop, QCdefault_char))
10227     {
10228       if (NILP (val))
10229         val = make_number (' ');
10230       else
10231         CHECK_CHARACTER (val);
10232       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10233     }
10234   else if (EQ (prop, QCdecode_translation_table))
10235     {
10236       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10237         CHECK_SYMBOL (val);
10238       CODING_ATTR_DECODE_TBL (attrs) = val;
10239     }
10240   else if (EQ (prop, QCencode_translation_table))
10241     {
10242       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10243         CHECK_SYMBOL (val);
10244       CODING_ATTR_ENCODE_TBL (attrs) = val;
10245     }
10246   else if (EQ (prop, QCpost_read_conversion))
10247     {
10248       CHECK_SYMBOL (val);
10249       CODING_ATTR_POST_READ (attrs) = val;
10250     }
10251   else if (EQ (prop, QCpre_write_conversion))
10252     {
10253       CHECK_SYMBOL (val);
10254       CODING_ATTR_PRE_WRITE (attrs) = val;
10255     }
10256   else if (EQ (prop, QCascii_compatible_p))
10257     {
10258       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10259     }
10260
10261   CODING_ATTR_PLIST (attrs)
10262     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10263   return val;
10264 }
10265
10266
10267 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10268        Sdefine_coding_system_alias, 2, 2, 0,
10269        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10270      (alias, coding_system)
10271      Lisp_Object alias, coding_system;
10272 {
10273   Lisp_Object spec, aliases, eol_type, val;
10274
10275   CHECK_SYMBOL (alias);
10276   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10277   aliases = AREF (spec, 1);
10278   /* ALIASES should be a list of length more than zero, and the first
10279      element is a base coding system.  Append ALIAS at the tail of the
10280      list.  */
10281   while (!NILP (XCDR (aliases)))
10282     aliases = XCDR (aliases);
10283   XSETCDR (aliases, Fcons (alias, Qnil));
10284
10285   eol_type = AREF (spec, 2);
10286   if (VECTORP (eol_type))
10287     {
10288       Lisp_Object subsidiaries;
10289       int i;
10290
10291       subsidiaries = make_subsidiaries (alias);
10292       for (i = 0; i < 3; i++)
10293         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10294                                      AREF (eol_type, i));
10295     }
10296
10297   Fputhash (alias, spec, Vcoding_system_hash_table);
10298   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10299   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10300   if (NILP (val))
10301     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10302                                   Vcoding_system_alist);
10303
10304   return Qnil;
10305 }
10306
10307 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10308        1, 1, 0,
10309        doc: /* Return the base of CODING-SYSTEM.
10310 Any alias or subsidiary coding system is not a base coding system.  */)
10311   (coding_system)
10312      Lisp_Object coding_system;
10313 {
10314   Lisp_Object spec, attrs;
10315
10316   if (NILP (coding_system))
10317     return (Qno_conversion);
10318   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10319   attrs = AREF (spec, 0);
10320   return CODING_ATTR_BASE_NAME (attrs);
10321 }
10322
10323 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10324        1, 1, 0,
10325        doc: "Return the property list of CODING-SYSTEM.")
10326      (coding_system)
10327      Lisp_Object coding_system;
10328 {
10329   Lisp_Object spec, attrs;
10330
10331   if (NILP (coding_system))
10332     coding_system = Qno_conversion;
10333   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10334   attrs = AREF (spec, 0);
10335   return CODING_ATTR_PLIST (attrs);
10336 }
10337
10338
10339 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10340        1, 1, 0,
10341        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10342      (coding_system)
10343      Lisp_Object coding_system;
10344 {
10345   Lisp_Object spec;
10346
10347   if (NILP (coding_system))
10348     coding_system = Qno_conversion;
10349   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10350   return AREF (spec, 1);
10351 }
10352
10353 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10354        Scoding_system_eol_type, 1, 1, 0,
10355        doc: /* Return eol-type of CODING-SYSTEM.
10356 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10357
10358 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10359 and CR respectively.
10360
10361 A vector value indicates that a format of end-of-line should be
10362 detected automatically.  Nth element of the vector is the subsidiary
10363 coding system whose eol-type is N.  */)
10364      (coding_system)
10365      Lisp_Object coding_system;
10366 {
10367   Lisp_Object spec, eol_type;
10368   int n;
10369
10370   if (NILP (coding_system))
10371     coding_system = Qno_conversion;
10372   if (! CODING_SYSTEM_P (coding_system))
10373     return Qnil;
10374   spec = CODING_SYSTEM_SPEC (coding_system);
10375   eol_type = AREF (spec, 2);
10376   if (VECTORP (eol_type))
10377     return Fcopy_sequence (eol_type);
10378   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10379   return make_number (n);
10380 }
10381
10382 #endif /* emacs */
10383
10384 \f
10385 /*** 12. Postamble ***/
10386
10387 void
10388 init_coding_once ()
10389 {
10390   int i;
10391
10392   for (i = 0; i < coding_category_max; i++)
10393     {
10394       coding_categories[i].id = -1;
10395       coding_priorities[i] = i;
10396     }
10397
10398   /* ISO2022 specific initialize routine.  */
10399   for (i = 0; i < 0x20; i++)
10400     iso_code_class[i] = ISO_control_0;
10401   for (i = 0x21; i < 0x7F; i++)
10402     iso_code_class[i] = ISO_graphic_plane_0;
10403   for (i = 0x80; i < 0xA0; i++)
10404     iso_code_class[i] = ISO_control_1;
10405   for (i = 0xA1; i < 0xFF; i++)
10406     iso_code_class[i] = ISO_graphic_plane_1;
10407   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10408   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10409   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10410   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10411   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10412   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10413   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10414   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10415   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10416
10417   for (i = 0; i < 256; i++)
10418     {
10419       emacs_mule_bytes[i] = 1;
10420     }
10421   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10422   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10423   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10424   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10425 }
10426
10427 #ifdef emacs
10428
10429 void
10430 syms_of_coding ()
10431 {
10432   staticpro (&Vcoding_system_hash_table);
10433   {
10434     Lisp_Object args[2];
10435     args[0] = QCtest;
10436     args[1] = Qeq;
10437     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10438   }
10439
10440   staticpro (&Vsjis_coding_system);
10441   Vsjis_coding_system = Qnil;
10442
10443   staticpro (&Vbig5_coding_system);
10444   Vbig5_coding_system = Qnil;
10445
10446   staticpro (&Vcode_conversion_reused_workbuf);
10447   Vcode_conversion_reused_workbuf = Qnil;
10448
10449   staticpro (&Vcode_conversion_workbuf_name);
10450   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10451
10452   reused_workbuf_in_use = 0;
10453
10454   DEFSYM (Qcharset, "charset");
10455   DEFSYM (Qtarget_idx, "target-idx");
10456   DEFSYM (Qcoding_system_history, "coding-system-history");
10457   Fset (Qcoding_system_history, Qnil);
10458
10459   /* Target FILENAME is the first argument.  */
10460   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10461   /* Target FILENAME is the third argument.  */
10462   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10463
10464   DEFSYM (Qcall_process, "call-process");
10465   /* Target PROGRAM is the first argument.  */
10466   Fput (Qcall_process, Qtarget_idx, make_number (0));
10467
10468   DEFSYM (Qcall_process_region, "call-process-region");
10469   /* Target PROGRAM is the third argument.  */
10470   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10471
10472   DEFSYM (Qstart_process, "start-process");
10473   /* Target PROGRAM is the third argument.  */
10474   Fput (Qstart_process, Qtarget_idx, make_number (2));
10475
10476   DEFSYM (Qopen_network_stream, "open-network-stream");
10477   /* Target SERVICE is the fourth argument.  */
10478   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10479
10480   DEFSYM (Qcoding_system, "coding-system");
10481   DEFSYM (Qcoding_aliases, "coding-aliases");
10482
10483   DEFSYM (Qeol_type, "eol-type");
10484   DEFSYM (Qunix, "unix");
10485   DEFSYM (Qdos, "dos");
10486
10487   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10488   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10489   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10490   DEFSYM (Qdefault_char, "default-char");
10491   DEFSYM (Qundecided, "undecided");
10492   DEFSYM (Qno_conversion, "no-conversion");
10493   DEFSYM (Qraw_text, "raw-text");
10494
10495   DEFSYM (Qiso_2022, "iso-2022");
10496
10497   DEFSYM (Qutf_8, "utf-8");
10498   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10499
10500   DEFSYM (Qutf_16, "utf-16");
10501   DEFSYM (Qbig, "big");
10502   DEFSYM (Qlittle, "little");
10503
10504   DEFSYM (Qshift_jis, "shift-jis");
10505   DEFSYM (Qbig5, "big5");
10506
10507   DEFSYM (Qcoding_system_p, "coding-system-p");
10508
10509   DEFSYM (Qcoding_system_error, "coding-system-error");
10510   Fput (Qcoding_system_error, Qerror_conditions,
10511         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10512   Fput (Qcoding_system_error, Qerror_message,
10513         make_pure_c_string ("Invalid coding system"));
10514
10515   /* Intern this now in case it isn't already done.
10516      Setting this variable twice is harmless.
10517      But don't staticpro it here--that is done in alloc.c.  */
10518   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10519
10520   DEFSYM (Qtranslation_table, "translation-table");
10521   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10522   DEFSYM (Qtranslation_table_id, "translation-table-id");
10523   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10524   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10525
10526   DEFSYM (Qvalid_codes, "valid-codes");
10527
10528   DEFSYM (Qemacs_mule, "emacs-mule");
10529
10530   DEFSYM (QCcategory, ":category");
10531   DEFSYM (QCmnemonic, ":mnemonic");
10532   DEFSYM (QCdefault_char, ":default-char");
10533   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10534   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10535   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10536   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10537   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10538
10539   Vcoding_category_table
10540     = Fmake_vector (make_number (coding_category_max), Qnil);
10541   staticpro (&Vcoding_category_table);
10542   /* Followings are target of code detection.  */
10543   ASET (Vcoding_category_table, coding_category_iso_7,
10544         intern_c_string ("coding-category-iso-7"));
10545   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10546         intern_c_string ("coding-category-iso-7-tight"));
10547   ASET (Vcoding_category_table, coding_category_iso_8_1,
10548         intern_c_string ("coding-category-iso-8-1"));
10549   ASET (Vcoding_category_table, coding_category_iso_8_2,
10550         intern_c_string ("coding-category-iso-8-2"));
10551   ASET (Vcoding_category_table, coding_category_iso_7_else,
10552         intern_c_string ("coding-category-iso-7-else"));
10553   ASET (Vcoding_category_table, coding_category_iso_8_else,
10554         intern_c_string ("coding-category-iso-8-else"));
10555   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10556         intern_c_string ("coding-category-utf-8-auto"));
10557   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10558         intern_c_string ("coding-category-utf-8"));
10559   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10560         intern_c_string ("coding-category-utf-8-sig"));
10561   ASET (Vcoding_category_table, coding_category_utf_16_be,
10562         intern_c_string ("coding-category-utf-16-be"));
10563   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10564         intern_c_string ("coding-category-utf-16-auto"));
10565   ASET (Vcoding_category_table, coding_category_utf_16_le,
10566         intern_c_string ("coding-category-utf-16-le"));
10567   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10568         intern_c_string ("coding-category-utf-16-be-nosig"));
10569   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10570         intern_c_string ("coding-category-utf-16-le-nosig"));
10571   ASET (Vcoding_category_table, coding_category_charset,
10572         intern_c_string ("coding-category-charset"));
10573   ASET (Vcoding_category_table, coding_category_sjis,
10574         intern_c_string ("coding-category-sjis"));
10575   ASET (Vcoding_category_table, coding_category_big5,
10576         intern_c_string ("coding-category-big5"));
10577   ASET (Vcoding_category_table, coding_category_ccl,
10578         intern_c_string ("coding-category-ccl"));
10579   ASET (Vcoding_category_table, coding_category_emacs_mule,
10580         intern_c_string ("coding-category-emacs-mule"));
10581   /* Followings are NOT target of code detection.  */
10582   ASET (Vcoding_category_table, coding_category_raw_text,
10583         intern_c_string ("coding-category-raw-text"));
10584   ASET (Vcoding_category_table, coding_category_undecided,
10585         intern_c_string ("coding-category-undecided"));
10586
10587   DEFSYM (Qinsufficient_source, "insufficient-source");
10588   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10589   DEFSYM (Qinvalid_source, "invalid-source");
10590   DEFSYM (Qinterrupted, "interrupted");
10591   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10592   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10593
10594   defsubr (&Scoding_system_p);
10595   defsubr (&Sread_coding_system);
10596   defsubr (&Sread_non_nil_coding_system);
10597   defsubr (&Scheck_coding_system);
10598   defsubr (&Sdetect_coding_region);
10599   defsubr (&Sdetect_coding_string);
10600   defsubr (&Sfind_coding_systems_region_internal);
10601   defsubr (&Sunencodable_char_position);
10602   defsubr (&Scheck_coding_systems_region);
10603   defsubr (&Sdecode_coding_region);
10604   defsubr (&Sencode_coding_region);
10605   defsubr (&Sdecode_coding_string);
10606   defsubr (&Sencode_coding_string);
10607   defsubr (&Sdecode_sjis_char);
10608   defsubr (&Sencode_sjis_char);
10609   defsubr (&Sdecode_big5_char);
10610   defsubr (&Sencode_big5_char);
10611   defsubr (&Sset_terminal_coding_system_internal);
10612   defsubr (&Sset_safe_terminal_coding_system_internal);
10613   defsubr (&Sterminal_coding_system);
10614   defsubr (&Sset_keyboard_coding_system_internal);
10615   defsubr (&Skeyboard_coding_system);
10616   defsubr (&Sfind_operation_coding_system);
10617   defsubr (&Sset_coding_system_priority);
10618   defsubr (&Sdefine_coding_system_internal);
10619   defsubr (&Sdefine_coding_system_alias);
10620   defsubr (&Scoding_system_put);
10621   defsubr (&Scoding_system_base);
10622   defsubr (&Scoding_system_plist);
10623   defsubr (&Scoding_system_aliases);
10624   defsubr (&Scoding_system_eol_type);
10625   defsubr (&Scoding_system_priority_list);
10626
10627   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10628                doc: /* List of coding systems.
10629
10630 Do not alter the value of this variable manually.  This variable should be
10631 updated by the functions `define-coding-system' and
10632 `define-coding-system-alias'.  */);
10633   Vcoding_system_list = Qnil;
10634
10635   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10636                doc: /* Alist of coding system names.
10637 Each element is one element list of coding system name.
10638 This variable is given to `completing-read' as COLLECTION argument.
10639
10640 Do not alter the value of this variable manually.  This variable should be
10641 updated by the functions `make-coding-system' and
10642 `define-coding-system-alias'.  */);
10643   Vcoding_system_alist = Qnil;
10644
10645   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10646                doc: /* List of coding-categories (symbols) ordered by priority.
10647
10648 On detecting a coding system, Emacs tries code detection algorithms
10649 associated with each coding-category one by one in this order.  When
10650 one algorithm agrees with a byte sequence of source text, the coding
10651 system bound to the corresponding coding-category is selected.
10652
10653 Don't modify this variable directly, but use `set-coding-priority'.  */);
10654   {
10655     int i;
10656
10657     Vcoding_category_list = Qnil;
10658     for (i = coding_category_max - 1; i >= 0; i--)
10659       Vcoding_category_list
10660         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10661                  Vcoding_category_list);
10662   }
10663
10664   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10665                doc: /* Specify the coding system for read operations.
10666 It is useful to bind this variable with `let', but do not set it globally.
10667 If the value is a coding system, it is used for decoding on read operation.
10668 If not, an appropriate element is used from one of the coding system alists.
10669 There are three such tables: `file-coding-system-alist',
10670 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10671   Vcoding_system_for_read = Qnil;
10672
10673   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10674                doc: /* Specify the coding system for write operations.
10675 Programs bind this variable with `let', but you should not set it globally.
10676 If the value is a coding system, it is used for encoding of output,
10677 when writing it to a file and when sending it to a file or subprocess.
10678
10679 If this does not specify a coding system, an appropriate element
10680 is used from one of the coding system alists.
10681 There are three such tables: `file-coding-system-alist',
10682 `process-coding-system-alist', and `network-coding-system-alist'.
10683 For output to files, if the above procedure does not specify a coding system,
10684 the value of `buffer-file-coding-system' is used.  */);
10685   Vcoding_system_for_write = Qnil;
10686
10687   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10688                doc: /*
10689 Coding system used in the latest file or process I/O.  */);
10690   Vlast_coding_system_used = Qnil;
10691
10692   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10693                doc: /*
10694 Error status of the last code conversion.
10695
10696 When an error was detected in the last code conversion, this variable
10697 is set to one of the following symbols.
10698   `insufficient-source'
10699   `inconsistent-eol'
10700   `invalid-source'
10701   `interrupted'
10702   `insufficient-memory'
10703 When no error was detected, the value doesn't change.  So, to check
10704 the error status of a code conversion by this variable, you must
10705 explicitly set this variable to nil before performing code
10706 conversion.  */);
10707   Vlast_code_conversion_error = Qnil;
10708
10709   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10710                doc: /*
10711 *Non-nil means always inhibit code conversion of end-of-line format.
10712 See info node `Coding Systems' and info node `Text and Binary' concerning
10713 such conversion.  */);
10714   inhibit_eol_conversion = 0;
10715
10716   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10717                doc: /*
10718 Non-nil means process buffer inherits coding system of process output.
10719 Bind it to t if the process output is to be treated as if it were a file
10720 read from some filesystem.  */);
10721   inherit_process_coding_system = 0;
10722
10723   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10724                doc: /*
10725 Alist to decide a coding system to use for a file I/O operation.
10726 The format is ((PATTERN . VAL) ...),
10727 where PATTERN is a regular expression matching a file name,
10728 VAL is a coding system, a cons of coding systems, or a function symbol.
10729 If VAL is a coding system, it is used for both decoding and encoding
10730 the file contents.
10731 If VAL is a cons of coding systems, the car part is used for decoding,
10732 and the cdr part is used for encoding.
10733 If VAL is a function symbol, the function must return a coding system
10734 or a cons of coding systems which are used as above.  The function is
10735 called with an argument that is a list of the arguments with which
10736 `find-operation-coding-system' was called.  If the function can't decide
10737 a coding system, it can return `undecided' so that the normal
10738 code-detection is performed.
10739
10740 See also the function `find-operation-coding-system'
10741 and the variable `auto-coding-alist'.  */);
10742   Vfile_coding_system_alist = Qnil;
10743
10744   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10745                doc: /*
10746 Alist to decide a coding system to use for a process I/O operation.
10747 The format is ((PATTERN . VAL) ...),
10748 where PATTERN is a regular expression matching a program name,
10749 VAL is a coding system, a cons of coding systems, or a function symbol.
10750 If VAL is a coding system, it is used for both decoding what received
10751 from the program and encoding what sent to the program.
10752 If VAL is a cons of coding systems, the car part is used for decoding,
10753 and the cdr part is used for encoding.
10754 If VAL is a function symbol, the function must return a coding system
10755 or a cons of coding systems which are used as above.
10756
10757 See also the function `find-operation-coding-system'.  */);
10758   Vprocess_coding_system_alist = Qnil;
10759
10760   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10761                doc: /*
10762 Alist to decide a coding system to use for a network I/O operation.
10763 The format is ((PATTERN . VAL) ...),
10764 where PATTERN is a regular expression matching a network service name
10765 or is a port number to connect to,
10766 VAL is a coding system, a cons of coding systems, or a function symbol.
10767 If VAL is a coding system, it is used for both decoding what received
10768 from the network stream and encoding what sent to the network stream.
10769 If VAL is a cons of coding systems, the car part is used for decoding,
10770 and the cdr part is used for encoding.
10771 If VAL is a function symbol, the function must return a coding system
10772 or a cons of coding systems which are used as above.
10773
10774 See also the function `find-operation-coding-system'.  */);
10775   Vnetwork_coding_system_alist = Qnil;
10776
10777   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10778                doc: /* Coding system to use with system messages.
10779 Also used for decoding keyboard input on X Window system.  */);
10780   Vlocale_coding_system = Qnil;
10781
10782   /* The eol mnemonics are reset in startup.el system-dependently.  */
10783   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10784                doc: /*
10785 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10786   eol_mnemonic_unix = make_pure_c_string (":");
10787
10788   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10789                doc: /*
10790 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10791   eol_mnemonic_dos = make_pure_c_string ("\\");
10792
10793   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10794                doc: /*
10795 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10796   eol_mnemonic_mac = make_pure_c_string ("/");
10797
10798   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10799                doc: /*
10800 *String displayed in mode line when end-of-line format is not yet determined.  */);
10801   eol_mnemonic_undecided = make_pure_c_string (":");
10802
10803   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10804                doc: /*
10805 *Non-nil enables character translation while encoding and decoding.  */);
10806   Venable_character_translation = Qt;
10807
10808   DEFVAR_LISP ("standard-translation-table-for-decode",
10809                &Vstandard_translation_table_for_decode,
10810                doc: /* Table for translating characters while decoding.  */);
10811   Vstandard_translation_table_for_decode = Qnil;
10812
10813   DEFVAR_LISP ("standard-translation-table-for-encode",
10814                &Vstandard_translation_table_for_encode,
10815                doc: /* Table for translating characters while encoding.  */);
10816   Vstandard_translation_table_for_encode = Qnil;
10817
10818   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10819                doc: /* Alist of charsets vs revision numbers.
10820 While encoding, if a charset (car part of an element) is found,
10821 designate it with the escape sequence identifying revision (cdr part
10822 of the element).  */);
10823   Vcharset_revision_table = Qnil;
10824
10825   DEFVAR_LISP ("default-process-coding-system",
10826                &Vdefault_process_coding_system,
10827                doc: /* Cons of coding systems used for process I/O by default.
10828 The car part is used for decoding a process output,
10829 the cdr part is used for encoding a text to be sent to a process.  */);
10830   Vdefault_process_coding_system = Qnil;
10831
10832   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10833                doc: /*
10834 Table of extra Latin codes in the range 128..159 (inclusive).
10835 This is a vector of length 256.
10836 If Nth element is non-nil, the existence of code N in a file
10837 \(or output of subprocess) doesn't prevent it to be detected as
10838 a coding system of ISO 2022 variant which has a flag
10839 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10840 or reading output of a subprocess.
10841 Only 128th through 159th elements have a meaning.  */);
10842   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10843
10844   DEFVAR_LISP ("select-safe-coding-system-function",
10845                &Vselect_safe_coding_system_function,
10846                doc: /*
10847 Function to call to select safe coding system for encoding a text.
10848
10849 If set, this function is called to force a user to select a proper
10850 coding system which can encode the text in the case that a default
10851 coding system used in each operation can't encode the text.  The
10852 function should take care that the buffer is not modified while
10853 the coding system is being selected.
10854
10855 The default value is `select-safe-coding-system' (which see).  */);
10856   Vselect_safe_coding_system_function = Qnil;
10857
10858   DEFVAR_BOOL ("coding-system-require-warning",
10859                &coding_system_require_warning,
10860                doc: /* Internal use only.
10861 If non-nil, on writing a file, `select-safe-coding-system-function' is
10862 called even if `coding-system-for-write' is non-nil.  The command
10863 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10864   coding_system_require_warning = 0;
10865
10866
10867   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10868                &inhibit_iso_escape_detection,
10869                doc: /*
10870 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10871
10872 When Emacs reads text, it tries to detect how the text is encoded.
10873 This code detection is sensitive to escape sequences.  If Emacs sees
10874 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10875 of the ISO2022 encodings, and decodes text by the corresponding coding
10876 system (e.g. `iso-2022-7bit').
10877
10878 However, there may be a case that you want to read escape sequences in
10879 a file as is.  In such a case, you can set this variable to non-nil.
10880 Then the code detection will ignore any escape sequences, and no text is
10881 detected as encoded in some ISO-2022 encoding.  The result is that all
10882 escape sequences become visible in a buffer.
10883
10884 The default value is nil, and it is strongly recommended not to change
10885 it.  That is because many Emacs Lisp source files that contain
10886 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10887 in Emacs's distribution, and they won't be decoded correctly on
10888 reading if you suppress escape sequence detection.
10889
10890 The other way to read escape sequences in a file without decoding is
10891 to explicitly specify some coding system that doesn't use ISO-2022
10892 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10893   inhibit_iso_escape_detection = 0;
10894
10895   DEFVAR_BOOL ("inhibit-null-byte-detection",
10896                &inhibit_null_byte_detection,
10897                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10898 By default, Emacs treats it as binary data, and does not attempt to
10899 decode it.  The effect is as if you specified `no-conversion' for
10900 reading that text.
10901
10902 Set this to non-nil when a regular text happens to include null bytes.
10903 Examples are Index nodes of Info files and null-byte delimited output
10904 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10905 decode text as usual.  */);
10906   inhibit_null_byte_detection = 0;
10907
10908   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10909                doc: /* Char table for translating self-inserting characters.
10910 This is applied to the result of input methods, not their input.
10911 See also `keyboard-translate-table'.
10912
10913 Use of this variable for character code unification was rendered
10914 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10915 internal character representation.  */);
10916     Vtranslation_table_for_input = Qnil;
10917
10918   {
10919     Lisp_Object args[coding_arg_max];
10920     Lisp_Object plist[16];
10921     int i;
10922
10923     for (i = 0; i < coding_arg_max; i++)
10924       args[i] = Qnil;
10925
10926     plist[0] = intern_c_string (":name");
10927     plist[1] = args[coding_arg_name] = Qno_conversion;
10928     plist[2] = intern_c_string (":mnemonic");
10929     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10930     plist[4] = intern_c_string (":coding-type");
10931     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10932     plist[6] = intern_c_string (":ascii-compatible-p");
10933     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10934     plist[8] = intern_c_string (":default-char");
10935     plist[9] = args[coding_arg_default_char] = make_number (0);
10936     plist[10] = intern_c_string (":for-unibyte");
10937     plist[11] = args[coding_arg_for_unibyte] = Qt;
10938     plist[12] = intern_c_string (":docstring");
10939     plist[13] = make_pure_c_string ("Do no conversion.\n\
10940 \n\
10941 When you visit a file with this coding, the file is read into a\n\
10942 unibyte buffer as is, thus each byte of a file is treated as a\n\
10943 character.");
10944     plist[14] = intern_c_string (":eol-type");
10945     plist[15] = args[coding_arg_eol_type] = Qunix;
10946     args[coding_arg_plist] = Flist (16, plist);
10947     Fdefine_coding_system_internal (coding_arg_max, args);
10948
10949     plist[1] = args[coding_arg_name] = Qundecided;
10950     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10951     plist[5] = args[coding_arg_coding_type] = Qundecided;
10952     /* This is already set.
10953        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10954     plist[8] = intern_c_string (":charset-list");
10955     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10956     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10957     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10958     plist[15] = args[coding_arg_eol_type] = Qnil;
10959     args[coding_arg_plist] = Flist (16, plist);
10960     Fdefine_coding_system_internal (coding_arg_max, args);
10961   }
10962
10963   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10964
10965   {
10966     int i;
10967
10968     for (i = 0; i < coding_category_max; i++)
10969       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10970   }
10971 #if defined (MSDOS) || defined (WINDOWSNT)
10972   system_eol_type = Qdos;
10973 #else
10974   system_eol_type = Qunix;
10975 #endif
10976   staticpro (&system_eol_type);
10977 }
10978
10979 char *
10980 emacs_strerror (error_number)
10981      int error_number;
10982 {
10983   char *str;
10984
10985   synchronize_system_messages_locale ();
10986   str = strerror (error_number);
10987
10988   if (! NILP (Vlocale_coding_system))
10989     {
10990       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10991                                                       Vlocale_coding_system,
10992                                                       0);
10993       str = (char *) SDATA (dec);
10994     }
10995
10996   return str;
10997 }
10998
10999 #endif /* emacs */
11000
11001 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
11002    (do not change this comment) */