src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
           /* Record positive evidence only.  A byte that merely conforms
              to XXX leaves FOUND unchanged.  */
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
           /* Stop once CHARBUF is full, i.e. the write position has
              reached CHARBUF_END.  */
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
       /* The characters to encode are CODING->charbuf[0] up to (but not
          including) CODING->charbuf[CODING->charbuf_used].  */
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Keep headroom so that one loop iteration cannot write past
          DST_END.  */
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;
 301
     /* Lisp objects referenced by this file.  NOTE(review): the names
        prefixed with Q and QC presumably hold Lisp symbols and keywords
        that are initialized elsewhere in the file -- confirm.  Those
        declared `static' are private to this file; the others have
        external linkage.  */
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 static Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qutf_8;
 310 static Lisp_Object Qiso_2022;
 311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 312 static Lisp_Object Qbig, Qlittle;
 313 static Lisp_Object Qcoding_system_history;
 314 static Lisp_Object Qvalid_codes;
 315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 318 static Lisp_Object QCascii_compatible_p;
 319
 320 Lisp_Object Qcall_process, Qcall_process_region;
 321 Lisp_Object Qstart_process, Qopen_network_stream;
 322 static Lisp_Object Qtarget_idx;
 323
 324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 326
 327 /* If a symbol has this property, evaluate the value to define the
 328    symbol as a coding system.  */
 329 static Lisp_Object Qcoding_system_define_form;
 330
 331 /* Format of end-of-line decided by system.  This is Qunix on
 332    Unix and Mac, Qdos on DOS/Windows.
 333    This has an effect only for external encoding (i.e. for output to
 334    file and process), not for in-buffer or Lisp string encoding.  */
 335 static Lisp_Object system_eol_type;
 336
 337 #ifdef emacs
 338
 339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 340
 341 /* Coding system emacs-mule and raw-text are for converting only
 342    end-of-line format.  */
 343 Lisp_Object Qemacs_mule, Qraw_text;
 344 Lisp_Object Qutf_8_emacs;
 345
 346 /* Coding-systems are handed between Emacs Lisp programs and C internal
 347    routines by the following three variables.
        NOTE(review): only one variable (safe_terminal_coding) is declared
        below, so the count of three looks stale -- confirm.  */
 348 /* Coding system to be used to encode text for terminal display when
 349    terminal coding system is nil.  */
 350 struct coding_system safe_terminal_coding;
 351
 352 #endif /* emacs */
 353
 354 Lisp_Object Qtranslation_table;
 355 Lisp_Object Qtranslation_table_id;
 356 static Lisp_Object Qtranslation_table_for_decode;
 357 static Lisp_Object Qtranslation_table_for_encode;
 358
 359 /* Two special coding systems.  */
 360 static Lisp_Object Vsjis_coding_system;
 361 static Lisp_Object Vbig5_coding_system;
 362
 363 /* ISO2022 section */
 364
     /* Return, as a C integer, element REG of the vector stored in the
        coding_attr_iso_initial attribute of CODING's attribute vector.  */
 365 #define CODING_ISO_INITIAL(coding, reg)                 \
 366   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 367                      coding_attr_iso_initial),          \
 368                reg)))
 369
 370
     /* Return CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID is
        greater than CODING->max_charset_id or the stored value is 255.
        CHARSET_ID is evaluated more than once.  */
 371 #define CODING_ISO_REQUEST(coding, charset_id)          \
 372   (((charset_id) <= (coding)->max_charset_id            \
 373     ? ((coding)->safe_charsets[charset_id] != 255       \
 374        ? (coding)->safe_charsets[charset_id]            \
 375        : -1)                                            \
 376     : -1))
 377
 378
     /* The macros below access members of CODING->spec.iso_2022.  */
 379 #define CODING_ISO_FLAGS(coding)        \
 380   ((coding)->spec.iso_2022.flags)
 381 #define CODING_ISO_DESIGNATION(coding, reg)     \
 382   ((coding)->spec.iso_2022.current_designation[reg])
 383 #define CODING_ISO_INVOCATION(coding, plane)    \
 384   ((coding)->spec.iso_2022.current_invocation[plane])
 385 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 386   ((coding)->spec.iso_2022.single_shifting)
 387 #define CODING_ISO_BOL(coding)  \
 388   ((coding)->spec.iso_2022.bol)
     /* The designation of the register currently invoked to PLANE.  */
 389 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 390   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
     /* Note that this one yields the address of the cmp_status member,
        not its value.  */
 391 #define CODING_ISO_CMP_STATUS(coding)   \
 392   (&(coding)->spec.iso_2022.cmp_status)
 393 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 394   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 395 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 396   ((coding)->spec.iso_2022.embedded_utf_8)
 397
 398 /* Control characters of ISO2022.  */
 399                         /* code */      /* function */
 400 #define ISO_CODE_SO     0x0E            /* shift-out */
 401 #define ISO_CODE_SI     0x0F            /* shift-in */
 402 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 403 #define ISO_CODE_ESC    0x1B            /* escape */
 404 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 405 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 406 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 407
 408 /* Every 1-byte code of ISO2022 is classified into one of the
 409    following classes.  */
 410 enum iso_code_class_type
 411   {
 412     ISO_control_0,              /* Control codes in the range
 413                                    0x00..0x1F and 0x7F, except for the
 414                                    following 4 codes.  */
 415     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 416     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 417     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 418     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 419     ISO_control_1,              /* Control codes in the range
 420                                    0x80..0x9F, except for the
 421                                    following 3 codes.  */
 422     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 423     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 424     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
         /* NOTE(review): 0x7F is claimed both by ISO_control_0 above and
            by ISO_0x20_or_0x7F below; check which class the
            classification table actually assigns to it.  */
 425     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 426     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 427     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 428     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 429   };
 430
 431 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 432     `iso-flags' attribute of an iso2022 coding system.  */
 433
 434 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 435    instead of the correct short-form sequence (e.g. ESC $ A).  */
 436 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 437
 438 /* If set, reset graphic planes and registers at end-of-line to the
 439    initial state.  */
 440 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 441
 442 /* If set, reset graphic planes and registers before any control
 443    characters to the initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 445
 446 /* If set, encode by 7-bit environment.  */
 447 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 448
 449 /* If set, use locking-shift function.  */
 450 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 451
 452 /* If set, use single-shift function.  Overrides
 453    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 454 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 455
 456 /* If set, use designation escape sequence.  */
 457 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 458
 459 /* If set, produce revision number sequence.  */
 460 #define CODING_ISO_FLAG_REVISION        0x0080
 461
 462 /* If set, produce ISO6429's direction specifying sequence.  */
 463 #define CODING_ISO_FLAG_DIRECTION       0x0100
 464
 465 /* If set, assume designation states are reset at beginning of line on
 466    output.  */
 467 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 468
 469 /* If set, designation sequence should be placed at beginning of line
 470    on output.  */
 471 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 472
 473 /* If set, do not encode unsafe characters on output.  */
 474 #define CODING_ISO_FLAG_SAFE            0x0800
 475
 476 /* If set, extra latin codes (128..159) are accepted as a valid code
 477    on input.  */
 478 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 479
     /* NOTE(review): the remaining flags carry no description here; their
        meaning has to be taken from the code that tests them.  Bit 0x4000
        is reserved by the commented-out definition below, and the bits
        between 0x10000 and 0x100000 are not defined in this list.  */
 480 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 481
 482 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 483
 484 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 485
 486 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 487
 488 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 489
 490 /* A character to be produced on output if encoding of the original
 491    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 492 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 493
 494 /* UTF-8 section */
     /* Access the utf_8_bom member of CODING->spec.  */
 495 #define CODING_UTF_8_BOM(coding)        \
 496   ((coding)->spec.utf_8_bom)
 497
 498 /* UTF-16 section */
     /* The three macros below access the bom, endian and surrogate
        members of CODING->spec.utf_16.  */
 499 #define CODING_UTF_16_BOM(coding)       \
 500   ((coding)->spec.utf_16.bom)
 501
 502 #define CODING_UTF_16_ENDIAN(coding)    \
 503   ((coding)->spec.utf_16.endian)
 504
 505 #define CODING_UTF_16_SURROGATE(coding) \
 506   ((coding)->spec.utf_16.surrogate)
 507
 508
 509 /* CCL section */
     /* The decoder, encoder and valids attributes are fetched from the
        attribute vector of CODING.  CODING_CCL_VALIDS applies SDATA to the
        attribute, so it yields the data of a Lisp string.  */
 510 #define CODING_CCL_DECODER(coding)      \
 511   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 512 #define CODING_CCL_ENCODER(coding)      \
 513   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 514 #define CODING_CCL_VALIDS(coding)                                          \
 515   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 516
 517 /* Index for each coding category in `coding_categories'.  Each
        enumerator is also used as a bit position by the corresponding
        CATEGORY_MASK_XXX macro (1 << coding_category_XXX), so the number
        of enumerators must stay below the bit width of int.
        coding_category_max is the number of categories; it sizes the
        arrays coding_priorities and coding_categories.  */
 518
 519 enum coding_category
 520   {
 521     coding_category_iso_7,
 522     coding_category_iso_7_tight,
 523     coding_category_iso_8_1,
 524     coding_category_iso_8_2,
 525     coding_category_iso_7_else,
 526     coding_category_iso_8_else,
 527     coding_category_utf_8_auto,
 528     coding_category_utf_8_nosig,
 529     coding_category_utf_8_sig,
 530     coding_category_utf_16_auto,
 531     coding_category_utf_16_be,
 532     coding_category_utf_16_le,
 533     coding_category_utf_16_be_nosig,
 534     coding_category_utf_16_le_nosig,
 535     coding_category_charset,
 536     coding_category_sjis,
 537     coding_category_big5,
 538     coding_category_ccl,
 539     coding_category_emacs_mule,
 540     /* All above are targets of code detection.  */
 541     coding_category_raw_text,
 542     coding_category_undecided,
 543     coding_category_max
 544   };
 545
 546 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        the single bit whose position is the matching coding_category
        enumerator.  */
 547 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 548 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 549 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 550 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 551 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 552 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 553 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 554 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 555 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 556 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 557 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 558 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 559 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 560 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 561 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 562 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 563 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 564 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 565 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 566 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 567
 568 /* This value is returned if detect_coding_mask () finds nothing other
 569    than ASCII characters.  It is the union of every mask defined above
        except CATEGORY_MASK_RAW_TEXT.  */
 570 #define CATEGORY_MASK_ANY               \
 571   (CATEGORY_MASK_ISO_7                  \
 572    | CATEGORY_MASK_ISO_7_TIGHT          \
 573    | CATEGORY_MASK_ISO_8_1              \
 574    | CATEGORY_MASK_ISO_8_2              \
 575    | CATEGORY_MASK_ISO_7_ELSE           \
 576    | CATEGORY_MASK_ISO_8_ELSE           \
 577    | CATEGORY_MASK_UTF_8_AUTO           \
 578    | CATEGORY_MASK_UTF_8_NOSIG          \
 579    | CATEGORY_MASK_UTF_8_SIG            \
 580    | CATEGORY_MASK_UTF_16_AUTO          \
 581    | CATEGORY_MASK_UTF_16_BE            \
 582    | CATEGORY_MASK_UTF_16_LE            \
 583    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 584    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 585    | CATEGORY_MASK_CHARSET              \
 586    | CATEGORY_MASK_SJIS                 \
 587    | CATEGORY_MASK_BIG5                 \
 588    | CATEGORY_MASK_CCL                  \
 589    | CATEGORY_MASK_EMACS_MULE)
 590
 591
     /* Group masks: each one is the union of the individual category
        masks listed in its definition.  */
 592 #define CATEGORY_MASK_ISO_7BIT \
 593   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 594
 595 #define CATEGORY_MASK_ISO_8BIT \
 596   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 597
 598 #define CATEGORY_MASK_ISO_ELSE \
 599   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 600
     /* All ISO categories except the two 8-bit ones (ISO_8_1, ISO_8_2).  */
 601 #define CATEGORY_MASK_ISO_ESCAPE        \
 602   (CATEGORY_MASK_ISO_7                  \
 603    | CATEGORY_MASK_ISO_7_TIGHT          \
 604    | CATEGORY_MASK_ISO_7_ELSE           \
 605    | CATEGORY_MASK_ISO_8_ELSE)
 606
     /* All six ISO categories.  */
 607 #define CATEGORY_MASK_ISO       \
 608   (  CATEGORY_MASK_ISO_7BIT     \
 609      | CATEGORY_MASK_ISO_8BIT   \
 610      | CATEGORY_MASK_ISO_ELSE)
 611
     /* All five UTF-16 categories.  */
 612 #define CATEGORY_MASK_UTF_16            \
 613   (CATEGORY_MASK_UTF_16_AUTO            \
 614    | CATEGORY_MASK_UTF_16_BE            \
 615    | CATEGORY_MASK_UTF_16_LE            \
 616    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 617    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 618
     /* All three UTF-8 categories.  */
 619 #define CATEGORY_MASK_UTF_8     \
 620   (CATEGORY_MASK_UTF_8_AUTO     \
 621    | CATEGORY_MASK_UTF_8_NOSIG  \
 622    | CATEGORY_MASK_UTF_8_SIG)
 623
 624 /* Table of coding categories (Lisp symbols).  This variable is for
 625    internal use only.  */
 626 static Lisp_Object Vcoding_category_table;
 627
 628 /* Table of coding-categories ordered by priority.  It has one slot
        per category (coding_category_max entries).  */
 629 static enum coding_category coding_priorities[coding_category_max];
 630
 631 /* Nth element is a coding context for the coding system bound to the
 632    Nth coding category.  The array is indexed by enum coding_category.  */
 633 static struct coding_system coding_categories[coding_category_max];
 634
 635 /*** Commonly used macros and functions ***/
 636
     /* Fallback definitions of min and max, used only when no such macro
        is already defined.  Both evaluate one of their arguments twice, so
        do not pass expressions with side effects.  */
 637 #ifndef min
 638 #define min(a, b) ((a) < (b) ? (a) : (b))
 639 #endif
 640 #ifndef max
 641 #define max(a, b) ((a) > (b) ? (a) : (b))
 642 #endif
 643
     /* Set ATTRS to the attribute vector of CODING (looked up by
        CODING->id) and CHARSET_LIST to the charset list stored in those
        attributes.  ATTRS and CHARSET_LIST must be assignable lvalues.  */
 644 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 645   do {                                                  \
 646     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 647     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 648   } while (0)
 649
 650
 651 /* Get one byte from the source text pointed by SRC which ends
 652    at SRC_END, and set C to that byte.  If there are not enough bytes
 653    in the source, it jumps to 'no_more_source'; before jumping it
        records CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC.
        If MULTIBYTEP and the byte has its high bit set, then:
          - when the byte is 0xC0 or 0xC1, it is combined with the next
            byte into a single value stored in C;
          - otherwise the multibyte character at SRC is read with
 654        string_char, C is set to the
 655        negative value of the character code, and
            CODING_RESULT_INVALID_SRC is recorded.
        In every non-jumping case CONSUMED_CHARS is incremented.
        The caller should declare
 656    and set these variables appropriately in advance:
 657         src, src_end, src_base, multibytep, consumed_chars, coding
        and must provide the label no_more_source.
        NOTE(review): in the 0xC0/0xC1 case the second byte is read
        without comparing SRC against SRC_END -- confirm that callers
        guarantee a complete sequence.  */
 658
 659 #define ONE_MORE_BYTE(c)                                \
 660   do {                                                  \
 661     if (src == src_end)                                 \
 662       {                                                 \
 663         if (src_base < src)                             \
 664           record_conversion_result                      \
 665             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 666         goto no_more_source;                            \
 667       }                                                 \
 668     c = *src++;                                         \
 669     if (multibytep && (c & 0x80))                       \
 670       {                                                 \
 671         if ((c & 0xFE) == 0xC0)                         \
 672           c = ((c & 1) << 6) | *src++;                  \
 673         else                                            \
 674           {                                             \
 675             src--;                                      \
 676             c = - string_char (src, &src, NULL);        \
 677             record_conversion_result                    \
 678               (coding, CODING_RESULT_INVALID_SRC);      \
 679           }                                             \
 680       }                                                 \
 681     consumed_chars++;                                   \
 682   } while (0)
 683
 684 /* Get two bytes from the source text pointed by SRC which ends
 685    at SRC_END, and set C1 and C2 to those bytes while skipping the
 686    heading multibyte characters.  If there are not enough bytes in the
 687    source, it jumps to 'no_more_source'.  If MULTIBYTEP, a byte pair
        whose first byte is 0xC0 or 0xC1 is combined into a single value.
        Any other multibyte character found while reading C1 is skipped
        (SRC is advanced by BYTES_BY_CHAR_HEAD) and reading restarts.  If
 688    such a multibyte character is found for C2, C2 is set to -1
 689    and SRC is not advanced past its remaining bytes.  Unlike
        ONE_MORE_BYTE, this macro does not record a conversion result and
        does not count consumed characters.  The caller should declare and
        set these
 690    variables appropriately in advance:
 691         src, src_end, multibytep
        and must provide the label no_more_source.
 692    It is intended that this macro is used in detect_coding_utf_16.
        NOTE(review): as in ONE_MORE_BYTE, the second byte of a 0xC0/0xC1
        pair is read without checking SRC against SRC_END -- confirm.  */
 693
 694 #define TWO_MORE_BYTES(c1, c2)                          \
 695   do {                                                  \
 696     do {                                                \
 697       if (src == src_end)                               \
 698         goto no_more_source;                            \
 699       c1 = *src++;                                      \
 700       if (multibytep && (c1 & 0x80))                    \
 701         {                                               \
 702           if ((c1 & 0xFE) == 0xC0)                      \
 703             c1 = ((c1 & 1) << 6) | *src++;              \
 704           else                                          \
 705             {                                           \
 706               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 707               c1 = -1;                                  \
 708             }                                           \
 709         }                                               \
 710     } while (c1 < 0);                                   \
 711     if (src == src_end)                                 \
 712       goto no_more_source;                              \
 713     c2 = *src++;                                        \
 714     if (multibytep && (c2 & 0x80))                      \
 715       {                                                 \
 716         if ((c2 & 0xFE) == 0xC0)                        \
 717           c2 = ((c2 & 1) << 6) | *src++;                \
 718         else                                            \
 719           c2 = -1;                                      \
 720       }                                                 \
 721   } while (0)
 722
 723
 724 /* Store a byte C in the place pointed by DST and increment DST to the
 725    next free point, and increment PRODUCED_CHARS.  The caller should
 726    assure that C is 0..127, and declare and set the variables `dst'
 727    and `produced_chars' appropriately in advance.  No check is made
        against the end of the destination.
 728 */
 729
 730
 731 #define EMIT_ONE_ASCII_BYTE(c)  \
 732   do {                          \
 733     produced_chars++;           \
 734     *dst++ = (c);               \
 735   } while (0)
 736
 737
 738 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.
        PRODUCED_CHARS is increased by two.  */
 739
 740 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 741   do {                                  \
 742     produced_chars += 2;                \
 743     *dst++ = (c1), *dst++ = (c2);       \
 744   } while (0)
 745
 746
 747 /* Store a byte C in the place pointed by DST and increment DST to the
 748    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 749    store in an appropriate multibyte form: a value of 0x80 or more is
        first converted with BYTE8_TO_CHAR and then written with
        CHAR_STRING_ADVANCE, so DST may advance by more than one byte.
        The caller should
 750    declare and set the variables `dst', `multibytep' and
        `produced_chars' appropriately
 751    in advance.  */
 752
 753 #define EMIT_ONE_BYTE(c)                \
 754   do {                                  \
 755     produced_chars++;                   \
 756     if (multibytep)                     \
 757       {                                 \
 758         unsigned ch = (c);              \
 759         if (ch >= 0x80)                 \
 760           ch = BYTE8_TO_CHAR (ch);      \
 761         CHAR_STRING_ADVANCE (ch, dst);  \
 762       }                                 \
 763     else                                \
 764       *dst++ = (c);                     \
 765   } while (0)
 766
 767
 768 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 769
 770 #define EMIT_TWO_BYTES(c1, c2)          \
 771   do {                                  \
 772     produced_chars += 2;                \
 773     if (multibytep)                     \
 774       {                                 \
 775         unsigned ch;                    \
 776                                         \
 777         ch = (c1);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781         ch = (c2);                      \
 782         if (ch >= 0x80)                 \
 783           ch = BYTE8_TO_CHAR (ch);      \
 784         CHAR_STRING_ADVANCE (ch, dst);  \
 785       }                                 \
 786     else                                \
 787       {                                 \
 788         *dst++ = (c1);                  \
 789         *dst++ = (c2);                  \
 790       }                                 \
 791   } while (0)
 792
 793
 794 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 795   do {                                  \
 796     EMIT_ONE_BYTE (c1);                 \
 797     EMIT_TWO_BYTES (c2, c3);            \
 798   } while (0)
 799
 800
 801 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 802   do {                                          \
 803     EMIT_TWO_BYTES (c1, c2);                    \
 804     EMIT_TWO_BYTES (c3, c4);                    \
 805   } while (0)
 806
 807
 808 static void
 809 record_conversion_result (struct coding_system *coding,
 810                           enum coding_result_code result)
 811 {
 812   coding->result = result;
 813   switch (result)
 814     {
 815     case CODING_RESULT_INSUFFICIENT_SRC:
 816       Vlast_code_conversion_error = Qinsufficient_source;
 817       break;
 818     case CODING_RESULT_INCONSISTENT_EOL:
 819       Vlast_code_conversion_error = Qinconsistent_eol;
 820       break;
 821     case CODING_RESULT_INVALID_SRC:
 822       Vlast_code_conversion_error = Qinvalid_source;
 823       break;
 824     case CODING_RESULT_INTERRUPT:
 825       Vlast_code_conversion_error = Qinterrupted;
 826       break;
 827     case CODING_RESULT_INSUFFICIENT_MEM:
 828       Vlast_code_conversion_error = Qinsufficient_memory;
 829       break;
 830     case CODING_RESULT_INSUFFICIENT_DST:
 831       /* Don't record this error in Vlast_code_conversion_error
 832          because it happens just temporarily and is resolved when the
 833          whole conversion is finished.  */
 834       break;
 835     case CODING_RESULT_SUCCESS:
 836       break;
 837     default:
 838       Vlast_code_conversion_error = intern ("Unknown error");
 839     }
 840 }
 841
 842 /* These wrapper macros are used to preserve validity of pointers into
 843    buffer text across calls to decode_char, encode_char, etc, which
 844    could cause relocation of buffers if it loads a charset map,
 845    because loading a charset map allocates large structures.  */
 846
 847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 848   do {                                                                       \
 849     ptrdiff_t offset;                                                        \
 850                                                                              \
 851     charset_map_loaded = 0;                                                  \
 852     c = DECODE_CHAR (charset, code);                                         \
 853     if (charset_map_loaded                                                   \
 854         && (offset = coding_change_source (coding)))                         \
 855       {                                                                      \
 856         src += offset;                                                       \
 857         src_base += offset;                                                  \
 858         src_end += offset;                                                   \
 859       }                                                                      \
 860   } while (0)
 861
 862 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 863   do {                                                                  \
 864     ptrdiff_t offset;                                                   \
 865                                                                         \
 866     charset_map_loaded = 0;                                             \
 867     code = ENCODE_CHAR (charset, c);                                    \
 868     if (charset_map_loaded                                              \
 869         && (offset = coding_change_destination (coding)))               \
 870       {                                                                 \
 871         dst += offset;                                                  \
 872         dst_end += offset;                                              \
 873       }                                                                 \
 874   } while (0)
 875
 876 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 877   do {                                                                  \
 878     ptrdiff_t offset;                                                   \
 879                                                                         \
 880     charset_map_loaded = 0;                                             \
 881     charset = char_charset (c, charset_list, code_return);              \
 882     if (charset_map_loaded                                              \
 883         && (offset = coding_change_destination (coding)))               \
 884       {                                                                 \
 885         dst += offset;                                                  \
 886         dst_end += offset;                                              \
 887       }                                                                 \
 888   } while (0)
 889
 890 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 891   do {                                                                  \
 892     ptrdiff_t offset;                                                   \
 893                                                                         \
 894     charset_map_loaded = 0;                                             \
 895     result = CHAR_CHARSET_P (c, charset);                               \
 896     if (charset_map_loaded                                              \
 897         && (offset = coding_change_destination (coding)))               \
 898       {                                                                 \
 899         dst += offset;                                                  \
 900         dst_end += offset;                                              \
 901       }                                                                 \
 902   } while (0)
 903
 904
 905 /* If there are at least BYTES length of room at dst, allocate memory
 906    for coding->destination and update dst and dst_end.  We don't have
 907    to take care of coding->source which will be relocated.  It is
 908    handled by calling coding_set_source in encode_coding.  */
 909
 910 #define ASSURE_DESTINATION(bytes)                               \
 911   do {                                                          \
 912     if (dst + (bytes) >= dst_end)                               \
 913       {                                                         \
 914         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 915                                                                 \
 916         dst = alloc_destination (coding, more_bytes, dst);      \
 917         dst_end = coding->destination + coding->dst_bytes;      \
 918       }                                                         \
 919   } while (0)
 920
 921
 922 /* Store multibyte form of the character C in P, and advance P to the
 923    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 924    never calls MAYBE_UNIFY_CHAR.  */
 925
 926 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 927   do {                                          \
 928     if ((c) <= MAX_1_BYTE_CHAR)                 \
 929       *(p)++ = (c);                             \
 930     else if ((c) <= MAX_2_BYTE_CHAR)            \
 931       *(p)++ = (0xC0 | ((c) >> 6)),             \
 932         *(p)++ = (0x80 | ((c) & 0x3F));         \
 933     else if ((c) <= MAX_3_BYTE_CHAR)            \
 934       *(p)++ = (0xE0 | ((c) >> 12)),            \
 935         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 936         *(p)++ = (0x80 | ((c) & 0x3F));         \
 937     else if ((c) <= MAX_4_BYTE_CHAR)            \
 938       *(p)++ = (0xF0 | (c >> 18)),              \
 939         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 940         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 941         *(p)++ = (0x80 | (c & 0x3F));           \
 942     else if ((c) <= MAX_5_BYTE_CHAR)            \
 943       *(p)++ = 0xF8,                            \
 944         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 945         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 946         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 947         *(p)++ = (0x80 | (c & 0x3F));           \
 948     else                                        \
 949       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
 950   } while (0)
 951
 952
/* Return the character code of the character whose multibyte form
   starts at P, and advance P to the end of that form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length is chosen from the lead byte alone: a clear high bit
   means one byte; otherwise the first clear bit among 0x20, 0x10 and
   0x08 selects a 2-, 3- or 4-byte form, and a lead byte with all
   three set is read as a 5-byte form.  A 2-byte form whose lead byte
   is below 0xC2 additionally has 0x3FFF80 OR-ed into the result.  In
   the 5-byte case the lead byte contributes no bits to the result.

   P is evaluated several times and is modified, so it must be an
   lvalue without side effects.  The trailing bytes are not validated
   here.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (!((p)[0] & 0x80)                                             \
   ? *(p)++                                                     \
   : ! ((p)[0] & 0x20)                                          \
   ? ((p) += 2,                                                 \
      ((((p)[-2] & 0x1F) << 6)                                  \
       | ((p)[-1] & 0x3F)                                       \
       | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
   : ! ((p)[0] & 0x10)                                          \
   ? ((p) += 3,                                                 \
      ((((p)[-3] & 0x0F) << 12)                                 \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ! ((p)[0] & 0x08)                                          \
   ? ((p) += 4,                                                 \
      ((((p)[-4] & 0xF) << 18)                                  \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p) += 5,                                                 \
      ((((p)[-4] & 0x3F) << 18)                                 \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-2] & 0x3F) << 6)                                \
       | ((p)[-1] & 0x3F))))
 981
 982
 983 /* Set coding->source from coding->src_object.  */
 984
 985 static void
 986 coding_set_source (struct coding_system *coding)
 987 {
 988   if (BUFFERP (coding->src_object))
 989     {
 990       struct buffer *buf = XBUFFER (coding->src_object);
 991
 992       if (coding->src_pos < 0)
 993         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 994       else
 995         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 996     }
 997   else if (STRINGP (coding->src_object))
 998     {
 999       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1000     }
1001   else
1002     {
1003       /* Otherwise, the source is C string and is never relocated
1004          automatically.  Thus we don't have to update anything.  */
1005     }
1006 }
1007
1008
1009 /* Set coding->source from coding->src_object, and return how many
1010    bytes coding->source was changed.  */
1011
1012 static ptrdiff_t
1013 coding_change_source (struct coding_system *coding)
1014 {
1015   const unsigned char *orig = coding->source;
1016   coding_set_source (coding);
1017   return coding->source - orig;
1018 }
1019
1020
1021 /* Set coding->destination from coding->dst_object.  */
1022
1023 static void
1024 coding_set_destination (struct coding_system *coding)
1025 {
1026   if (BUFFERP (coding->dst_object))
1027     {
1028       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1029         {
1030           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1031           coding->dst_bytes = (GAP_END_ADDR
1032                                - (coding->src_bytes - coding->consumed)
1033                                - coding->destination);
1034         }
1035       else
1036         {
1037           /* We are sure that coding->dst_pos_byte is before the gap
1038              of the buffer. */
1039           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1040                                  + coding->dst_pos_byte - BEG_BYTE);
1041           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1042                                - coding->destination);
1043         }
1044     }
1045   else
1046     {
1047       /* Otherwise, the destination is C string and is never relocated
1048          automatically.  Thus we don't have to update anything.  */
1049     }
1050 }
1051
1052
1053 /* Set coding->destination from coding->dst_object, and return how
1054    many bytes coding->destination was changed.  */
1055
1056 static ptrdiff_t
1057 coding_change_destination (struct coding_system *coding)
1058 {
1059   const unsigned char *orig = coding->destination;
1060   coding_set_destination (coding);
1061   return coding->destination - orig;
1062 }
1063
1064
1065 static void
1066 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1067 {
1068   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1069     string_overflow ();
1070   coding->destination = xrealloc (coding->destination,
1071                                   coding->dst_bytes + bytes);
1072   coding->dst_bytes += bytes;
1073 }
1074
1075 static void
1076 coding_alloc_by_making_gap (struct coding_system *coding,
1077                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1078 {
1079   if (EQ (coding->src_object, coding->dst_object))
1080     {
1081       /* The gap may contain the produced data at the head and not-yet
1082          consumed data at the tail.  To preserve those data, we at
1083          first make the gap size to zero, then increase the gap
1084          size.  */
1085       ptrdiff_t add = GAP_SIZE;
1086
1087       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1088       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1089       make_gap (bytes);
1090       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1091       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1092     }
1093   else
1094     {
1095       Lisp_Object this_buffer;
1096
1097       this_buffer = Fcurrent_buffer ();
1098       set_buffer_internal (XBUFFER (coding->dst_object));
1099       make_gap (bytes);
1100       set_buffer_internal (XBUFFER (this_buffer));
1101     }
1102 }
1103
1104
1105 static unsigned char *
1106 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1107                    unsigned char *dst)
1108 {
1109   ptrdiff_t offset = dst - coding->destination;
1110
1111   if (BUFFERP (coding->dst_object))
1112     {
1113       struct buffer *buf = XBUFFER (coding->dst_object);
1114
1115       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1116     }
1117   else
1118     coding_alloc_by_realloc (coding, nbytes);
1119   coding_set_destination (coding);
1120   dst = coding->destination + offset;
1121   return dst;
1122 }
1123
1124 /** Macros for annotations.  */
1125
1126 /* An annotation data is stored in the array coding->charbuf in this
1127    format:
1128      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1129    LENGTH is the number of elements in the annotation.
1130    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1131    NCHARS is the number of characters in the text annotated.
1132
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1137      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1138
1139    NBYTES is the number of bytes specified in the header part of
1140    old-style emacs-mule encoding, or 0 for the other kind of
1141    composition.
1142
1143    METHOD is one of enum composition_method.
1144
1145    Optional COMPOSITION-COMPONENTS are characters and composition
1146    rules.
1147
1148    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1149    follows.
1150
1151    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1152    recover from an invalid annotation, and should be skipped by
1153    produce_annotation.  */
1154
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Store the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advance BUF past it, and set coding->annotated.  BUF must be a
   modifiable lvalue without side effects, and the caller must have a
   variable `coding' in scope.

   The expansion deliberately has no trailing semicolon, so that
   `ADD_ANNOTATION_DATA (...);' behaves as a single statement, also in
   front of an `else'.  */
#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1165
/* Store a composition annotation at BUF and advance BUF past it: the
   annotation header with length 5 and CODING_ANNOTATE_COMPOSITION_MASK
   for NCHARS characters, followed by NBYTES and METHOD.  BUF must be
   a modifiable lvalue without side effects; every argument is
   parenthesized so that expression arguments (for instance `*pp' as
   BUF) bind as intended.  */
#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1172
1173
/* Store a charset annotation at BUF and advance BUF past it: the
   annotation header with length 4 and CODING_ANNOTATE_CHARSET_MASK
   for NCHARS characters, followed by the charset ID.  BUF must be a
   modifiable lvalue without side effects; every argument is
   parenthesized so that expression arguments bind as intended.  */
#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1179
1180 \f
1181 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1182
1183
1184
1185 \f
1186 /*** 3. UTF-8 ***/
1187
1188 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1189    Return true if a text is encoded in UTF-8.  */
1190
/* Classify the value C by its high bits.  */

/* C is below 0x80.  Note that this also holds for a negative C.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three bytes compared against the start of the text to
   recognize a UTF-8 byte order mark (signature).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1201
/* Check whether the source of CODING looks like UTF-8 and record the
   verdict in DETECT_INFO.

   Returns 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
   when a malformed sequence is seen, or when the source ends in the
   middle of a sequence and CODING_MODE_LAST_BLOCK is set.  Otherwise
   returns 1 once the source is exhausted; the found/rejected bits for
   the signature and no-signature categories are then set according to
   whether a BOM and any multi-octet sequence were seen.

   NOTE(review): the input is read with ONE_MORE_BYTE, which is defined
   outside this block; it presumably jumps to `no_more_source' when the
   source is exhausted and may yield a negative value for multibyte
   sources -- confirm against its definition.  */
static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  bool found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that a truncated
         sequence can be recognized at `no_more_source'.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* C starts a multi-octet sequence: every further octet must be
         a trailing octet, otherwise the text is rejected.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a three-octet sequence at the very beginning of the
             source can be the signature.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* C is not a valid leading octet of any supported length.  */
      break;
    }
  /* Reached only via `break', i.e. after a malformed sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ended inside a sequence; in the last block that is an
     error, otherwise more input may still complete it.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (bytes EF BB BF) doesn't necessarily mean a
         BOM, so the no-signature category stays a candidate too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1284
1285
/* Decode the UTF-8 bytes of CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf.

   An optional leading BOM is skipped unless the coding system is set
   to utf_without_bom.  Overlong forms, surrogates, values above
   MAX_CHAR and otherwise malformed sequences are not decoded as
   characters: the first byte of such a sequence is stored on its own
   (as is when ASCII, else through BYTE8_TO_CHAR), CODING->errors is
   incremented, and decoding resumes at the following byte.

   On return CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used describe how far decoding got; a sequence cut
   short by the end of the source is left unconsumed.

   NOTE(review): ONE_MORE_BYTE is defined outside this block; it
   presumably jumps to `no_more_source' when the source is exhausted,
   counts into `consumed_chars', and may yield a negative value for
   multibyte sources -- confirm against its definition.  */
static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte read ahead after a CR when DOS end-of-line conversion is in
     effect; -1 when nothing is pending.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Look at the first three bytes; unless they are exactly the
         BOM, rewind SRC so that they are decoded normally.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    /* Redundant with the unconditional assignment
                       after this block.  */
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* The BOM question is settled; do not look for one again.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Everything before SRC_BASE is fully decoded; a sequence that
         fails or is cut short is retried from here.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The byte read ahead after a CR has not been decoded yet,
             so hand it back to the source.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* Read one byte ahead after a CR; if the source ends here,
             the CR itself stays unconsumed.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Out of the character range, or overlong.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Rewind to the start of the bad sequence and emit only its
         first byte; the remaining bytes are decoded afresh.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report progress up to the last completely decoded sequence.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1435
1436
1437 static bool
1438 encode_coding_utf_8 (struct coding_system *coding)
1439 {
1440   bool multibytep = coding->dst_multibyte;
1441   int *charbuf = coding->charbuf;
1442   int *charbuf_end = charbuf + coding->charbuf_used;
1443   unsigned char *dst = coding->destination + coding->produced;
1444   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1445   ptrdiff_t produced_chars = 0;
1446   int c;
1447
1448   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1449     {
1450       ASSURE_DESTINATION (3);
1451       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1452       CODING_UTF_8_BOM (coding) = utf_without_bom;
1453     }
1454
1455   if (multibytep)
1456     {
1457       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1458
1459       while (charbuf < charbuf_end)
1460         {
1461           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1462
1463           ASSURE_DESTINATION (safe_room);
1464           c = *charbuf++;
1465           if (CHAR_BYTE8_P (c))
1466             {
1467               c = CHAR_TO_BYTE8 (c);
1468               EMIT_ONE_BYTE (c);
1469             }
1470           else
1471             {
1472               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1473               for (p = str; p < pend; p++)
1474                 EMIT_ONE_BYTE (*p);
1475             }
1476         }
1477     }
1478   else
1479     {
1480       int safe_room = MAX_MULTIBYTE_LENGTH;
1481
1482       while (charbuf < charbuf_end)
1483         {
1484           ASSURE_DESTINATION (safe_room);
1485           c = *charbuf++;
1486           if (CHAR_BYTE8_P (c))
1487             *dst++ = CHAR_TO_BYTE8 (c);
1488           else
1489             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1490           produced_chars++;
1491         }
1492     }
1493   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1494   coding->produced_char += produced_chars;
1495   coding->produced = dst - coding->destination;
1496   return 0;
1497 }
1498
1499
1500 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1501    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1502
/* True if the low 16 bits of VAL lie in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the low 16 bits of VAL lie in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1508
1509
     /* Check whether the source text of CODING could be UTF-16 and record
        the verdict in DETECT_INFO.

        CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.  The
        first two bytes are then examined:
          FF FE      the LE and AUTO categories are found; BE and both
                     NOSIG categories are rejected.
          FE FF      the BE and AUTO categories are found; LE and both
                     NOSIG categories are rejected.
          otherwise  AUTO, BE and LE are rejected, and the rest of the
                     source is scanned to decide whether the two NOSIG
                     categories must be rejected too.

        Return 1 when control reaches the `no_more_source' label: by
        falling through after a byte order mark was recognized or,
        presumably, by a jump from TWO_MORE_BYTES when the source is
        exhausted (the macro is defined elsewhere; confirm).  Return 0 on
        the explicit rejection paths and when the scanning loop is left
        by `break' or by its own condition.  */
1510 static bool
1511 detect_coding_utf_16 (struct coding_system *coding,
1512                       struct coding_detection_info *detect_info)
1513 {
1514   const unsigned char *src = coding->source;
1515   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* Not referenced directly below; presumably read by TWO_MORE_BYTES
          (confirm against the macro definition).  */
1516   bool multibytep = coding->src_multibyte;
1517   int c1, c2;
1518
1519   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* In the last block an odd CODING->src_chars rules UTF-16 out.  */
1520   if (coding->mode & CODING_MODE_LAST_BLOCK
1521       && (coding->src_chars & 1))
1522     {
1523       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1524       return 0;
1525     }
1526
1527   TWO_MORE_BYTES (c1, c2);
1528   if ((c1 == 0xFF) && (c2 == 0xFE))
1529     {
           /* Little-endian byte order mark.  */
1530       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1531                              | CATEGORY_MASK_UTF_16_AUTO);
1532       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1533                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1534                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1535     }
1536   else if ((c1 == 0xFE) && (c2 == 0xFF))
1537     {
           /* Big-endian byte order mark.  */
1538       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1539                              | CATEGORY_MASK_UTF_16_AUTO);
1540       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1541                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1542                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1543     }
       /* NOTE(review): C2 can be negative only if TWO_MORE_BYTES made it
          so; presumably that marks a source element that is not a plain
          byte.  Confirm against the macro.  Every UTF-16 category is
          rejected in that case.  */
1544   else if (c2 < 0)
1545     {
1546       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1547       return 0;
1548     }
1549   else
1550     {
1551       /* No BOM.  We check the dispersion of the bytes at even offsets
1552          (E) and at odd offsets (O) by counting the distinct values seen
              at each.  128 or more distinct values at the even offsets
              reject UTF-16-BE without signature; 128 or more at the odd
              offsets reject UTF-16-LE without signature.  If both are
              high, we assume binary data.  */
1553       unsigned char e[256], o[256];
           /* The first pair, already read above, counts as one value each.  */
1554       unsigned e_num = 1, o_num = 1;
1555
1556       memset (e, 0, 256);
1557       memset (o, 0, 256);
1558       e[c1] = 1;
1559       o[c2] = 1;
1560
           /* Without a BOM the categories other than the two NOSIG ones
              are rejected at once.  */
1561       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1562                                 |CATEGORY_MASK_UTF_16_BE
1563                                 | CATEGORY_MASK_UTF_16_LE);
1564
           /* Keep scanning until every category in CATEGORY_MASK_UTF_16
              has been rejected.  */
1565       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1566              != CATEGORY_MASK_UTF_16)
1567         {
1568           TWO_MORE_BYTES (c1, c2);
1569           if (c2 < 0)
1570             break;
1571           if (! e[c1])
1572             {
1573               e[c1] = 1;
1574               e_num++;
1575               if (e_num >= 128)
1576                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1577             }
1578           if (! o[c2])
1579             {
1580               o[c2] = 1;
1581               o_num++;
1582               if (o_num >= 128)
1583                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1584             }
1585         }
1586       return 0;
1587     }
1588
       /* Reached by fall-through from the two BOM branches; no `goto'
          to this label is visible in this function, so any jump comes
          from inside TWO_MORE_BYTES.  */
1589  no_more_source:
1590   return 1;
1591 }
1592
     /* Decode the UTF-16 source of CODING, starting at CODING->source +
        CODING->consumed, and append the resulting character codes to
        CODING->charbuf.

        A byte order mark is checked for only when CODING_UTF_16_BOM
        (CODING) is utf_with_bom; in every case the BOM state is left as
        utf_without_bom afterwards.  A high surrogate that has not yet
        been paired is kept in CODING_UTF_16_SURROGATE (CODING), so a
        surrogate pair may straddle two calls.  Malformed surrogate
        sequences are counted in CODING->errors.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the input consumed and the output
        produced up to the start of the last iteration that did not
        complete.  */
1593 static void
1594 decode_coding_utf_16 (struct coding_system *coding)
1595 {
1596   const unsigned char *src = coding->source + coding->consumed;
1597   const unsigned char *src_end = coding->source + coding->src_bytes;
1598   const unsigned char *src_base;
1599   int *charbuf = coding->charbuf + coding->charbuf_used;
1600   /* We may produce at most 3 chars in one loop.  */
1601   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1602   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1603   bool multibytep = coding->src_multibyte;
1604   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1605   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1606   int surrogate = CODING_UTF_16_SURROGATE (coding);
1607   bool eol_dos
1608     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes read ahead after a CR, or -1 when there are none.  */
1609   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1610
1611   if (bom == utf_with_bom)
1612     {
1613       int c, c1, c2;
1614
1615       src_base = src;
1616       ONE_MORE_BYTE (c1);
1617       ONE_MORE_BYTE (c2);
           /* Always combined first-byte-high; the expected value below
              is chosen according to ENDIAN instead.  */
1618       c = (c1 << 8) | c2;
1619
1620       if (endian == utf_16_big_endian
1621           ? c != 0xFEFF : c != 0xFFFE)
1622         {
1623           /* The first two bytes are not BOM.  Treat them as bytes
1624              for a normal character.  */
1625           src = src_base;
1626           coding->errors++;
1627         }
1628       CODING_UTF_16_BOM (coding) = utf_without_bom;
1629     }
1630   else if (bom == utf_detect_bom)
1631     {
1632       /* We have already tried to detect BOM and failed in
1633          detect_coding.  */
1634       CODING_UTF_16_BOM (coding) = utf_without_bom;
1635     }
1636
1637   while (1)
1638     {
1639       int c, c1, c2;
1640
1641       src_base = src;
1642       consumed_chars_base = consumed_chars;
1643
1644       if (charbuf >= charbuf_end)
1645         {
               /* Output is full.  Give back the two bytes read ahead
                  after a CR so that they are read again next time.  */
1646           if (byte_after_cr1 >= 0)
1647             src_base -= 2;
1648           break;
1649         }
1650
1651       if (byte_after_cr1 >= 0)
1652         c1 = byte_after_cr1, byte_after_cr1 = -1;
1653       else
1654         ONE_MORE_BYTE (c1);
1655       if (c1 < 0)
1656         {
               /* NOTE(review): presumably ONE_MORE_BYTE yields the negated
                  code of a multibyte character found in the source;
                  confirm against the macro.  It is passed through.  */
1657           *charbuf++ = -c1;
1658           continue;
1659         }
1660       if (byte_after_cr2 >= 0)
1661         c2 = byte_after_cr2, byte_after_cr2 = -1;
1662       else
1663         ONE_MORE_BYTE (c2);
1664       if (c2 < 0)
1665         {
1666           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1667           *charbuf++ = -c2;
1668           continue;
1669         }
1670       c = (endian == utf_16_big_endian
1671            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1672
1673       if (surrogate)
1674         {
               /* A high surrogate is pending from an earlier unit.  */
1675           if (! UTF_16_LOW_SURROGATE_P (c))
1676             {
                   /* It is not followed by a low surrogate: emit its two
                      bytes in source order and count an error.  */
1677               if (endian == utf_16_big_endian)
1678                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1679               else
1680                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1681               *charbuf++ = c1;
1682               *charbuf++ = c2;
1683               coding->errors++;
1684               if (UTF_16_HIGH_SURROGATE_P (c))
1685                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1686               else
                     {
                       /* C is an ordinary unit.  The unpaired surrogate
                          has been flushed above, so forget it; otherwise
                          it would be flushed again for every following
                          unit and could be paired with a later, unrelated
                          low surrogate.  */
                       CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1687                   *charbuf++ = c;
                     }
1688             }
1689           else
1690             {
1691               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1692               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1693               *charbuf++ = 0x10000 + c;
1694             }
1695         }
1696       else
1697         {
1698           if (UTF_16_HIGH_SURROGATE_P (c))
1699             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1700           else
1701             {
1702               if (eol_dos && c == '\r')
1703                 {
                       /* Read one unit ahead of the CR; the bytes are
                          used as C1 and C2 by the next iteration.  If the
                          source ends here, ONE_MORE_BYTE presumably jumps
                          to no_more_source before the CR is stored, so
                          the CR itself is read again on the next call
                          (confirm against the macro).  */
1704                   ONE_MORE_BYTE (byte_after_cr1);
1705                   ONE_MORE_BYTE (byte_after_cr2);
1706                 }
1707               *charbuf++ = c;
1708             }
1709         }
1710     }
1711
1712  no_more_source:
1713   coding->consumed_char += consumed_chars_base;
1714   coding->consumed = src_base - coding->source;
1715   coding->charbuf_used = charbuf - coding->charbuf;
1716 }
1717
     /* Encode the characters in CODING->charbuf (CODING->charbuf_used of
        them) as UTF-16 and write the bytes to CODING->destination,
        starting at offset CODING->produced.

        Unless CODING_UTF_16_BOM (CODING) is utf_without_bom, a byte order
        mark is written first and the BOM state is then set to
        utf_without_bom.  A character above MAX_UNICODE_CHAR is replaced
        by CODING->default_char.  A character below 0x10000 is written as
        one 16-bit unit, any other as a surrogate pair; the byte order
        follows CODING_UTF_16_ENDIAN (CODING).  Note that no check is made
        for characters in the surrogate range, so such a character is
        written as a single unit.

        Record CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and return 0.  */
1718 static bool
1719 encode_coding_utf_16 (struct coding_system *coding)
1720 {
       /* Not referenced directly below; presumably read by the EMIT_*
          macros (confirm against their definitions).  */
1721   bool multibytep = coding->dst_multibyte;
1722   int *charbuf = coding->charbuf;
1723   int *charbuf_end = charbuf + coding->charbuf_used;
1724   unsigned char *dst = coding->destination + coding->produced;
1725   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each write; presumably the
          number of destination bytes that must be available.  */
1726   int safe_room = 8;
1727   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1728   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1729   ptrdiff_t produced_chars = 0;
1730   int c;
1731
1732   if (bom != utf_without_bom)
1733     {
1734       ASSURE_DESTINATION (safe_room);
1735       if (big_endian)
1736         EMIT_TWO_BYTES (0xFE, 0xFF);
1737       else
1738         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Do not write the BOM again on a later call.  */
1739       CODING_UTF_16_BOM (coding) = utf_without_bom;
1740     }
1741
1742   while (charbuf < charbuf_end)
1743     {
1744       ASSURE_DESTINATION (safe_room);
1745       c = *charbuf++;
1746       if (c > MAX_UNICODE_CHAR)
1747         c = coding->default_char;
1748
1749       if (c < 0x10000)
1750         {
1751           if (big_endian)
1752             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1753           else
1754             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1755         }
1756       else
1757         {
               /* C1 is the high surrogate, C2 the low surrogate.  */
1758           int c1, c2;
1759
1760           c -= 0x10000;
1761           c1 = (c >> 10) + 0xD800;
1762           c2 = (c & 0x3FF) + 0xDC00;
1763           if (big_endian)
1764             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1765           else
1766             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1767         }
1768     }
1769   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1770   coding->produced = dst - coding->destination;
1771   coding->produced_char += produced_chars;
1772   return 0;
1773 }
1774
1775 \f
1776 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1777
1778 /* Emacs' internal format for representation of multiple character
1779    sets is a kind of multi-byte encoding, i.e. characters are
1780    represented by variable-length sequences of one-byte codes.
1781
1782    ASCII characters and control characters (e.g. `tab', `newline') are
1783    represented by one-byte sequences which are their ASCII codes, in
1784    the range 0x00 through 0x7F.
1785
1786    8-bit characters of the range 0x80..0x9F are represented by
1787    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1788    code + 0x20).
1789
1790    8-bit characters of the range 0xA0..0xFF are represented by
1791    one-byte sequences which are their 8-bit code.
1792
1793    The other characters are represented by a sequence of `base
1794    leading-code', optional `extended leading-code', and one or two
1795    `position-code's.  The length of the sequence is determined by the
1796    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1797    whereas extended leading-code and position-code take the range 0xA0
1798    through 0xFF.  See `charset.h' for more details about leading-code
1799    and position-code.
1800
1801    --- CODE RANGE of Emacs' internal format ---
1802    character set        range
1803    -------------        -----
1804    ascii                0x00..0x7F
1805    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1806    eight-bit-graphic    0xA0..0xFF
1807    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1808    ---------------------------------------------
1809
1810    As this is the internal character representation, the format is
1811    usually not used externally (i.e. in a file or in data sent to a
1812    process).  However, it is possible to have text in this format
1813    externally (e.g. by encoding it with the coding system `emacs-mule').
1814
1815    In that case, a sequence of one-byte codes has a slightly different
1816    form.
1817
1818    At first, all characters in eight-bit-control are represented by
1819    one-byte sequences which are their 8-bit code.
1820
1821    Next, character composition data are represented by the byte
1822    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1823    where,
1824         METHOD is 0xF2 plus the composition method (a value of enum
1825         composition_method),
1826
1827         BYTES is 0xA0 plus a byte length of this composition data,
1828
1829         CHARS is 0xA0 plus a number of characters composed by this
1830         data,
1831
1832         COMPONENTs are characters in multibyte form, or composition
1833         rules each encoded as two bytes of ASCII codes.
1834
1835    In addition, for backward compatibility, the following formats are
1836    also recognized as composition data on decoding.
1837
1838    0x80 MSEQ ...
1839    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1840
1841    Here,
1842         MSEQ is a multibyte form, but in one of these special formats:
1843           ASCII: 0xA0 ASCII_CODE+0x80,
1844           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1845         RULE is a one byte code of the range 0xA0..0xF0 that
1846         represents a composition rule.
1847   */
1848
     /* Table indexed by a byte value.  In this file the entry for a
        leading byte is used as the total length in bytes (1 to 4) of the
        emacs-mule sequence which that byte starts.  NOTE(review): the
        table is filled in elsewhere; confirm the contents there.  */
1849 char emacs_mule_bytes[256];
1850
1851
1852 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1853    Check whether the source text of CODING can be encoded in
        'emacs-mule' and record the verdict in DETECT_INFO.

        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        Return 0, with the category added to DETECT_INFO->rejected, if the
        scan meets a byte sequence that is not acceptable or if the last
        block ends in the middle of a sequence.  Otherwise return 1 once
        the source is exhausted; the category is then added to
        DETECT_INFO->found only if at least one multibyte sequence or
        composition was seen.  */
1854
1855 static bool
1856 detect_coding_emacs_mule (struct coding_system *coding,
1857                           struct coding_detection_info *detect_info)
1858 {
1859   const unsigned char *src = coding->source, *src_base;
1860   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* MULTIBYTEP and CONSUMED_CHARS are not referenced directly below;
          presumably ONE_MORE_BYTE uses them (confirm against the macro).  */
1861   bool multibytep = coding->src_multibyte;
1862   ptrdiff_t consumed_chars = 0;
1863   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once positive evidence is seen.  */
1864   int found = 0;
1865
1866   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1867   /* A coding system of this category is always ASCII compatible.  */
1868   src += coding->head_ascii;
1869
1870   while (1)
1871     {
           /* Remember where this sequence starts so that a source which
              ends in the middle of it can be recognized below.  */
1872       src_base = src;
1873       ONE_MORE_BYTE (c);
1874       if (c < 0)
1875         continue;
1876       if (c == 0x80)
1877         {
1878           /* Perhaps the start of composite character.  We simply skip
1879              it because analyzing it is too heavy for detecting.  But,
1880              at least, we check that the composite character
1881              consists of more than 4 bytes.  */
1882           const unsigned char *src_start;
1883
1884         repeat:
1885           src_start = src;
1886           do
1887             {
1888               ONE_MORE_BYTE (c);
1889             }
1890           while (c >= 0xA0);
1891
               /* SRC - SRC_START counts the bytes read after the 0x80,
                  including the byte that ended the run.  */
1892           if (src - src_start <= 4)
1893             break;
1894           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition follows immediately.  */
1895           if (c == 0x80)
1896             goto repeat;
1897         }
1898
1899       if (c < 0x80)
1900         {
               /* ESC, SI and SO reject this category.  */
1901           if (c < 0x20
1902               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1903             break;
1904         }
1905       else
1906         {
               /* Number of bytes that must follow the leading byte C.  */
1907           int more_bytes = emacs_mule_bytes[c] - 1;
1908
1909           while (more_bytes > 0)
1910             {
1911               ONE_MORE_BYTE (c);
1912               if (c < 0xA0)
1913                 {
1914                   src--;        /* Unread the last byte.  */
1915                   break;
1916                 }
1917               more_bytes--;
1918             }
               /* The sequence was cut short by a byte below 0xA0.  */
1919           if (more_bytes != 0)
1920             break;
1921           found = CATEGORY_MASK_EMACS_MULE;
1922         }
1923     }
       /* Every `break' out of the loop above means rejection.  */
1924   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1925   return 0;
1926
1927  no_more_source:
       /* SRC_BASE < SRC means that some bytes of an unfinished sequence
          were consumed; that is fatal only in the last block.  */
1928   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1929     {
1930       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1931       return 0;
1932     }
1933   detect_info->found |= found;
1934   return 1;
1935 }
1936
1937
1938 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1939    character.  If CMP_STATUS indicates that we must expect MSEQ or
1940    RULE described above, decode it and return the negative value of
1941    the decoded character or rule.  If an invalid byte is found, return
1942    -1.  If SRC is too short, return -2.

        On the returns that are not errors, store the number of source
        bytes consumed in *NBYTES and the value of the local counter
        CONSUMED_CHARS (presumably advanced by ONE_MORE_BYTE) in *NCHARS;
        the two error returns leave both untouched.  If ID is non-null,
        store the charset id of the decoded character in *ID; this is
        not done when a RULE is returned.

        NOTE(review): an MSEQ whose decoded character is 1 or 2 would be
        returned as -1 or -2, the same values as the error codes; confirm
        whether such input can reach this function.  */
1943
1944 static int
1945 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1946                  int *nbytes, int *nchars, int *id,
1947                  struct composition_status *cmp_status)
1948 {
1949   const unsigned char *src_end = coding->source + coding->src_bytes;
1950   const unsigned char *src_base = src;
1951   bool multibytep = coding->src_multibyte;
1952   int charset_ID;
1953   unsigned code;
1954   int c;
1955   int consumed_chars = 0;
       /* Set when the sequence was read in the old-form MSEQ encoding.  */
1956   bool mseq_found = 0;
1957
1958   ONE_MORE_BYTE (c);
1959   if (c < 0)
1960     {
           /* A negative value from ONE_MORE_BYTE is returned negated,
              with the charset taken from entry 0 of emacs_mule_charset.  */
1961       c = -c;
1962       charset_ID = emacs_mule_charset[0];
1963     }
1964   else
1965     {
1966       if (c >= 0xA0)
1967         {
               /* A first byte of 0xA0 or above is acceptable only while
                  an old-form composition is being read.  */
1968           if (cmp_status->state != COMPOSING_NO
1969               && cmp_status->old_form)
1970             {
1971               if (cmp_status->state == COMPOSING_CHAR)
1972                 {
                       /* MSEQ: 0xA0 followed by ASCII_CODE+0x80, or a
                          leading code that was shifted up by 0x20.  */
1973                   if (c == 0xA0)
1974                     {
1975                       ONE_MORE_BYTE (c);
1976                       c -= 0x80;
1977                       if (c < 0)
1978                         goto invalid_code;
1979                     }
1980                   else
1981                     c -= 0x20;
1982                   mseq_found = 1;
1983                 }
1984               else
1985                 {
                       /* A RULE byte: hand it back negated, undecoded.  */
1986                   *nbytes = src - src_base;
1987                   *nchars = consumed_chars;
1988                   return -c;
1989                 }
1990             }
1991           else
1992             goto invalid_code;
1993         }
1994
           /* Dispatch on the total length of the sequence started by C.  */
1995       switch (emacs_mule_bytes[c])
1996         {
1997         case 2:
               /* Leading code and one position code.  */
1998           if ((charset_ID = emacs_mule_charset[c]) < 0)
1999             goto invalid_code;
2000           ONE_MORE_BYTE (c);
2001           if (c < 0xA0)
2002             goto invalid_code;
2003           code = c & 0x7F;
2004           break;
2005
2006         case 3:
2007           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2008               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2009             {
                   /* The next byte selects the charset; one position
                      code follows it.  */
2010               ONE_MORE_BYTE (c);
2011               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2012                 goto invalid_code;
2013               ONE_MORE_BYTE (c);
2014               if (c < 0xA0)
2015                 goto invalid_code;
2016               code = c & 0x7F;
2017             }
2018           else
2019             {
                   /* Leading code and two position codes.  */
2020               if ((charset_ID = emacs_mule_charset[c]) < 0)
2021                 goto invalid_code;
2022               ONE_MORE_BYTE (c);
2023               if (c < 0xA0)
2024                 goto invalid_code;
2025               code = (c & 0x7F) << 8;
2026               ONE_MORE_BYTE (c);
2027               if (c < 0xA0)
2028                 goto invalid_code;
2029               code |= c & 0x7F;
2030             }
2031           break;
2032
2033         case 4:
               /* The next byte selects the charset; two position codes
                  follow it.  Unlike the 3-byte private case above, that
                  byte is only required to be non-negative here.  */
2034           ONE_MORE_BYTE (c);
2035           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2036             goto invalid_code;
2037           ONE_MORE_BYTE (c);
2038           if (c < 0xA0)
2039             goto invalid_code;
2040           code = (c & 0x7F) << 8;
2041           ONE_MORE_BYTE (c);
2042           if (c < 0xA0)
2043             goto invalid_code;
2044           code |= c & 0x7F;
2045           break;
2046
2047         case 1:
               /* A single byte stands for itself.  */
2048           code = c;
2049           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2050           break;
2051
2052         default:
2053           emacs_abort ();
2054         }
           /* Presumably maps CODE in the charset to a character code in
              C, negative on failure (macro defined elsewhere; confirm).  */
2055       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2056                           CHARSET_FROM_ID (charset_ID), code, c);
2057       if (c < 0)
2058         goto invalid_code;
2059     }
2060   *nbytes = src - src_base;
2061   *nchars = consumed_chars;
2062   if (id)
2063     *id = charset_ID;
2064   return (mseq_found ? -c : c);
2065
2066  no_more_source:
2067   return -2;
2068
2069  invalid_code:
2070   return -1;
2071 }
2072
2073
2074 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2075
2076 /* Handle these composition sequences ('|' marks the end of the header
2077    elements; BYTES and CHARS >= 0xA0):
2078
2079    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2080    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2081    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2082
2083    and these old forms:
2084
2085    (4) relative composition: 0x80 | MSEQ ... MSEQ
2086    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2087
2088    When the starter 0x80 and the following header elements are found,
2089    this annotation header is produced.
2090
2091         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2092
2093    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2094    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2095
2096    Then, upon reading the following elements, these codes are produced
2097    until the composition end is found:
2098
2099    (1) CHAR ... CHAR
2100    (2) ALT ... ALT CHAR ... CHAR
2101    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2102    (4) CHAR ... CHAR
2103    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2104
2105    When the composition end is found, LENGTH and NCHARS in the
2106    annotation header are updated as below:
2107
2108    (1) LENGTH: unchanged, NCHARS: unchanged
2109    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2110    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2111    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2112    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2113
2114    If an error is found while composing, the annotation header is
2115    changed to the original composition header (plus filler -1s) as
2116    below:
2117
2118    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2119    (5)          [ 0x80 0xFF -1 -1 -1 ]
2120
2121    and the sequence [ -2 DECODED-RULE ] is changed to the original
2122    byte sequence as below:
2123         o the original byte sequence is B: [ B -1 ]
2124         o the original byte sequence is B1 B2: [ B1 B2 ]
2125
2126    Most of the routines are implemented by macros because many
2127    variables and labels in the caller decode_coding_emacs_mule must be
2128    accessible, and they are usually called just once (thus they do not
2129    increase the size of the compiled object).  */
2130
2131 /* Decode a composition rule represented by C as a component of
2132    composition sequence of Emacs 20 style.  Set RULE to the decoded
2133    rule.

        C is modified in place: 0xA0 is subtracted from it.  If the result
        is outside 0..80, control jumps to the `invalid_code' label of the
        enclosing function.  The result is split into GREF = C / 9 and
        NREF = C % 9, a value of 4 in either is replaced by 10, and the
        pair is combined with COMPOSITION_ENCODE_RULE.  */
2134
2135 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2136   do {                                                  \
2137     int gref, nref;                                     \
2138                                                         \
2139     c -= 0xA0;                                          \
2140     if (c < 0 || c >= 81)                               \
2141       goto invalid_code;                                \
2142     gref = c / 9, nref = c % 9;                         \
2143     if (gref == 4) gref = 10;                           \
2144     if (nref == 4) nref = 10;                           \
2145     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2146   } while (0)
2147
2148
2149 /* Decode a composition rule represented by C and the following byte
2150    at SRC as a component of composition sequence of Emacs 21 style.
2151    Set RULE to the decoded rule.

        GREF is C - 0x20 and NREF is the following byte - 0x20; that byte
        is read into C with ONE_MORE_BYTE, so C is overwritten.  If either
        value is outside 0..80, control jumps to the `invalid_code' label
        of the enclosing function.  */
2152
2153 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2154   do {                                                  \
2155     int gref, nref;                                     \
2156                                                         \
2157     gref = c - 0x20;                                    \
2158     if (gref < 0 || gref >= 81)                         \
2159       goto invalid_code;                                \
2160     ONE_MORE_BYTE (c);                                  \
2161     nref = c - 0x20;                                    \
2162     if (nref < 0 || nref >= 81)                         \
2163       goto invalid_code;                                \
2164     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2165   } while (0)
2166
2167
2168 /* Start of Emacs 21 style format.  On entry C holds the first header
2169    byte (METHOD + 0xF2); the next two bytes at SRC are (BYTES + 0xA0)
2170    and (CHARS + 0xA0), where BYTES is the byte length of this
2171    composition information, CHARS is the number of characters composed
        by this composition.

        Control jumps to `invalid_code' if the BYTES byte is negative, if
        BYTES is less than 3, if the method is COMPOSITION_RELATIVE and
        BYTES is not 4, or if CHARS is not positive or not less than
        MAX_COMPOSITION_COMPONENTS.  Otherwise CMP_STATUS is set up for a
        new-form composition (state COMPOSING_CHAR for the relative
        method, COMPOSING_COMPONENT_CHAR for the others; NCOMPS is
        BYTES - 4) and the annotation header is added to CHARBUF with
        ADD_COMPOSITION_DATA.  Uses the caller's C, CMP_STATUS and
        CHARBUF.  */
2172
2173 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2174   do {                                                                  \
2175     enum composition_method method = c - 0xF2;                          \
2176     int nbytes, nchars;                                                 \
2177                                                                         \
2178     ONE_MORE_BYTE (c);                                                  \
2179     if (c < 0)                                                          \
2180       goto invalid_code;                                                \
2181     nbytes = c - 0xA0;                                                  \
2182     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2183       goto invalid_code;                                                \
2184     ONE_MORE_BYTE (c);                                                  \
2185     nchars = c - 0xA0;                                                  \
2186     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2187       goto invalid_code;                                                \
2188     cmp_status->old_form = 0;                                           \
2189     cmp_status->method = method;                                        \
2190     if (method == COMPOSITION_RELATIVE)                                 \
2191       cmp_status->state = COMPOSING_CHAR;                               \
2192     else                                                                \
2193       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2194     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2195     cmp_status->nchars = nchars;                                        \
2196     cmp_status->ncomps = nbytes - 4;                                    \
2197     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2198   } while (0)
2199
2200
2201 /* Start of Emacs 20 style format for relative composition.
        Set CMP_STATUS up for an old-form composition of method
        COMPOSITION_RELATIVE (state COMPOSING_CHAR, length
        MAX_ANNOTATION_LENGTH, NCHARS and NCOMPS both 0) and add the
        annotation header to CHARBUF with ADD_COMPOSITION_DATA, passing
        0 for both counts.  Uses the caller's CMP_STATUS and CHARBUF.  */
2202
2203 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2204   do {                                                          \
2205     cmp_status->old_form = 1;                                   \
2206     cmp_status->method = COMPOSITION_RELATIVE;                  \
2207     cmp_status->state = COMPOSING_CHAR;                         \
2208     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2209     cmp_status->nchars = cmp_status->ncomps = 0;                \
2210     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2211   } while (0)
2212
2213
2214 /* Start of Emacs 20 style format for rule-base composition.
        Identical to DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that
        the method recorded in CMP_STATUS, and passed on to
        ADD_COMPOSITION_DATA, is COMPOSITION_WITH_RULE.  Uses the caller's
        CMP_STATUS and CHARBUF.  */
2215
2216 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2217   do {                                                          \
2218     cmp_status->old_form = 1;                                   \
2219     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2220     cmp_status->state = COMPOSING_CHAR;                         \
2221     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2222     cmp_status->nchars = cmp_status->ncomps = 0;                \
2223     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2224   } while (0)
2225
2226
     /* Handle the byte that follows a composition starter 0x80.  One byte
        is read into C and the tests below are applied in this order:

          C is negative                  jump to `invalid_code'
          C - 0xF2 is a composition method from COMPOSITION_RELATIVE
          through COMPOSITION_WITH_RULE_ALTCHARS
                                         Emacs 21 style header
          C < 0xA0                       jump to `invalid_code'
          C < 0xC0                       Emacs 20 style relative
                                         composition; SRC is then put back
                                         so that the byte is read again as
                                         a component
          C == 0xFF                      Emacs 20 style rule-base
                                         composition
          anything else                  jump to `invalid_code'

        Uses the caller's C and SRC and, through the macros it invokes,
        CMP_STATUS and CHARBUF.  */
2227 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2228   do {                                                  \
2229     const unsigned char *current_src = src;             \
2230                                                         \
2231     ONE_MORE_BYTE (c);                                  \
2232     if (c < 0)                                          \
2233       goto invalid_code;                                \
2234     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2235         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2236       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2237     else if (c < 0xA0)                                  \
2238       goto invalid_code;                                \
2239     else if (c < 0xC0)                                  \
2240       {                                                 \
2241         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2242         /* Re-read C as a composition component.  */    \
2243         src = current_src;                              \
2244       }                                                 \
2245     else if (c == 0xFF)                                 \
2246       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2247     else                                                \
2248       goto invalid_code;                                \
2249   } while (0)
2250
     /* Finish a composition that ended normally.  The annotation header
        begins CMP_STATUS->length elements before CHARBUF.  For the old
        form, header element 2 (NCHARS) is set to CMP_STATUS->nchars.  For
        the new form with a method greater than COMPOSITION_RELATIVE,
        header element 0 is set to header element 2 minus
        CMP_STATUS->length.  In every case the state becomes
        COMPOSING_NO.  Uses the caller's CMP_STATUS and CHARBUF.  */
2251 #define EMACS_MULE_COMPOSITION_END()                            \
2252   do {                                                          \
2253     int idx = - cmp_status->length;                             \
2254                                                                 \
2255     if (cmp_status->old_form)                                   \
2256       charbuf[idx + 2] = cmp_status->nchars;                    \
2257     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2258       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2259     cmp_status->state = COMPOSING_NO;                           \
2260   } while (0)
2261
2262
     /* Close the composition recorded in CMP_STATUS when it cannot be
        continued.  CHARBUF points just past the elements produced so far;
        the annotation header begins CMP_STATUS->length elements before
        it.

        If the composition is old-form and already has characters, it is
        kept: its character count is stored in the header.  Otherwise the
        header is overwritten with the original header bytes as raw-byte
        characters.  In both cases the state becomes COMPOSING_NO.

        Return the value that the caller adds to its character offset.  */
2263 static int
2264 emacs_mule_finish_composition (int *charbuf,
2265                                struct composition_status *cmp_status)
2266 {
2267   int idx = - cmp_status->length;
2268   int new_chars;
2269
2270   if (cmp_status->old_form && cmp_status->nchars > 0)
2271     {
2272       charbuf[idx + 2] = cmp_status->nchars;
2273       new_chars = 0;
2274       if (cmp_status->method == COMPOSITION_WITH_RULE
2275           && cmp_status->state == COMPOSING_CHAR)
2276         {
2277           /* The last rule was invalid.  */
               /* Replace the last two elements by the rule byte as a
                  raw-byte character followed by a filler -1.
                  NOTE(review): this assumes that CHARBUF[-1] holds the
                  rule byte minus 0xA0; confirm against the code that
                  stores it.  */
2278           int rule = charbuf[-1] + 0xA0;
2279
2280           charbuf[-2] = BYTE8_TO_CHAR (rule);
2281           charbuf[-1] = -1;
2282           new_chars = 1;
2283         }
2284     }
2285   else
2286     {
2287       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2288
2289       if (cmp_status->method == COMPOSITION_WITH_RULE)
2290         {
               /* NOTE(review): two raw-byte characters (0x80 and 0xFF)
                  are stored but NEW_CHARS is 1, and the values stored
                  next are -3 and 0 rather than filler -1s; confirm that
                  both are intended.  */
2291           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2292           charbuf[idx++] = -3;
2293           charbuf[idx++] = 0;
2294           new_chars = 1;
2295         }
2296       else
2297         {
               /* IDX now indexes header element 1, so these read header
                  elements 2 (NCHARS) and 3 (NBYTES) before they are
                  overwritten below.  */
2298           int nchars = charbuf[idx + 1] + 0xA0;
2299           int nbytes = charbuf[idx + 2] + 0xA0;
2300
2301           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2302           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2303           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2304           charbuf[idx++] = -1;
2305           new_chars = 4;
2306         }
2307     }
2308   cmp_status->state = COMPOSING_NO;
2309   return new_chars;
2310 }
2311
     /* If a composition is in progress, close it with
        emacs_mule_finish_composition and add the value it returns to
        CHAR_OFFSET.  Uses the caller's CMP_STATUS, CHARBUF and
        CHAR_OFFSET.  */
2312 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2313   do {                                                                    \
2314     if (cmp_status->state != COMPOSING_NO)                                \
2315       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2316   } while (0)
2317
2318
2319 static void
2320 decode_coding_emacs_mule (struct coding_system *coding)
2321 {
2322   const unsigned char *src = coding->source + coding->consumed;
2323   const unsigned char *src_end = coding->source + coding->src_bytes;
2324   const unsigned char *src_base;
2325   int *charbuf = coding->charbuf + coding->charbuf_used;
2326   /* We may produce two annotations (charset and composition) in one
2327      loop and one more charset annotation at the end.  */
2328   int *charbuf_end
2329     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2330       /* We can produce up to 2 characters in a loop.  */
2331       - 1;
2332   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2333   bool multibytep = coding->src_multibyte;
2334   ptrdiff_t char_offset = coding->produced_char;
2335   ptrdiff_t last_offset = char_offset;
2336   int last_id = charset_ascii;
2337   bool eol_dos
2338     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2339   int byte_after_cr = -1;
2340   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2341
2342   if (cmp_status->state != COMPOSING_NO)
2343     {
2344       int i;
2345
2346       if (charbuf_end - charbuf < cmp_status->length)
2347         emacs_abort ();
2348       for (i = 0; i < cmp_status->length; i++)
2349         *charbuf++ = cmp_status->carryover[i];
2350       coding->annotated = 1;
2351     }
2352
2353   while (1)
2354     {
2355       int c, id IF_LINT (= 0);
2356
2357       src_base = src;
2358       consumed_chars_base = consumed_chars;
2359
2360       if (charbuf >= charbuf_end)
2361         {
2362           if (byte_after_cr >= 0)
2363             src_base--;
2364           break;
2365         }
2366
2367       if (byte_after_cr >= 0)
2368         c = byte_after_cr, byte_after_cr = -1;
2369       else
2370         ONE_MORE_BYTE (c);
2371
2372       if (c < 0 || c == 0x80)
2373         {
2374           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2375           if (c < 0)
2376             {
2377               *charbuf++ = -c;
2378               char_offset++;
2379             }
2380           else
2381             DECODE_EMACS_MULE_COMPOSITION_START ();
2382           continue;
2383         }
2384
2385       if (c < 0x80)
2386         {
2387           if (eol_dos && c == '\r')
2388             ONE_MORE_BYTE (byte_after_cr);
2389           id = charset_ascii;
2390           if (cmp_status->state != COMPOSING_NO)
2391             {
2392               if (cmp_status->old_form)
2393                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2394               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2395                 cmp_status->ncomps--;
2396             }
2397         }
2398       else
2399         {
2400           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2401           /* emacs_mule_char can load a charset map from a file, which
2402              allocates a large structure and might cause buffer text
2403              to be relocated as result.  Thus, we need to remember the
2404              original pointer to buffer text, and fix up all related
2405              pointers after the call.  */
2406           const unsigned char *orig = coding->source;
2407           ptrdiff_t offset;
2408
2409           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2410                                cmp_status);
2411           offset = coding->source - orig;
2412           if (offset)
2413             {
2414               src += offset;
2415               src_base += offset;
2416               src_end += offset;
2417             }
2418           if (c < 0)
2419             {
2420               if (c == -1)
2421                 goto invalid_code;
2422               if (c == -2)
2423                 break;
2424             }
2425           src = src_base + nbytes;
2426           consumed_chars = consumed_chars_base + nchars;
2427           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2428             cmp_status->ncomps -= nchars;
2429         }
2430
2431       /* Now if C >= 0, we found a normally encoded character, if C <
2432          0, we found an old-style composition component character or
2433          rule.  */
2434
2435       if (cmp_status->state == COMPOSING_NO)
2436         {
2437           if (last_id != id)
2438             {
2439               if (last_id != charset_ascii)
2440                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2441                                   last_id);
2442               last_id = id;
2443               last_offset = char_offset;
2444             }
2445           *charbuf++ = c;
2446           char_offset++;
2447         }
2448       else if (cmp_status->state == COMPOSING_CHAR)
2449         {
2450           if (cmp_status->old_form)
2451             {
2452               if (c >= 0)
2453                 {
2454                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2455                   *charbuf++ = c;
2456                   char_offset++;
2457                 }
2458               else
2459                 {
2460                   *charbuf++ = -c;
2461                   cmp_status->nchars++;
2462                   cmp_status->length++;
2463                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2464                     EMACS_MULE_COMPOSITION_END ();
2465                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2466                     cmp_status->state = COMPOSING_RULE;
2467                 }
2468             }
2469           else
2470             {
2471               *charbuf++ = c;
2472               cmp_status->length++;
2473               cmp_status->nchars--;
2474               if (cmp_status->nchars == 0)
2475                 EMACS_MULE_COMPOSITION_END ();
2476             }
2477         }
2478       else if (cmp_status->state == COMPOSING_RULE)
2479         {
2480           int rule;
2481
2482           if (c >= 0)
2483             {
2484               EMACS_MULE_COMPOSITION_END ();
2485               *charbuf++ = c;
2486               char_offset++;
2487             }
2488           else
2489             {
2490               c = -c;
2491               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2492               if (rule < 0)
2493                 goto invalid_code;
2494               *charbuf++ = -2;
2495               *charbuf++ = rule;
2496               cmp_status->length += 2;
2497               cmp_status->state = COMPOSING_CHAR;
2498             }
2499         }
2500       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2501         {
2502           *charbuf++ = c;
2503           cmp_status->length++;
2504           if (cmp_status->ncomps == 0)
2505             cmp_status->state = COMPOSING_CHAR;
2506           else if (cmp_status->ncomps > 0)
2507             {
2508               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2509                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2510             }
2511           else
2512             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2513         }
2514       else                      /* COMPOSING_COMPONENT_RULE */
2515         {
2516           int rule;
2517
2518           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2519           if (rule < 0)
2520             goto invalid_code;
2521           *charbuf++ = -2;
2522           *charbuf++ = rule;
2523           cmp_status->length += 2;
2524           cmp_status->ncomps--;
2525           if (cmp_status->ncomps > 0)
2526             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2527           else
2528             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2529         }
2530       continue;
2531
2532     invalid_code:
2533       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2534       src = src_base;
2535       consumed_chars = consumed_chars_base;
2536       ONE_MORE_BYTE (c);
2537       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2538       char_offset++;
2539       coding->errors++;
2540     }
2541
2542  no_more_source:
2543   if (cmp_status->state != COMPOSING_NO)
2544     {
2545       if (coding->mode & CODING_MODE_LAST_BLOCK)
2546         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2547       else
2548         {
2549           int i;
2550
2551           charbuf -= cmp_status->length;
2552           for (i = 0; i < cmp_status->length; i++)
2553             cmp_status->carryover[i] = charbuf[i];
2554         }
2555     }
2556   if (last_id != charset_ascii)
2557     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2558   coding->consumed_char += consumed_chars_base;
2559   coding->consumed = src_base - coding->source;
2560   coding->charbuf_used = charbuf - coding->charbuf;
2561 }
2562
2563
/* Store in CODES (an array of at least two bytes) the leading code(s)
   that introduce a character of the emacs-mule charset ID.

   An ID below 0xA0 is its own single leading code; CODES[1] is then
   set to 0 to tell the caller that no second code follows.  A larger
   ID gets a two-byte prefix: a first code chosen by range (0x9A for
   IDs below 0xE0, 0x9B below 0xF0, 0x9C below 0xF5, 0x9D otherwise)
   followed by ID itself.

   ID and CODES are evaluated more than once, so they must be free of
   side effects.  The expansion is a single statement WITHOUT a
   trailing semicolon: the caller writes the semicolon, which keeps
   the macro usable as the body of an unbraced `if' that has an
   `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)	\
  do {						\
    if ((id) < 0xA0)				\
      (codes)[0] = (id), (codes)[1] = 0;	\
    else if ((id) < 0xE0)			\
      (codes)[0] = 0x9A, (codes)[1] = (id);	\
    else if ((id) < 0xF0)			\
      (codes)[0] = 0x9B, (codes)[1] = (id);	\
    else if ((id) < 0xF5)			\
      (codes)[0] = 0x9C, (codes)[1] = (id);	\
    else					\
      (codes)[0] = 0x9D, (codes)[1] = (id);	\
  } while (0)
2577
2578
     /* Encode the characters held in CODING->charbuf (CODING->charbuf_used
        cells) into the emacs-mule byte format, writing the bytes starting
        at CODING->destination + CODING->produced.

        A negative cell starts an annotation, which emits nothing: a
        charset annotation only selects the charset to be tried first for
        the characters that follow, and a composition annotation is
        skipped.  ASCII characters and raw 8-bit bytes are emitted
        directly.  Any other character is emitted as the leading code(s)
        of its charset followed by one or two code bytes with the high bit
        set; a character for which no charset is found is replaced by
        CODING->default_char.

        Before returning, the conversion result is recorded as
        CODING_RESULT_SUCCESS and CODING->produced_char and
        CODING->produced are updated.  The return value is always 0.  */
2579 static bool
2580 encode_coding_emacs_mule (struct coding_system *coding)
2581 {
       /* Not referenced by name in this function; presumably read by the
          EMIT_* macros used below -- TODO confirm.  */
2582   bool multibytep = coding->dst_multibyte;
2583   int *charbuf = coding->charbuf;
2584   int *charbuf_end = charbuf + coding->charbuf_used;
2585   unsigned char *dst = coding->destination + coding->produced;
2586   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before each cell is handled;
          presumably an upper bound on the bytes one cell can produce --
          TODO confirm.  */
2587   int safe_room = 8;
2588   ptrdiff_t produced_chars = 0;
2589   Lisp_Object attrs, charset_list;
2590   int c;
       /* Charset ID taken from the most recent charset annotation, or -1
          when there is no usable preference.  */
2591   int preferred_charset_id = -1;
2592
2593   CODING_GET_INFO (coding, attrs, charset_list);
       /* This encoder always works from Vemacs_mule_charset_list; make the
          coding system's attributes agree with that.  */
2594   if (! EQ (charset_list, Vemacs_mule_charset_list))
2595     {
2596       charset_list = Vemacs_mule_charset_list;
2597       ASET (attrs, coding_attr_charset_list, charset_list);
2598     }
2599
2600   while (charbuf < charbuf_end)
2601     {
2602       ASSURE_DESTINATION (safe_room);
2603       c = *charbuf++;
2604
2605       if (c < 0)
2606         {
2607           /* Handle an annotation.  The cell now at *CHARBUF holds the
                  annotation kind.  */
2608           switch (*charbuf)
2609             {
2610             case CODING_ANNOTATE_COMPOSITION_MASK:
2611               /* Not yet implemented.  */
2612               break;
2613             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF already points at the kind cell,
                      so this reads the third cell after it.  Verify the
                      index against the layout written by ADD_CHARSET_DATA
                      (defined outside this part of the file).  */
2614               preferred_charset_id = charbuf[3];
                   /* Ignore a preference for a charset that is not in the
                      list this encoder works from.  */
2615               if (preferred_charset_id >= 0
2616                   && NILP (Fmemq (make_number (preferred_charset_id),
2617                                   charset_list)))
2618                 preferred_charset_id = -1;
2619               break;
2620             default:
2621               emacs_abort ();
2622             }
               /* Skip the rest of the annotation: -C cells in total, one of
                  which (the length cell itself) has been consumed above.  */
2623           charbuf += -c - 1;
2624           continue;
2625         }
2626
2627       if (ASCII_CHAR_P (c))
2628         EMIT_ONE_ASCII_BYTE (c);
2629       else if (CHAR_BYTE8_P (c))
2630         {
2631           c = CHAR_TO_BYTE8 (c);
2632           EMIT_ONE_BYTE (c);
2633         }
2634       else
2635         {
2636           struct charset *charset;
2637           unsigned code;
2638           int dimension;
2639           int emacs_mule_id;
2640           unsigned char leading_codes[2];
2641
               /* Try the annotated charset first; if it cannot encode C,
                  search the whole charset list instead.  */
2642           if (preferred_charset_id >= 0)
2643             {
2644               bool result;
2645
2646               charset = CHARSET_FROM_ID (preferred_charset_id);
2647               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2648               if (result)
2649                 code = ENCODE_CHAR (charset, c);
2650               else
2651                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2652                                      &code, charset);
2653             }
2654           else
2655             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2656                                  &code, charset);
               /* No charset found for C: substitute the default
                  character.  */
2657           if (! charset)
2658             {
2659               c = coding->default_char;
2660               if (ASCII_CHAR_P (c))
2661                 {
2662                   EMIT_ONE_ASCII_BYTE (c);
2663                   continue;
2664                 }
                   /* NOTE(review): CHARSET is not re-checked after this
                      second lookup; presumably default_char is always
                      encodable -- confirm.  */
2665               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2666                                    &code, charset);
2667             }
2668           dimension = CHARSET_DIMENSION (charset);
2669           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2670           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2671           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second code means the charset has a single leading
                  code.  */
2672           if (leading_codes[1])
2673             EMIT_ONE_BYTE (leading_codes[1]);
               /* Emit the code point with the high bit of each byte set.  */
2674           if (dimension == 1)
2675             EMIT_ONE_BYTE (code | 0x80);
2676           else
2677             {
2678               code |= 0x8080;
2679               EMIT_ONE_BYTE (code >> 8);
2680               EMIT_ONE_BYTE (code & 0xFF);
2681             }
2682         }
2683     }
2684   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2685   coding->produced_char += produced_chars;
2686   coding->produced = dst - coding->destination;
2687   return 0;
2688 }
2689
2690 \f
2691 /*** 7. ISO2022 handlers ***/
2692
2693 /* The following note describes the coding system ISO2022 briefly.
2694    Since the intention of this note is to help understand the
2695    functions in this file, some parts are NOT ACCURATE or are OVERLY
2696    SIMPLIFIED.  For thorough understanding, please refer to the
2697    original document of ISO2022.  This is equivalent to the standard
2698    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2699
2700    ISO2022 provides many mechanisms to encode several character sets
2701    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2702    is encoded using bytes less than 128.  This may make the encoded
2703    text a little bit longer, but the text passes more easily through
2704    several types of gateway, some of which strip off the MSB (Most
2705    Significant Bit).
2706
2707    There are two kinds of character sets: control character sets and
2708    graphic character sets.  The former contain control characters such
2709    as `newline' and `escape' to provide control functions (control
2710    functions are also provided by escape sequences).  The latter
2711    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2712    two control character sets and many graphic character sets.
2713
2714    Graphic character sets are classified into one of the following
2715    four classes, according to the number of bytes (DIMENSION) and
2716    number of characters in one dimension (CHARS) of the set:
2717    - DIMENSION1_CHARS94
2718    - DIMENSION1_CHARS96
2719    - DIMENSION2_CHARS94
2720    - DIMENSION2_CHARS96
2721
2722    In addition, each character set is assigned an identification tag,
2723    unique for each set, called the "final character" (denoted as <F>
2724    hereafter).  The <F> of each character set is decided by ECMA(*)
2725    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2726    (0x30..0x3F are for private use only).
2727
2728    Note (*): ECMA = European Computer Manufacturers Association
2729
2730    Here are examples of graphic character sets [NAME(<F>)]:
2731         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2732         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2733         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2734         o DIMENSION2_CHARS96 -- none for the moment
2735
2736    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2737         C0 [0x00..0x1F] -- control character plane 0
2738         GL [0x20..0x7F] -- graphic character plane 0
2739         C1 [0x80..0x9F] -- control character plane 1
2740         GR [0xA0..0xFF] -- graphic character plane 1
2741
2742    A control character set is directly designated and invoked to C0 or
2743    C1 by an escape sequence.  The most common case is that:
2744    - ISO646's  control character set is designated/invoked to C0, and
2745    - ISO6429's control character set is designated/invoked to C1,
2746    and usually these designations/invocations are omitted in encoded
2747    text.  In a 7-bit environment, only C0 can be used, and a control
2748    character for C1 is encoded by an appropriate escape sequence to
2749    fit into the environment.  All control characters for C1 are
2750    defined to have corresponding escape sequences.
2751
2752    A graphic character set is at first designated to one of four
2753    graphic registers (G0 through G3), then these graphic registers are
2754    invoked to GL or GR.  These designations and invocations can be
2755    done independently.  The most common case is that G0 is invoked to
2756    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2757    these invocations and designations are omitted in encoded text.
2758    In a 7-bit environment, only GL can be used.
2759
2760    When a graphic character set of CHARS94 is invoked to GL, codes
2761    0x20 and 0x7F of the GL area work as control characters SPACE and
2762    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2763    be used.
2764
2765    There are two ways of invocation: locking-shift and single-shift.
2766    With locking-shift, the invocation lasts until the next different
2767    invocation, whereas with single-shift, the invocation affects the
2768    following character only and doesn't affect the locking-shift
2769    state.  Invocations are done by the following control characters or
2770    escape sequences:
2771
2772    ----------------------------------------------------------------------
2773    abbrev  function                  cntrl escape seq   description
2774    ----------------------------------------------------------------------
2775    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2776    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2777    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2778    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2779    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2780    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2781    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2782    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2783    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2784    ----------------------------------------------------------------------
2785    (*) These are not used by any known coding system.
2786
2787    Control characters for these functions are defined by macros
2788    ISO_CODE_XXX in `coding.h'.
2789
2790    Designations are done by the following escape sequences:
2791    ----------------------------------------------------------------------
2792    escape sequence      description
2793    ----------------------------------------------------------------------
2794    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2795    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2796    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2797    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2798    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2799    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2800    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2801    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2802    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2803    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2804    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2805    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2806    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2807    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2808    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2809    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2810    ----------------------------------------------------------------------
2811
2812    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2813    of dimension 1, chars 94, and final character <F>, etc...
2814
2815    Note (*): Although these designations are not allowed in ISO2022,
2816    Emacs accepts them on decoding, and produces them on encoding
2817    CHARS96 character sets in a coding system which is characterized as
2818    7-bit environment, non-locking-shift, and non-single-shift.
2819
2820    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2821    '(' must be omitted.  We refer to this as "short-form" hereafter.
2822
2823    Now you may notice that there are a lot of ways of encoding the
2824    same multilingual text in ISO2022.  Actually, there exist many
2825    coding systems such as Compound Text (used in X11's inter client
2826    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2827    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2828    localized platforms), and all of these are variants of ISO2022.
2829
2830    In addition to the above, Emacs handles two more kinds of escape
2831    sequences: ISO6429's direction specification and Emacs' private
2832    sequence for specifying character composition.
2833
2834    ISO6429's direction specification takes the following form:
2835         o CSI ']'      -- end of the current direction
2836         o CSI '0' ']'  -- end of the current direction
2837         o CSI '1' ']'  -- start of left-to-right text
2838         o CSI '2' ']'  -- start of right-to-left text
2839    The control character CSI (0x9B: control sequence introducer) is
2840    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2841
2842    Character composition specification takes the following form:
2843         o ESC '0' -- start relative composition
2844         o ESC '1' -- end composition
2845         o ESC '2' -- start rule-base composition (*)
2846         o ESC '3' -- start relative composition with alternate chars  (**)
2847         o ESC '4' -- start rule-base composition with alternate chars  (**)
2848   Since these are not standard escape sequences of any ISO standard,
2849   the use of them with these meanings is restricted to Emacs only.
2850
2851   (*) This form is used only in Emacs 20.7 and older versions,
2852   but newer versions can safely decode it.
2853   (**) This form is used only in Emacs 21.1 and newer versions,
2854   and older versions can't decode it.
2855
2856   Here's a list of example usages of these composition escape
2857   sequences (categorized by `enum composition_method').
2858
2859   COMPOSITION_RELATIVE:
2860         ESC 0 CHAR [ CHAR ] ESC 1
2861   COMPOSITION_WITH_RULE:
2862         ESC 2 CHAR [ RULE CHAR ] ESC 1
2863   COMPOSITION_WITH_ALTCHARS:
2864         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2865   COMPOSITION_WITH_RULE_ALTCHARS:
2866         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2867
/* Table of 256 `enum iso_code_class_type' values.  Presumably indexed
   by byte value to give the ISO-2022 class of each byte; it is neither
   filled in nor consulted in the code immediately following -- TODO
   confirm against its uses elsewhere in this file.  */
2868 static enum iso_code_class_type iso_code_class[256];
2869
/* Return nonzero if the charset whose ID is ID is marked safe in
   CODING: ID must lie within 0..CODING->max_charset_id and its entry
   in CODING->safe_charsets must not be 255, the value that marks a
   charset as unusable.

   The lower bound matters.  A negative ID (the "no such charset"
   result of a table lookup) trivially satisfies the upper-bound test,
   and without the extra check it would index before the start of
   safe_charsets.  Such an ID is simply reported as not safe.

   Both arguments are evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)		\
  (0 <= (id)					\
   && (id) <= (coding)->max_charset_id		\
   && (coding)->safe_charsets[id] != 255)
2873
2874 static void
2875 setup_iso_safe_charsets (Lisp_Object attrs)
2876 {
2877   Lisp_Object charset_list, safe_charsets;
2878   Lisp_Object request;
2879   Lisp_Object reg_usage;
2880   Lisp_Object tail;
2881   EMACS_INT reg94, reg96;
2882   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2883   int max_charset_id;
2884
2885   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2886   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2887       && ! EQ (charset_list, Viso_2022_charset_list))
2888     {
2889       charset_list = Viso_2022_charset_list;
2890       ASET (attrs, coding_attr_charset_list, charset_list);
2891       ASET (attrs, coding_attr_safe_charsets, Qnil);
2892     }
2893
2894   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2895     return;
2896
2897   max_charset_id = 0;
2898   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2899     {
2900       int id = XINT (XCAR (tail));
2901       if (max_charset_id < id)
2902         max_charset_id = id;
2903     }
2904
2905   safe_charsets = make_uninit_string (max_charset_id + 1);
2906   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2907   request = AREF (attrs, coding_attr_iso_request);
2908   reg_usage = AREF (attrs, coding_attr_iso_usage);
2909   reg94 = XINT (XCAR (reg_usage));
2910   reg96 = XINT (XCDR (reg_usage));
2911
2912   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2913     {
2914       Lisp_Object id;
2915       Lisp_Object reg;
2916       struct charset *charset;
2917
2918       id = XCAR (tail);
2919       charset = CHARSET_FROM_ID (XINT (id));
2920       reg = Fcdr (Fassq (id, request));
2921       if (! NILP (reg))
2922         SSET (safe_charsets, XINT (id), XINT (reg));
2923       else if (charset->iso_chars_96)
2924         {
2925           if (reg96 < 4)
2926             SSET (safe_charsets, XINT (id), reg96);
2927         }
2928       else
2929         {
2930           if (reg94 < 4)
2931             SSET (safe_charsets, XINT (id), reg94);
2932         }
2933     }
2934   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2935 }
2936
2937
2938 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2939    Return true if a text is encoded in one of ISO-2022 based coding
2940    systems.

        The source bytes of CODING are scanned after skipping the first
        CODING->head_ascii bytes.  While scanning, two masks are
        accumulated: FOUND (categories for which positive evidence was
        seen) and REJECTED (categories the text cannot belong to).
        DETECT_INFO->checked always gains CATEGORY_MASK_ISO.

        The scan ends in one of two ways.  If every ISO category becomes
        rejected, CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and
        0 is returned.  Otherwise control reaches the label
        `no_more_source' (presumably via ONE_MORE_BYTE when the source is
        exhausted -- TODO confirm), REJECTED and the non-rejected part of
        FOUND are merged into DETECT_INFO, and 1 is returned.  */
2941
2942 static bool
2943 detect_coding_iso_2022 (struct coding_system *coding,
2944                         struct coding_detection_info *detect_info)
2945 {
2946   const unsigned char *src = coding->source, *src_base = src;
2947   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* MULTIBYTEP and CONSUMED_CHARS are not referenced by name below;
          presumably they are used by ONE_MORE_BYTE -- TODO confirm.  */
2948   bool multibytep = coding->src_multibyte;
       /* True while the most recent shift seen was a single shift.  */
2949   bool single_shifting = 0;
2950   int id;
2951   int c, c1;
2952   ptrdiff_t consumed_chars = 0;
2953   int i;
2954   int rejected = 0;
2955   int found = 0;
       /* -1 outside a composition; otherwise the count of characters seen
          since the escape sequence that started the composition.  */
2956   int composition_count = -1;
2957
2958   detect_info->checked |= CATEGORY_MASK_ISO;
2959
       /* Refresh the cached safe-charset table of each ISO category whose
          coding system has a non-negative id.  */
2960   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2961     {
2962       struct coding_system *this = &(coding_categories[i]);
2963       Lisp_Object attrs, val;
2964
2965       if (this->id < 0)
2966         continue;
2967       attrs = CODING_ID_ATTRS (this->id);
2968       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2969           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2970         setup_iso_safe_charsets (attrs);
2971       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2972       this->max_charset_id = SCHARS (val) - 1;
2973       this->safe_charsets = SDATA (val);
2974     }
2975
2976   /* A coding system of this category is always ASCII compatible.  */
2977   src += coding->head_ascii;
2978
2979   while (rejected != CATEGORY_MASK_ISO)
2980     {
2981       src_base = src;
2982       ONE_MORE_BYTE (c);
2983       switch (c)
2984         {
2985         case ISO_CODE_ESC:
               /* When escape detection is inhibited, escape sequences are
                  not used as evidence at all.  */
2986           if (inhibit_iso_escape_detection)
2987             break;
2988           single_shifting = 0;
2989           ONE_MORE_BYTE (c);
2990           if (c == 'N' || c == 'O')
2991             {
2992               /* ESC <Fe> for SS2 or SS3.  */
2993               single_shifting = 1;
2994               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2995             }
2996           else if (c == '1')
2997             {
2998               /* End of composition.  */
2999               if (composition_count < 0
3000                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3001                 /* Invalid */
3002                 break;
3003               composition_count = -1;
3004               found |= CATEGORY_MASK_ISO;
3005             }
3006           else if (c >= '0' && c <= '4')
3007             {
3008               /* ESC <Fp> for start/end composition.  ('1' was handled
                      above, so this is one of '0', '2', '3', '4'.)  */
3009               composition_count = 0;
3010             }
3011           else
3012             {
                   /* In the lookups below, the first index of
                      iso_charset_table is 0 for dimension 1 and 1 for
                      dimension 2; the second index `c >= ','' is 0 for the
                      intermediate characters '(' .. '+' and 1 for
                      ',' .. '/'.  */
3013               if (c >= '(' && c <= '/')
3014                 {
3015                   /* Designation sequence for a charset of dimension 1.  */
3016                   ONE_MORE_BYTE (c1);
3017                   if (c1 < ' ' || c1 >= 0x80
3018                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3019                     /* Invalid designation sequence.  Just ignore.  */
3020                     break;
3021                 }
3022               else if (c == '$')
3023                 {
3024                   /* Designation sequence for a charset of dimension 2.  */
3025                   ONE_MORE_BYTE (c);
3026                   if (c >= '@' && c <= 'B')
3027                     /* Designation for JISX0208.1978, GB2312, or JISX0208.
                            NOTE(review): unlike the other two lookups, ID is
                            not tested for a negative table entry here before
                            it is handed to SAFE_CHARSET_P.  */
3028                     id = iso_charset_table[1][0][c];
3029                   else if (c >= '(' && c <= '/')
3030                     {
3031                       ONE_MORE_BYTE (c1);
3032                       if (c1 < ' ' || c1 >= 0x80
3033                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3034                         /* Invalid designation sequence.  Just ignore.  */
3035                         break;
3036                     }
3037                   else
3038                     /* Invalid designation sequence.  Just ignore it.  */
3039                     break;
3040                 }
3041               else
3042                 {
3043                   /* Invalid escape sequence.  Just ignore it.  */
3044                   break;
3045                 }
3046
3047               /* We found a valid designation sequence for CHARSET.
                      Each of the four categories tested below is marked
                      found if the charset is safe for it, and rejected
                      otherwise.  */
3048               rejected |= CATEGORY_MASK_ISO_8BIT;
3049               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3050                                   id))
3051                 found |= CATEGORY_MASK_ISO_7;
3052               else
3053                 rejected |= CATEGORY_MASK_ISO_7;
3054               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3055                                   id))
3056                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3057               else
3058                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3059               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3060                                   id))
3061                 found |= CATEGORY_MASK_ISO_7_ELSE;
3062               else
3063                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3064               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3065                                   id))
3066                 found |= CATEGORY_MASK_ISO_8_ELSE;
3067               else
3068                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3069             }
3070           break;
3071
3072         case ISO_CODE_SO:
3073         case ISO_CODE_SI:
3074           /* Locking shift out/in.  */
3075           if (inhibit_iso_escape_detection)
3076             break;
3077           single_shifting = 0;
3078           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3079           break;
3080
3081         case ISO_CODE_CSI:
3082           /* Control sequence introducer.  */
3083           single_shifting = 0;
3084           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3085           found |= CATEGORY_MASK_ISO_8_ELSE;
3086           goto check_extra_latin;
3087
3088         case ISO_CODE_SS2:
3089         case ISO_CODE_SS3:
3090           /* Single shift.   */
3091           if (inhibit_iso_escape_detection)
3092             break;
3093           single_shifting = 0;
3094           rejected |= CATEGORY_MASK_ISO_7BIT;
3095           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3096               & CODING_ISO_FLAG_SINGLE_SHIFT)
3097             {
3098               found |= CATEGORY_MASK_ISO_8_1;
3099               single_shifting = 1;
3100             }
3101           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3102               & CODING_ISO_FLAG_SINGLE_SHIFT)
3103             {
3104               found |= CATEGORY_MASK_ISO_8_2;
3105               single_shifting = 1;
3106             }
3107           if (single_shifting)
3108             break;
               /* Neither 8-bit category uses single shift: fall through and
                  treat the byte as a possible extra Latin code.  */
3109         check_extra_latin:
               /* A byte that Vlatin_extra_code_table does not list rules
                  out every ISO category.  */
3110           if (! VECTORP (Vlatin_extra_code_table)
3111               || NILP (AREF (Vlatin_extra_code_table, c)))
3112             {
3113               rejected = CATEGORY_MASK_ISO;
3114               break;
3115             }
3116           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3117               & CODING_ISO_FLAG_LATIN_EXTRA)
3118             found |= CATEGORY_MASK_ISO_8_1;
3119           else
3120             rejected |= CATEGORY_MASK_ISO_8_1;
3121           rejected |= CATEGORY_MASK_ISO_8_2;
3122           break;
3123
3124         default:
               /* Presumably a negative value is how ONE_MORE_BYTE reports
                  something other than a plain byte in multibyte source --
                  TODO confirm.  It is skipped.  */
3125           if (c < 0)
3126             continue;
3127           if (c < 0x80)
3128             {
3129               if (composition_count >= 0)
3130                 composition_count++;
3131               single_shifting = 0;
3132               break;
3133             }
3134           if (c >= 0xA0)
3135             {
3136               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3137               found |= CATEGORY_MASK_ISO_8_1;
3138               /* Check the length of succeeding codes of the range
3139                  0xA0..0xFF.  If the byte length is even, we include
3140                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3141                  only when we are not single shifting.  */
3142               if (! single_shifting
3143                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3144                 {
                       /* LEN counts the current byte plus the bytes >= 0xA0
                          that follow it.  The byte that ends the run is
                          pushed back by restoring SRC.  */
3145                   int len = 1;
3146                   while (src < src_end)
3147                     {
3148                       src_base = src;
3149                       ONE_MORE_BYTE (c);
3150                       if (c < 0xA0)
3151                         {
3152                           src = src_base;
3153                           break;
3154                         }
3155                       len++;
3156                     }
3157
                       /* An odd run rejects ISO_8_2 only when more input
                          follows it; a run cut off by the end of the source
                          is treated like an even one.  */
3158                   if (len & 1 && src < src_end)
3159                     {
3160                       rejected |= CATEGORY_MASK_ISO_8_2;
3161                       if (composition_count >= 0)
3162                         composition_count += len;
3163                     }
3164                   else
3165                     {
3166                       found |= CATEGORY_MASK_ISO_8_2;
3167                       if (composition_count >= 0)
3168                         composition_count += len / 2;
3169                     }
3170                 }
3171               break;
3172             }
               /* A byte in 0x80..0x9F that matched no case label above ends
                  up here and leaves the detection state unchanged.  */
3173         }
3174     }
       /* The loop ended because every ISO category was rejected.  */
3175   detect_info->rejected |= CATEGORY_MASK_ISO;
3176   return 0;
3177
3178  no_more_source:
       /* Report what was accumulated; a category that was both found and
          rejected counts as rejected only.  */
3179   detect_info->rejected |= rejected;
3180   detect_info->found |= (found & ~rejected);
3181   return 1;
3182 }
3183
3184
/* Designate to graphic register REG of `coding' the charset selected
   by DIM, CHARS_96 and FINAL.

   The designation is valid only if FINAL is in the range '0'..127,
   ISO_CHARSET_TABLE yields a non-negative charset ID for it, and
   that ID satisfies SAFE_CHARSET_P for `coding'.  Otherwise the
   designation of REG is set to -2 and CHARS_96 is set to -1, which
   tells the caller that the escape sequence should be kept.

   CHARS_96 must be an lvalue.  The macro refers to the variable
   `coding' of the place where it is expanded.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int desig_id_ = -1;                                                 \
    int desig_ok_ = ((final) >= '0' && (final) < 128);                  \
                                                                        \
    if (desig_ok_)                                                      \
      {                                                                 \
        desig_id_ = ISO_CHARSET_TABLE (dim, chars_96, final);           \
        desig_ok_ = (desig_id_ >= 0                                     \
                     && SAFE_CHARSET_P (coding, desig_id_));            \
      }                                                                 \
    if (! desig_ok_)                                                    \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        (chars_96) = -1;                                                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int desig_prev_ = CODING_ISO_DESIGNATION (coding, reg);         \
                                                                        \
        /* With the corresponding flag set, JISX0201-Roman is treated   \
           as ASCII and JISX0208.1978 as JISX0208.  */                  \
        if (desig_id_ == charset_jisx0201_roman)                        \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              desig_id_ = charset_ascii;                                \
          }                                                             \
        else if (desig_id_ == charset_jisx0208_1978)                    \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              desig_id_ = charset_jisx0208;                             \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = desig_id_;               \
        /* An ASCII designation that follows an invalid designation to  \
           the same register must be kept as well.  */                  \
        if (desig_id_ == charset_ascii && desig_prev_ == -2)            \
          (chars_96) = -1;                                              \
      }                                                                 \
  } while (0)
3217
3218
/* Handle these composition sequences (ALT: alternate char):
3220
3221    (1) relative composition: ESC 0 CHAR ... ESC 1
3222    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3223    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3224    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3225
3226    When the start sequence (ESC 0/2/3/4) is found, this annotation
3227    header is produced.
3228
3229         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3230
3231    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3232    produced until the end sequence (ESC 1) is found:
3233
3234    (1) CHAR ... CHAR
3235    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3236    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3237    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3238
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3241
3242    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3243    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3244    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3245    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3246
3247    If an error is found while composing, the annotation header is
3248    changed to:
3249
        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3251
3252    and the sequence [ -2 DECODED-RULE ] is changed to the original
3253    byte sequence as below:
3254         o the original byte sequence is B: [ B -1 ]
3255         o the original byte sequence is B1 B2: [ B1 B2 ]
3256    and the sequence [ -1 -1 ] is changed to the original byte
3257    sequence:
3258         [ ESC '0' ]
3259 */
3260
/* Decode the composition rule whose first byte is C1 and store the
   encoded composition rule in RULE, which must be an int lvalue.

   A first byte below 32 + 81 is a complete rule in the old one-byte
   format.  Otherwise one more byte is read from the source with
   ONE_MORE_BYTE and the two bytes form a rule in the new format; 0x100
   is added to such a rule to distinguish it from the old format.

   Control jumps to invalid_code if C1 is below 32 or if the two-byte
   rule fails COMPOSITION_ENCODE_RULE_VALID.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    (rule) = c1 - 32;                                                   \
    if ((rule) < 0)                                                     \
      goto invalid_code;                                                \
    if ((rule) >= 81)           /* new format (after ver.21) */         \
      {                                                                 \
        int second_byte_;                                               \
                                                                        \
        ONE_MORE_BYTE (second_byte_);                                   \
        if (! COMPOSITION_ENCODE_RULE_VALID ((rule) - 81,               \
                                             second_byte_ - 32))        \
          goto invalid_code;                                            \
        (rule) = COMPOSITION_ENCODE_RULE ((rule) - 81,                  \
                                          second_byte_ - 32);           \
        (rule) += 0x100; /* Distinguish it from the old format.  */     \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int old_gref_ = (rule) / 9;                                     \
        int old_nref_ = (rule) % 9;                                     \
                                                                        \
        /* Reference point 4 of the old format is encoded as 10.  */    \
        if (old_gref_ == 4)                                             \
          old_gref_ = 10;                                               \
        if (old_nref_ == 4)                                             \
          old_nref_ = 10;                                               \
        (rule) = COMPOSITION_ENCODE_RULE (old_gref_, old_nref_);        \
      }                                                                 \
  } while (0)
3289
/* Convert the encoded composition rule RULE back to the byte(s) it was
   decoded from, store them in charbuf[idx] and charbuf[idx + 1], and
   add the number of bytes recovered to new_chars.

   A RULE below 0x100 came from the one-byte old format and is stored
   as [ BYTE -1 ].  Any other RULE came from the two-byte new format
   and is stored as [ BYTE1 BYTE2 ].

   RULE may be any int expression.  It is evaluated exactly once,
   before charbuf is modified, so it may read charbuf[idx + 1].  The
   macro refers to the variables `charbuf', `idx' and `new_chars' of
   the place where it is expanded.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    const int enc_rule_ = (rule);                               \
    int gref = (enc_rule_ % 0x100) / 12;                        \
    int nref = (enc_rule_ % 0x100) % 12;                        \
                                                                \
    if (enc_rule_ < 0x100)      /* old format */                \
      {                                                         \
        /* Reference point 10 is 4 in the old format.  */       \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3309
3310 /* Finish the current composition as invalid.  */
3311
3312 static int
3313 finish_composition (int *charbuf, struct composition_status *cmp_status)
3314 {
3315   int idx = - cmp_status->length;
3316   int new_chars;
3317
3318   /* Recover the original ESC sequence */
3319   charbuf[idx++] = ISO_CODE_ESC;
3320   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3321                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3322                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3323                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3324                     : '4');
3325   charbuf[idx++] = -2;
3326   charbuf[idx++] = 0;
3327   charbuf[idx++] = -1;
3328   new_chars = cmp_status->nchars;
3329   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3330     for (; idx < 0; idx++)
3331       {
3332         int elt = charbuf[idx];
3333
3334         if (elt == -2)
3335           {
3336             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3337             idx++;
3338           }
3339         else if (elt == -1)
3340           {
3341             charbuf[idx++] = ISO_CODE_ESC;
3342             charbuf[idx] = '0';
3343             new_chars += 2;
3344           }
3345       }
3346   cmp_status->state = COMPOSING_NO;
3347   return new_chars;
3348 }
3349
/* If a composition is in progress (cmp_status->state is not
   COMPOSING_NO), finish it with finish_composition and add the value
   that function returns to char_offset.  Otherwise do nothing.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3356
/* Handle the composition start sequence ESC C1, where C1 is '0', '2',
   '3', or '4'.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   If C1 is '0' and the components of an altchar or alt&rule
   composition have just been read, ESC 0 only ends the components:
   [ -1 -1 ] is stored in charbuf and the state becomes COMPOSING_CHAR.

   Otherwise any composition in progress is finished as invalid, the
   method and state of cmp_status are set from C1, and a new annotation
   header is produced with ADD_COMPOSITION_DATA (NCHARS and the
   following element are 0 for now).  cmp_status->length restarts at
   MAX_ANNOTATION_LENGTH and the char/component counts restart at 0.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int ends_components_                                                   \
      = ((c1) == '0'                                                       \
         && ((cmp_status->method == COMPOSITION_WITH_ALTCHARS              \
              && cmp_status->state == COMPOSING_COMPONENT_CHAR)            \
             || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS      \
                 && cmp_status->state == COMPOSING_COMPONENT_RULE)));      \
                                                                           \
    if (! ends_components_)                                                \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if ((c1) == '0')                                                   \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if ((c1) == '2')                                              \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if ((c1) == '3')                                              \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        /* ESC 3 and ESC 4 are followed by component chars first.  */      \
        cmp_status->state                                                  \
          = ((c1) > '2' ? COMPOSING_COMPONENT_CHAR : COMPOSING_CHAR);      \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        *charbuf = -1;                                                     \
        charbuf++;                                                         \
        *charbuf = -1;                                                     \
        charbuf++;                                                         \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
  } while (0)
3397
3398
/* Handle the composition end sequence ESC 1.

   The sequence is accepted only if at least one composed char has been
   stored and the state fits the method: for COMPOSITION_WITH_RULE the
   state must not be COMPOSING_CHAR, for every other method it must be
   COMPOSING_CHAR.  If it is not accepted, any composition in progress
   is finished as invalid and control jumps to invalid_code.

   On success the annotation header, which starts cmp_status->length
   elements before charbuf, is completed: for the two ALTCHARS methods
   the (negative) LENGTH in element 0 is extended by the space taken by
   the components, and element 2 receives the number of composed chars.
   That number is also added to char_offset, and the state returns to
   COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    if (cmp_status->nchars != 0                                         \
        && ((cmp_status->state == COMPOSING_CHAR)                       \
            != (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        int *cmp_header_ = charbuf - cmp_status->length;                \
                                                                        \
        if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)            \
          cmp_header_[0] -= cmp_status->ncomps + 2;                     \
        else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)  \
          cmp_header_[0] -= cmp_status->ncomps * 3;                     \
        cmp_header_[2] = cmp_status->nchars;                            \
        char_offset += cmp_status->nchars;                              \
        cmp_status->state = COMPOSING_NO;                               \
      }                                                                 \
    else                                                                \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
  } while (0)
3418
/* Append the pair [ -2 RULE ] to charbuf for the composition rule RULE,
   add 2 to cmp_status->length, and step cmp_status->state back by one
   so that a char is expected next.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    *charbuf = -2;                      \
    charbuf++;                          \
    *charbuf = (rule);                  \
    charbuf++;                          \
    cmp_status->length += 2;            \
    cmp_status->state -= 1;             \
  } while (0)
3428
/* Append C, a composed char or a component char, to charbuf and update
   cmp_status: length grows by one, and nchars is incremented when the
   state is COMPOSING_CHAR, ncomps otherwise.  When a rule has to
   follow the char (rule-based composition, or a component char of an
   alt&rule composition), the state advances by one.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS            \
        ? cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
        : cmp_status->method == COMPOSITION_WITH_RULE)                  \
      cmp_status->state += 1;                                           \
  } while (0)
3445
3446
3447 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3448
3449 static void
3450 decode_coding_iso_2022 (struct coding_system *coding)
3451 {
3452   const unsigned char *src = coding->source + coding->consumed;
3453   const unsigned char *src_end = coding->source + coding->src_bytes;
3454   const unsigned char *src_base;
3455   int *charbuf = coding->charbuf + coding->charbuf_used;
3456   /* We may produce two annotations (charset and composition) in one
3457      loop and one more charset annotation at the end.  */
3458   int *charbuf_end
3459     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3460   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3461   bool multibytep = coding->src_multibyte;
3462   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3463   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3464   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3465   int charset_id_2, charset_id_3;
3466   struct charset *charset;
3467   int c;
3468   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3469   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3470   ptrdiff_t char_offset = coding->produced_char;
3471   ptrdiff_t last_offset = char_offset;
3472   int last_id = charset_ascii;
3473   bool eol_dos
3474     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3475   int byte_after_cr = -1;
3476   int i;
3477
3478   setup_iso_safe_charsets (attrs);
3479   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3480
3481   if (cmp_status->state != COMPOSING_NO)
3482     {
3483       if (charbuf_end - charbuf < cmp_status->length)
3484         emacs_abort ();
3485       for (i = 0; i < cmp_status->length; i++)
3486         *charbuf++ = cmp_status->carryover[i];
3487       coding->annotated = 1;
3488     }
3489
3490   while (1)
3491     {
3492       int c1, c2, c3;
3493
3494       src_base = src;
3495       consumed_chars_base = consumed_chars;
3496
3497       if (charbuf >= charbuf_end)
3498         {
3499           if (byte_after_cr >= 0)
3500             src_base--;
3501           break;
3502         }
3503
3504       if (byte_after_cr >= 0)
3505         c1 = byte_after_cr, byte_after_cr = -1;
3506       else
3507         ONE_MORE_BYTE (c1);
3508       if (c1 < 0)
3509         goto invalid_code;
3510
3511       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3512         {
3513           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3514           char_offset++;
3515           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3516           continue;
3517         }
3518
3519       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3520         {
3521           if (c1 == ISO_CODE_ESC)
3522             {
3523               if (src + 1 >= src_end)
3524                 goto no_more_source;
3525               *charbuf++ = ISO_CODE_ESC;
3526               char_offset++;
3527               if (src[0] == '%' && src[1] == '@')
3528                 {
3529                   src += 2;
3530                   consumed_chars += 2;
3531                   char_offset += 2;
3532                   /* We are sure charbuf can contain two more chars. */
3533                   *charbuf++ = '%';
3534                   *charbuf++ = '@';
3535                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3536                 }
3537             }
3538           else
3539             {
3540               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3541               char_offset++;
3542             }
3543           continue;
3544         }
3545
3546       if ((cmp_status->state == COMPOSING_RULE
3547            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3548           && c1 != ISO_CODE_ESC)
3549         {
3550           int rule;
3551
3552           DECODE_COMPOSITION_RULE (rule);
3553           STORE_COMPOSITION_RULE (rule);
3554           continue;
3555         }
3556
3557       /* We produce at most one character.  */
3558       switch (iso_code_class [c1])
3559         {
3560         case ISO_0x20_or_0x7F:
3561           if (charset_id_0 < 0
3562               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3563             /* This is SPACE or DEL.  */
3564             charset = CHARSET_FROM_ID (charset_ascii);
3565           else
3566             charset = CHARSET_FROM_ID (charset_id_0);
3567           break;
3568
3569         case ISO_graphic_plane_0:
3570           if (charset_id_0 < 0)
3571             charset = CHARSET_FROM_ID (charset_ascii);
3572           else
3573             charset = CHARSET_FROM_ID (charset_id_0);
3574           break;
3575
3576         case ISO_0xA0_or_0xFF:
3577           if (charset_id_1 < 0
3578               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3579               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3580             goto invalid_code;
3581           /* This is a graphic character, we fall down ... */
3582
3583         case ISO_graphic_plane_1:
3584           if (charset_id_1 < 0)
3585             goto invalid_code;
3586           charset = CHARSET_FROM_ID (charset_id_1);
3587           break;
3588
3589         case ISO_control_0:
3590           if (eol_dos && c1 == '\r')
3591             ONE_MORE_BYTE (byte_after_cr);
3592           MAYBE_FINISH_COMPOSITION ();
3593           charset = CHARSET_FROM_ID (charset_ascii);
3594           break;
3595
3596         case ISO_control_1:
3597           goto invalid_code;
3598
3599         case ISO_shift_out:
3600           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3601               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3602             goto invalid_code;
3603           CODING_ISO_INVOCATION (coding, 0) = 1;
3604           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3605           continue;
3606
3607         case ISO_shift_in:
3608           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3609             goto invalid_code;
3610           CODING_ISO_INVOCATION (coding, 0) = 0;
3611           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3612           continue;
3613
3614         case ISO_single_shift_2_7:
3615           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3616             goto invalid_code;
3617         case ISO_single_shift_2:
3618           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3619             goto invalid_code;
3620           /* SS2 is handled as an escape sequence of ESC 'N' */
3621           c1 = 'N';
3622           goto label_escape_sequence;
3623
3624         case ISO_single_shift_3:
3625           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3626             goto invalid_code;
3627           /* SS2 is handled as an escape sequence of ESC 'O' */
3628           c1 = 'O';
3629           goto label_escape_sequence;
3630
3631         case ISO_control_sequence_introducer:
3632           /* CSI is handled as an escape sequence of ESC '[' ...  */
3633           c1 = '[';
3634           goto label_escape_sequence;
3635
3636         case ISO_escape:
3637           ONE_MORE_BYTE (c1);
3638         label_escape_sequence:
3639           /* Escape sequences handled here are invocation,
3640              designation, direction specification, and character
3641              composition specification.  */
3642           switch (c1)
3643             {
3644             case '&':           /* revision of following character set */
3645               ONE_MORE_BYTE (c1);
3646               if (!(c1 >= '@' && c1 <= '~'))
3647                 goto invalid_code;
3648               ONE_MORE_BYTE (c1);
3649               if (c1 != ISO_CODE_ESC)
3650                 goto invalid_code;
3651               ONE_MORE_BYTE (c1);
3652               goto label_escape_sequence;
3653
3654             case '$':           /* designation of 2-byte character set */
3655               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3656                 goto invalid_code;
3657               {
3658                 int reg, chars96;
3659
3660                 ONE_MORE_BYTE (c1);
3661                 if (c1 >= '@' && c1 <= 'B')
3662                   {     /* designation of JISX0208.1978, GB2312.1980,
3663                            or JISX0208.1980 */
3664                     reg = 0, chars96 = 0;
3665                   }
3666                 else if (c1 >= 0x28 && c1 <= 0x2B)
3667                   { /* designation of DIMENSION2_CHARS94 character set */
3668                     reg = c1 - 0x28, chars96 = 0;
3669                     ONE_MORE_BYTE (c1);
3670                   }
3671                 else if (c1 >= 0x2C && c1 <= 0x2F)
3672                   { /* designation of DIMENSION2_CHARS96 character set */
3673                     reg = c1 - 0x2C, chars96 = 1;
3674                     ONE_MORE_BYTE (c1);
3675                   }
3676                 else
3677                   goto invalid_code;
3678                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3679                 /* We must update these variables now.  */
3680                 if (reg == 0)
3681                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3682                 else if (reg == 1)
3683                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3684                 if (chars96 < 0)
3685                   goto invalid_code;
3686               }
3687               continue;
3688
3689             case 'n':           /* invocation of locking-shift-2 */
3690               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3691                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3692                 goto invalid_code;
3693               CODING_ISO_INVOCATION (coding, 0) = 2;
3694               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3695               continue;
3696
3697             case 'o':           /* invocation of locking-shift-3 */
3698               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3699                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3700                 goto invalid_code;
3701               CODING_ISO_INVOCATION (coding, 0) = 3;
3702               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3703               continue;
3704
3705             case 'N':           /* invocation of single-shift-2 */
3706               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3707                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3708                 goto invalid_code;
3709               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3710               if (charset_id_2 < 0)
3711                 charset = CHARSET_FROM_ID (charset_ascii);
3712               else
3713                 charset = CHARSET_FROM_ID (charset_id_2);
3714               ONE_MORE_BYTE (c1);
3715               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3716                 goto invalid_code;
3717               break;
3718
3719             case 'O':           /* invocation of single-shift-3 */
3720               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3721                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3722                 goto invalid_code;
3723               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3724               if (charset_id_3 < 0)
3725                 charset = CHARSET_FROM_ID (charset_ascii);
3726               else
3727                 charset = CHARSET_FROM_ID (charset_id_3);
3728               ONE_MORE_BYTE (c1);
3729               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3730                 goto invalid_code;
3731               break;
3732
3733             case '0': case '2': case '3': case '4': /* start composition */
3734               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3735                 goto invalid_code;
3736               if (last_id != charset_ascii)
3737                 {
3738                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3739                   last_id = charset_ascii;
3740                   last_offset = char_offset;
3741                 }
3742               DECODE_COMPOSITION_START (c1);
3743               continue;
3744
3745             case '1':           /* end composition */
3746               if (cmp_status->state == COMPOSING_NO)
3747                 goto invalid_code;
3748               DECODE_COMPOSITION_END ();
3749               continue;
3750
3751             case '[':           /* specification of direction */
3752               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3753                 goto invalid_code;
3754               /* For the moment, nested direction is not supported.
3755                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3756                  left-to-right, and nonzero means right-to-left.  */
3757               ONE_MORE_BYTE (c1);
3758               switch (c1)
3759                 {
3760                 case ']':       /* end of the current direction */
3761                   coding->mode &= ~CODING_MODE_DIRECTION;
3762
3763                 case '0':       /* end of the current direction */
3764                 case '1':       /* start of left-to-right direction */
3765                   ONE_MORE_BYTE (c1);
3766                   if (c1 == ']')
3767                     coding->mode &= ~CODING_MODE_DIRECTION;
3768                   else
3769                     goto invalid_code;
3770                   break;
3771
3772                 case '2':       /* start of right-to-left direction */
3773                   ONE_MORE_BYTE (c1);
3774                   if (c1 == ']')
3775                     coding->mode |= CODING_MODE_DIRECTION;
3776                   else
3777                     goto invalid_code;
3778                   break;
3779
3780                 default:
3781                   goto invalid_code;
3782                 }
3783               continue;
3784
3785             case '%':
3786               ONE_MORE_BYTE (c1);
3787               if (c1 == '/')
3788                 {
3789                   /* CTEXT extended segment:
3790                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3791                      We keep these bytes as is for the moment.
3792                      They may be decoded by post-read-conversion.  */
3793                   int dim, M, L;
3794                   int size;
3795
3796                   ONE_MORE_BYTE (dim);
3797                   if (dim < '0' || dim > '4')
3798                     goto invalid_code;
3799                   ONE_MORE_BYTE (M);
3800                   if (M < 128)
3801                     goto invalid_code;
3802                   ONE_MORE_BYTE (L);
3803                   if (L < 128)
3804                     goto invalid_code;
3805                   size = ((M - 128) * 128) + (L - 128);
3806                   if (charbuf + 6 > charbuf_end)
3807                     goto break_loop;
3808                   *charbuf++ = ISO_CODE_ESC;
3809                   *charbuf++ = '%';
3810                   *charbuf++ = '/';
3811                   *charbuf++ = dim;
3812                   *charbuf++ = BYTE8_TO_CHAR (M);
3813                   *charbuf++ = BYTE8_TO_CHAR (L);
3814                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3815                 }
3816               else if (c1 == 'G')
3817                 {
3818                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3819                      ESC % G --UTF-8-BYTES-- ESC % @
3820                      We keep these bytes as is for the moment.
3821                      They may be decoded by post-read-conversion.  */
3822                   if (charbuf + 3 > charbuf_end)
3823                     goto break_loop;
3824                   *charbuf++ = ISO_CODE_ESC;
3825                   *charbuf++ = '%';
3826                   *charbuf++ = 'G';
3827                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3828                 }
3829               else
3830                 goto invalid_code;
3831               continue;
3832               break;
3833
3834             default:
3835               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3836                 goto invalid_code;
3837               {
3838                 int reg, chars96;
3839
3840                 if (c1 >= 0x28 && c1 <= 0x2B)
3841                   { /* designation of DIMENSION1_CHARS94 character set */
3842                     reg = c1 - 0x28, chars96 = 0;
3843                     ONE_MORE_BYTE (c1);
3844                   }
3845                 else if (c1 >= 0x2C && c1 <= 0x2F)
3846                   { /* designation of DIMENSION1_CHARS96 character set */
3847                     reg = c1 - 0x2C, chars96 = 1;
3848                     ONE_MORE_BYTE (c1);
3849                   }
3850                 else
3851                   goto invalid_code;
3852                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3853                 /* We must update these variables now.  */
3854                 if (reg == 0)
3855                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3856                 else if (reg == 1)
3857                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3858                 if (chars96 < 0)
3859                   goto invalid_code;
3860               }
3861               continue;
3862             }
3863           break;
3864
3865         default:
3866           emacs_abort ();
3867         }
3868
3869       if (cmp_status->state == COMPOSING_NO
3870           && charset->id != charset_ascii
3871           && last_id != charset->id)
3872         {
3873           if (last_id != charset_ascii)
3874             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3875           last_id = charset->id;
3876           last_offset = char_offset;
3877         }
3878
3879       /* Now we know CHARSET and 1st position code C1 of a character.
3880          Produce a decoded character while getting 2nd and 3rd
3881          position codes C2, C3 if necessary.  */
3882       if (CHARSET_DIMENSION (charset) > 1)
3883         {
3884           ONE_MORE_BYTE (c2);
3885           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3886               || ((c1 & 0x80) != (c2 & 0x80)))
3887             /* C2 is not in a valid range.  */
3888             goto invalid_code;
3889           if (CHARSET_DIMENSION (charset) == 2)
3890             c1 = (c1 << 8) | c2;
3891           else
3892             {
3893               ONE_MORE_BYTE (c3);
3894               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3895                   || ((c1 & 0x80) != (c3 & 0x80)))
3896                 /* C3 is not in a valid range.  */
3897                 goto invalid_code;
3898               c1 = (c1 << 16) | (c2 << 8) | c2;
3899             }
3900         }
3901       c1 &= 0x7F7F7F;
3902       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3903       if (c < 0)
3904         {
3905           MAYBE_FINISH_COMPOSITION ();
3906           for (; src_base < src; src_base++, char_offset++)
3907             {
3908               if (ASCII_BYTE_P (*src_base))
3909                 *charbuf++ = *src_base;
3910               else
3911                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3912             }
3913         }
3914       else if (cmp_status->state == COMPOSING_NO)
3915         {
3916           *charbuf++ = c;
3917           char_offset++;
3918         }
3919       else if ((cmp_status->state == COMPOSING_CHAR
3920                 ? cmp_status->nchars
3921                 : cmp_status->ncomps)
3922                >= MAX_COMPOSITION_COMPONENTS)
3923         {
3924           /* Too long composition.  */
3925           MAYBE_FINISH_COMPOSITION ();
3926           *charbuf++ = c;
3927           char_offset++;
3928         }
3929       else
3930         STORE_COMPOSITION_CHAR (c);
3931       continue;
3932
3933     invalid_code:
3934       MAYBE_FINISH_COMPOSITION ();
3935       src = src_base;
3936       consumed_chars = consumed_chars_base;
3937       ONE_MORE_BYTE (c);
3938       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3939       char_offset++;
3940       coding->errors++;
3941       continue;
3942
3943     break_loop:
3944       break;
3945     }
3946
3947  no_more_source:
3948   if (cmp_status->state != COMPOSING_NO)
3949     {
3950       if (coding->mode & CODING_MODE_LAST_BLOCK)
3951         MAYBE_FINISH_COMPOSITION ();
3952       else
3953         {
3954           charbuf -= cmp_status->length;
3955           for (i = 0; i < cmp_status->length; i++)
3956             cmp_status->carryover[i] = charbuf[i];
3957         }
3958     }
3959   else if (last_id != charset_ascii)
3960     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3961   coding->consumed_char += consumed_chars_base;
3962   coding->consumed = src_base - coding->source;
3963   coding->charbuf_used = charbuf - coding->charbuf;
3964 }
3965
3966
3967 /* ISO2022 encoding stuff.  */
3968
3969 /*
3970    It is not enough to say just "ISO2022" on encoding, we have to
3971    specify more details.  In Emacs, each coding system of ISO2022
3972    variant has the following specifications:
3973         1. Initial designation to G0 thru G3.
3974         2. Allows short-form designation?
3975         3. ASCII should be designated to G0 before control characters?
3976         4. ASCII should be designated to G0 at end of line?
3977         5. 7-bit environment or 8-bit environment?
3978         6. Use locking-shift?
3979         7. Use Single-shift?
3980    And the following two are only for Japanese:
3981         8. Use ASCII in place of JIS0201-1976-Roman?
3982         9. Use JISX0208-1983 in place of JISX0208-1978?
3983    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3984    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3985    details.
3986 */
3987
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.  The sequence has the shape

     [ESC '&' <'@' + revision>]  ESC  <intermediates>  <final-char>

   The revision prefix is produced only when CODING has
   CODING_ISO_FLAG_REVISION set and CHARSET has a non-negative
   revision number.  For a DIMENSION1 charset the single intermediate
   is taken from "()*+" (94-charset) or ",-./" (96-charset), indexed
   by REG.  For any other dimension '$' comes first; the register
   intermediate that follows is omitted (short form) only for a
   94-charset designated to register 0 whose <final-char> is '@', 'A',
   or 'B', and only when CODING_ISO_FLAG_LONG_FORM is not set.

   CHARSET is evaluated several times, so it must be free of side
   effects.  The EMIT_* macros used here are defined elsewhere in this
   file.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    /* Intermediate characters, indexed by graphic register.  */        \
    const char *desig_inter                                             \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      {                                                                 \
        int desig_revision = CHARSET_ISO_REVISION (charset);            \
                                                                        \
        if (desig_revision >= 0)                                        \
          {                                                             \
            EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                   \
            EMIT_ONE_BYTE ('@' + desig_revision);                       \
          }                                                             \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only a 94-charset may use the short form.  */                \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || desig_final < '@' || desig_final > 'B')                  \
          EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                       \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                           \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4035
4036
4037 /* The following two macros produce codes (control character or escape
4038    sequence) for ISO2022 single-shift functions (single-shift-2 and
4039    single-shift-3).  */
4040
/* Emit single-shift-2: the ISO_CODE_SS2 control byte, or ESC 'N' when
   CODING is restricted to seven bits.  Then set the single-shifting
   flag of CODING, which ENCODE_ISO_CHARACTER_DIMENSION1/2 test and
   clear when they produce the next character.  Refers to the variable
   `coding' of the expansion site.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4049
4050
/* Emit single-shift-3: the ISO_CODE_SS3 control byte, or ESC 'O' when
   CODING is restricted to seven bits.  Then set the single-shifting
   flag of CODING, which ENCODE_ISO_CHARACTER_DIMENSION1/2 test and
   clear when they produce the next character.  Refers to the variable
   `coding' of the expansion site.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4059
4060
4061 /* The following four macros produce codes (control character or
4062    escape sequence) for ISO2022 locking-shift functions (shift-in,
4063    shift-out, locking-shift-2, and locking-shift-3).  */
4064
/* Shift-in: emit ISO_CODE_SI and record in CODING that graphic
   register 0 is now invoked to graphic plane 0.  Refers to the
   variable `coding' of the expansion site.  */
#define ENCODE_SHIFT_IN                                                 \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                \
      CODING_ISO_INVOCATION (coding, 0) = 0;                            \
    }                                                                   \
  while (0)
4070
4071
/* Shift-out: emit ISO_CODE_SO and record in CODING that graphic
   register 1 is now invoked to graphic plane 0.  Refers to the
   variable `coding' of the expansion site.  */
#define ENCODE_SHIFT_OUT                                                \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                \
      CODING_ISO_INVOCATION (coding, 0) = 1;                            \
    }                                                                   \
  while (0)
4077
4078
/* Locking-shift-2: emit ESC 'n' and record in CODING that graphic
   register 2 is now invoked to graphic plane 0.  Refers to the
   variable `coding' of the expansion site.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do                                                                    \
    {                                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                         \
      CODING_ISO_INVOCATION (coding, 0) = 2;                            \
    }                                                                   \
  while (0)
4084
4085
/* Locking-shift-3: emit ESC 'o' and record in CODING that graphic
   register 3 is now invoked to graphic plane 0.  The final byte must
   be 'o' (ISO 2022 LS3); ESC 'n' is locking-shift-2 and would make a
   reader invoke register 2 while CODING believes register 3 is
   active.  Refers to the variable `coding' of the expansion site.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4091
4092
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the JISX0201-Roman charset.  C1 may be any expression; it is
   parenthesized on every use, as in ENCODE_ISO_CHARACTER_DIMENSION2.
   The macro refers to the variables `coding', `dst' and
   `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift function was just produced; emit the          \
           character in the form matching the code width and clear      \
           the flag.  */                                                \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4135
4136
/* Emit a two-byte character of CHARSET whose position-codes are C1
   and C2, first producing whatever designation and invocation codes
   are needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, it is
   replaced by JISX0208-1978.  The macro refers to the variables
   `coding', `dst' and `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    /* A single-shift function was just produced: emit the character    \
       in the form matching the code width and clear the flag.  */      \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 0.  */                       \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 1.  */                       \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to a graphic register first if needed), then     \
       go around the loop again to produce the character itself.  */    \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4179
4180
/* Encode character C of CHARSET: obtain its code with
   CODING_ENCODE_CHAR, then produce it with the DIMENSION1 macro when
   CHARSET has dimension 1, and otherwise with the DIMENSION2 macro,
   passing the second-lowest and lowest bytes of the code as the two
   position-codes.  CHARSET must be a modifiable lvalue because those
   macros may assign to it.  The macro refers to the variables
   `coding', `dst' and `dst_end' of the expansion site.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned position_code;                                                \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c),              \
                        position_code);                                    \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
4191
4192
4193 /* Produce designation and invocation codes at a place pointed by DST
4194    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4195    Return new DST.  */
4196
4197 static unsigned char *
4198 encode_invocation_designation (struct charset *charset,
4199                                struct coding_system *coding,
4200                                unsigned char *dst, ptrdiff_t *p_nchars)
4201 {
4202   bool multibytep = coding->dst_multibyte;
4203   ptrdiff_t produced_chars = *p_nchars;
4204   int reg;                      /* graphic register number */
4205   int id = CHARSET_ID (charset);
4206
4207   /* At first, check designations.  */
4208   for (reg = 0; reg < 4; reg++)
4209     if (id == CODING_ISO_DESIGNATION (coding, reg))
4210       break;
4211
4212   if (reg >= 4)
4213     {
4214       /* CHARSET is not yet designated to any graphic registers.  */
4215       /* At first check the requested designation.  */
4216       reg = CODING_ISO_REQUEST (coding, id);
4217       if (reg < 0)
4218         /* Since CHARSET requests no special designation, designate it
4219            to graphic register 0.  */
4220         reg = 0;
4221
4222       ENCODE_DESIGNATION (charset, reg, coding);
4223     }
4224
4225   if (CODING_ISO_INVOCATION (coding, 0) != reg
4226       && CODING_ISO_INVOCATION (coding, 1) != reg)
4227     {
4228       /* Since the graphic register REG is not invoked to any graphic
4229          planes, invoke it to graphic plane 0.  */
4230       switch (reg)
4231         {
4232         case 0:                 /* graphic register 0 */
4233           ENCODE_SHIFT_IN;
4234           break;
4235
4236         case 1:                 /* graphic register 1 */
4237           ENCODE_SHIFT_OUT;
4238           break;
4239
4240         case 2:                 /* graphic register 2 */
4241           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4242             ENCODE_SINGLE_SHIFT_2;
4243           else
4244             ENCODE_LOCKING_SHIFT_2;
4245           break;
4246
4247         case 3:                 /* graphic register 3 */
4248           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4249             ENCODE_SINGLE_SHIFT_3;
4250           else
4251             ENCODE_LOCKING_SHIFT_3;
4252           break;
4253         }
4254     }
4255
4256   *p_nchars = produced_chars;
4257   return dst;
4258 }
4259
4260
/* Bring the graphic planes and registers back to their initial
   state: shift in if graphic plane 0 does not hold register 0, then
   re-designate every register that has an initial charset but is
   currently designated to something else.  Refers to the variable
   `coding' of the expansion site.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reset_reg);        \
        struct charset *initial_charset;                                \
                                                                        \
        /* Nothing to do for a register without an initial charset      \
           or one that is already in its initial state.  */             \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reset_reg) == initial_id) \
          continue;                                                     \
        initial_charset = CHARSET_FROM_ID (initial_id);                 \
        ENCODE_DESIGNATION (initial_charset, reset_reg, coding);        \
      }                                                                 \
  } while (0)
4279
4280
4281 /* Produce designation sequences of charsets in the line started from
4282    CHARBUF to a place pointed by DST, and return the number of
4283    produced bytes.  DST should not directly point a buffer text area
4284    which may be relocated by char_charset call.
4285
4286    If the current block ends before any end-of-line, we may fail to
4287    find all the necessary designations.  */
4288
4289 static ptrdiff_t
4290 encode_designation_at_bol (struct coding_system *coding,
4291                            int *charbuf, int *charbuf_end,
4292                            unsigned char *dst)
4293 {
4294   unsigned char *orig = dst;
4295   struct charset *charset;
4296   /* Table of charsets to be designated to each graphic register.  */
4297   int r[4];
4298   int c, found = 0, reg;
4299   ptrdiff_t produced_chars = 0;
4300   bool multibytep = coding->dst_multibyte;
4301   Lisp_Object attrs;
4302   Lisp_Object charset_list;
4303
4304   attrs = CODING_ID_ATTRS (coding->id);
4305   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4306   if (EQ (charset_list, Qiso_2022))
4307     charset_list = Viso_2022_charset_list;
4308
4309   for (reg = 0; reg < 4; reg++)
4310     r[reg] = -1;
4311
4312   while (charbuf < charbuf_end && found < 4)
4313     {
4314       int id;
4315
4316       c = *charbuf++;
4317       if (c == '\n')
4318         break;
4319       charset = char_charset (c, charset_list, NULL);
4320       id = CHARSET_ID (charset);
4321       reg = CODING_ISO_REQUEST (coding, id);
4322       if (reg >= 0 && r[reg] < 0)
4323         {
4324           found++;
4325           r[reg] = id;
4326         }
4327     }
4328
4329   if (found)
4330     {
4331       for (reg = 0; reg < 4; reg++)
4332         if (r[reg] >= 0
4333             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4334           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4335     }
4336
4337   return dst - orig;
4338 }
4339
4340 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4341
4342 static bool
4343 encode_coding_iso_2022 (struct coding_system *coding)
4344 {
4345   bool multibytep = coding->dst_multibyte;
4346   int *charbuf = coding->charbuf;
4347   int *charbuf_end = charbuf + coding->charbuf_used;
4348   unsigned char *dst = coding->destination + coding->produced;
4349   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4350   int safe_room = 16;
4351   bool bol_designation
4352     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4353        && CODING_ISO_BOL (coding));
4354   ptrdiff_t produced_chars = 0;
4355   Lisp_Object attrs, eol_type, charset_list;
4356   bool ascii_compatible;
4357   int c;
4358   int preferred_charset_id = -1;
4359
4360   CODING_GET_INFO (coding, attrs, charset_list);
4361   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4362   if (VECTORP (eol_type))
4363     eol_type = Qunix;
4364
4365   setup_iso_safe_charsets (attrs);
4366   /* Charset list may have been changed.  */
4367   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4368   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4369
4370   ascii_compatible
4371     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4372        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4373                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4374
4375   while (charbuf < charbuf_end)
4376     {
4377       ASSURE_DESTINATION (safe_room);
4378
4379       if (bol_designation)
4380         {
4381           /* We have to produce designation sequences if any now.  */
4382           unsigned char desig_buf[16];
4383           int nbytes;
4384           ptrdiff_t offset;
4385
4386           charset_map_loaded = 0;
4387           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4388                                               desig_buf);
4389           if (charset_map_loaded
4390               && (offset = coding_change_destination (coding)))
4391             {
4392               dst += offset;
4393               dst_end += offset;
4394             }
4395           memcpy (dst, desig_buf, nbytes);
4396           dst += nbytes;
4397           /* We are sure that designation sequences are all ASCII bytes.  */
4398           produced_chars += nbytes;
4399           bol_designation = 0;
4400           ASSURE_DESTINATION (safe_room);
4401         }
4402
4403       c = *charbuf++;
4404
4405       if (c < 0)
4406         {
4407           /* Handle an annotation.  */
4408           switch (*charbuf)
4409             {
4410             case CODING_ANNOTATE_COMPOSITION_MASK:
4411               /* Not yet implemented.  */
4412               break;
4413             case CODING_ANNOTATE_CHARSET_MASK:
4414               preferred_charset_id = charbuf[2];
4415               if (preferred_charset_id >= 0
4416                   && NILP (Fmemq (make_number (preferred_charset_id),
4417                                   charset_list)))
4418                 preferred_charset_id = -1;
4419               break;
4420             default:
4421               emacs_abort ();
4422             }
4423           charbuf += -c - 1;
4424           continue;
4425         }
4426
4427       /* Now encode the character C.  */
4428       if (c < 0x20 || c == 0x7F)
4429         {
4430           if (c == '\n'
4431               || (c == '\r' && EQ (eol_type, Qmac)))
4432             {
4433               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4434                 ENCODE_RESET_PLANE_AND_REGISTER ();
4435               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4436                 {
4437                   int i;
4438
4439                   for (i = 0; i < 4; i++)
4440                     CODING_ISO_DESIGNATION (coding, i)
4441                       = CODING_ISO_INITIAL (coding, i);
4442                 }
4443               bol_designation = ((CODING_ISO_FLAGS (coding)
4444                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4445                                  != 0);
4446             }
4447           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4448             ENCODE_RESET_PLANE_AND_REGISTER ();
4449           EMIT_ONE_ASCII_BYTE (c);
4450         }
4451       else if (ASCII_CHAR_P (c))
4452         {
4453           if (ascii_compatible)
4454             EMIT_ONE_ASCII_BYTE (c);
4455           else
4456             {
4457               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4458               ENCODE_ISO_CHARACTER (charset, c);
4459             }
4460         }
4461       else if (CHAR_BYTE8_P (c))
4462         {
4463           c = CHAR_TO_BYTE8 (c);
4464           EMIT_ONE_BYTE (c);
4465         }
4466       else
4467         {
4468           struct charset *charset;
4469
4470           if (preferred_charset_id >= 0)
4471             {
4472               bool result;
4473
4474               charset = CHARSET_FROM_ID (preferred_charset_id);
4475               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4476               if (! result)
4477                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4478                                      NULL, charset);
4479             }
4480           else
4481             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4482                                  NULL, charset);
4483           if (!charset)
4484             {
4485               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4486                 {
4487                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4488                   charset = CHARSET_FROM_ID (charset_ascii);
4489                 }
4490               else
4491                 {
4492                   c = coding->default_char;
4493                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4494                                        charset_list, NULL, charset);
4495                 }
4496             }
4497           ENCODE_ISO_CHARACTER (charset, c);
4498         }
4499     }
4500
4501   if (coding->mode & CODING_MODE_LAST_BLOCK
4502       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4503     {
4504       ASSURE_DESTINATION (safe_room);
4505       ENCODE_RESET_PLANE_AND_REGISTER ();
4506     }
4507   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4508   CODING_ISO_BOL (coding) = bol_designation;
4509   coding->produced_char += produced_chars;
4510   coding->produced = dst - coding->destination;
4511   return 0;
4512 }
4513
4514 \f
/*** 8. Shift-JIS and BIG5 handlers ***/
4516
4517 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4518    quite widely.  So, for the moment, Emacs supports them in the bare
4519    C code.  But, in the future, they may be supported only by CCL.  */
4520
4521 /* SJIS is a coding system encoding three character sets: ASCII, right
4522    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4523    as is.  A character of charset katakana-jisx0201 is encoded by
4524    "position-code + 0x80".  A character of charset japanese-jisx0208
4525    is encoded in two bytes, but the two position-codes are divided and
4526    shifted so that they fit in the ranges below.
4527
4528    --- CODE RANGE of SJIS ---
4529    (character set)      (range)
4530    ASCII                0x00 .. 0x7F
4531    KATAKANA-JISX0201    0xA0 .. 0xDF
4532    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4533             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4534    -------------------------------
4535
4536 */
4537
4538 /* BIG5 is a coding system encoding two character sets: ASCII and
4539    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4540    character set and is encoded in two bytes.
4541
4542    --- CODE RANGE of BIG5 ---
4543    (character set)      (range)
4544    ASCII                0x00 .. 0x7F
4545    Big5 (1st byte)      0xA1 .. 0xFE
4546         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4547    --------------------------
4548
4549   */
4550
4551 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4552    Return true if a text is encoded in SJIS.

        The scan starts CODING->head_ascii bytes into CODING->source.
        CATEGORY_MASK_SJIS is always added to DETECT_INFO->checked.  When
        a byte that cannot start or continue an SJIS sequence is seen,
        the mask is added to DETECT_INFO->rejected and 0 is returned.
        Otherwise, once the source is exhausted, the mask is added to
        DETECT_INFO->found only if at least one non-ASCII SJIS sequence
        was seen, and 1 is returned.

        NOTE(review): SRC_END, MULTIBYTEP and CONSUMED_CHARS are not
        referenced directly below; presumably ONE_MORE_BYTE (defined
        elsewhere) uses them and jumps to `no_more_source' when no byte
        is left -- confirm against the macro.  */
4553
4554 static bool
4555 detect_coding_sjis (struct coding_system *coding,
4556                     struct coding_detection_info *detect_info)
4557 {
4558   const unsigned char *src = coding->source, *src_base;
4559   const unsigned char *src_end = coding->source + coding->src_bytes;
4560   bool multibytep = coding->src_multibyte;
4561   ptrdiff_t consumed_chars = 0;
4562   int found = 0;
4563   int c;
4564   Lisp_Object attrs, charset_list;
4565   int max_first_byte_of_2_byte_code;
4566
4567   CODING_GET_INFO (coding, attrs, charset_list);
       /* A charset list with more than three entries widens the upper
          lead-byte range from 0xE0..0xEF to 0xE0..0xFC.  */
4568   max_first_byte_of_2_byte_code
4569     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4570
4571   detect_info->checked |= CATEGORY_MASK_SJIS;
4572   /* A coding system of this category is always ASCII compatible.  */
4573   src += coding->head_ascii;
4574
4575   while (1)
4576     {
           /* Remember where this sequence starts so that a sequence cut
              short by the end of input can be recognized below.  */
4577       src_base = src;
4578       ONE_MORE_BYTE (c);
           /* Values below 0x80 carry no evidence either way.  */
4579       if (c < 0x80)
4580         continue;
4581       if ((c >= 0x81 && c <= 0x9F)
4582           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4583         {
               /* Lead byte of a two-byte code: validate the trail byte.  */
4584           ONE_MORE_BYTE (c);
4585           if (c < 0x40 || c == 0x7F || c > 0xFC)
4586             break;
4587           found = CATEGORY_MASK_SJIS;
4588         }
4589       else if (c >= 0xA0 && c < 0xE0)
             /* Single-byte code in the 0xA0..0xDF range.  */
4590         found = CATEGORY_MASK_SJIS;
4591       else
4592         break;
4593     }
       /* Reached only via `break': an invalid byte was seen.  */
4594   detect_info->rejected |= CATEGORY_MASK_SJIS;
4595   return 0;
4596
4597  no_more_source:
       /* SRC_BASE < SRC means the input ended in the middle of a
          sequence; that is an error only if this is the last block.  */
4598   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4599     {
4600       detect_info->rejected |= CATEGORY_MASK_SJIS;
4601       return 0;
4602     }
4603   detect_info->found |= found;
4604   return 1;
4605 }
4606
4607 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4608    Return true if a text is encoded in BIG5.

        The scan starts CODING->head_ascii bytes into CODING->source.
        CATEGORY_MASK_BIG5 is always added to DETECT_INFO->checked.  When
        an invalid lead byte or an invalid trail byte is seen, the mask
        is added to DETECT_INFO->rejected and 0 is returned.  Otherwise,
        once the source is exhausted, the mask is added to
        DETECT_INFO->found only if at least one two-byte sequence was
        seen, and 1 is returned.

        NOTE(review): SRC_END, MULTIBYTEP and CONSUMED_CHARS are not
        referenced directly below; presumably ONE_MORE_BYTE (defined
        elsewhere) uses them and jumps to `no_more_source' when no byte
        is left -- confirm against the macro.  */
4609
4610 static bool
4611 detect_coding_big5 (struct coding_system *coding,
4612                     struct coding_detection_info *detect_info)
4613 {
4614   const unsigned char *src = coding->source, *src_base;
4615   const unsigned char *src_end = coding->source + coding->src_bytes;
4616   bool multibytep = coding->src_multibyte;
4617   ptrdiff_t consumed_chars = 0;
4618   int found = 0;
4619   int c;
4620
4621   detect_info->checked |= CATEGORY_MASK_BIG5;
4622   /* A coding system of this category is always ASCII compatible.  */
4623   src += coding->head_ascii;
4624
4625   while (1)
4626     {
           /* Remember where this sequence starts so that a sequence cut
              short by the end of input can be recognized below.  */
4627       src_base = src;
4628       ONE_MORE_BYTE (c);
           /* Values below 0x80 carry no evidence either way.  */
4629       if (c < 0x80)
4630         continue;
4631       if (c >= 0xA1)
4632         {
4633           ONE_MORE_BYTE (c);
               /* An invalid trail byte must reject BIG5 just as an
                  invalid lead byte does; leave the loop so that the
                  rejection below is recorded rather than returning
                  with DETECT_INFO untouched.  */
4634           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4635             break;
4636           found = CATEGORY_MASK_BIG5;
4637         }
4638       else
4639         break;
4640     }
       /* Reached only via `break': an invalid byte was seen.  */
4641   detect_info->rejected |= CATEGORY_MASK_BIG5;
4642   return 0;
4643
4644  no_more_source:
       /* SRC_BASE < SRC means the input ended in the middle of a
          sequence; that is an error only if this is the last block.  */
4645   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4646     {
4647       detect_info->rejected |= CATEGORY_MASK_BIG5;
4648       return 0;
4649     }
4650   detect_info->found |= found;
4651   return 1;
4652 }
4653
4654 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode SJIS bytes from CODING->source, starting at offset
        CODING->consumed, into CODING->charbuf starting at index
        CODING->charbuf_used.  On exit CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated.  A
        byte that cannot be decoded is stored through BYTE8_TO_CHAR and
        counted in CODING->errors.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
        jumps to `no_more_source' when the input is exhausted and may
        store a negative value in its argument -- confirm against the
        macro.  */
4655
4656 static void
4657 decode_coding_sjis (struct coding_system *coding)
4658 {
4659   const unsigned char *src = coding->source + coding->consumed;
4660   const unsigned char *src_end = coding->source + coding->src_bytes;
4661   const unsigned char *src_base;
4662   int *charbuf = coding->charbuf + coding->charbuf_used;
4663   /* We may produce one charset annotation in one loop and one more at
4664      the end.  */
4665   int *charbuf_end
4666     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4667   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4668   bool multibytep = coding->src_multibyte;
4669   struct charset *charset_roman, *charset_kanji, *charset_kana;
4670   struct charset *charset_kanji2;
4671   Lisp_Object attrs, charset_list, val;
4672   ptrdiff_t char_offset = coding->produced_char;
4673   ptrdiff_t last_offset = char_offset;
4674   int last_id = charset_ascii;
4675   bool eol_dos
4676     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 if none.  */
4677   int byte_after_cr = -1;
4678
4679   CODING_GET_INFO (coding, attrs, charset_list);
4680
       /* The charset list is read positionally: roman, kana, kanji, and
          an optional fourth charset (kanji2).  */
4681   val = charset_list;
4682   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4683   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4684   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4685   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4686
4687   while (1)
4688     {
4689       int c, c1;
4690       struct charset *charset;
4691
           /* Mark the start of this character so that `invalid_code' and
              the exit code can rewind to it.  */
4692       src_base = src;
4693       consumed_chars_base = consumed_chars;
4694
4695       if (charbuf >= charbuf_end)
4696         {
               /* The look-ahead byte has been read but not decoded; step
                  back so that it is not reported as consumed.  */
4697           if (byte_after_cr >= 0)
4698             src_base--;
4699           break;
4700         }
4701
4702       if (byte_after_cr >= 0)
4703         c = byte_after_cr, byte_after_cr = -1;
4704       else
4705         ONE_MORE_BYTE (c);
4706       if (c < 0)
4707         goto invalid_code;
4708       if (c < 0x80)
4709         {
               /* With DOS EOL conversion, read the byte following a CR
                  now; it is processed on the next iteration.  */
4710           if (eol_dos && c == '\r')
4711             ONE_MORE_BYTE (byte_after_cr);
4712           charset = charset_roman;
4713         }
4714       else if (c == 0x80 || c == 0xA0)
4715         goto invalid_code;
4716       else if (c >= 0xA1 && c <= 0xDF)
4717         {
4718           /* SJIS -> JISX0201-Kana */
4719           c &= 0x7F;
4720           charset = charset_kana;
4721         }
4722       else if (c <= 0xEF)
4723         {
4724           /* SJIS -> JISX0208 */
4725           ONE_MORE_BYTE (c1);
4726           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4727             goto invalid_code;
4728           c = (c << 8) | c1;
4729           SJIS_TO_JIS (c);
4730           charset = charset_kanji;
4731         }
4732       else if (c <= 0xFC && charset_kanji2)
4733         {
4734           /* SJIS -> JISX0213-2 */
4735           ONE_MORE_BYTE (c1);
4736           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4737             goto invalid_code;
4738           c = (c << 8) | c1;
4739           SJIS_TO_JIS2 (c);
4740           charset = charset_kanji2;
4741         }
4742       else
4743         goto invalid_code;
           /* When a non-ASCII charset differs from the one of the
              current run, close the previous non-ASCII run (if any)
              with an annotation and start a new run here.  */
4744       if (charset->id != charset_ascii
4745           && last_id != charset->id)
4746         {
4747           if (last_id != charset_ascii)
4748             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4749           last_id = charset->id;
4750           last_offset = char_offset;
4751         }
4752       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4753       *charbuf++ = c;
4754       char_offset++;
4755       continue;
4756
4757     invalid_code:
           /* Rewind to the start of the offending sequence and emit just
              its first byte as a raw byte; the remaining bytes are
              rescanned on the next iteration.  */
4758       src = src_base;
4759       consumed_chars = consumed_chars_base;
4760       ONE_MORE_BYTE (c);
4761       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4762       char_offset++;
4763       coding->errors++;
4764     }
4765
4766  no_more_source:
       /* Close the last non-ASCII run, then report progress up to the
          start of the last (possibly incomplete) character.  */
4767   if (last_id != charset_ascii)
4768     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4769   coding->consumed_char += consumed_chars_base;
4770   coding->consumed = src_base - coding->source;
4771   coding->charbuf_used = charbuf - coding->charbuf;
4772 }
4773
/* Decode BIG5 bytes from CODING->source, starting at offset
   CODING->consumed, into CODING->charbuf starting at index
   CODING->charbuf_used.  On exit CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  A byte
   that cannot be decoded is stored through BYTE8_TO_CHAR and counted
   in CODING->errors.

   NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
   jumps to `no_more_source' when the input is exhausted and may store
   a negative value in its argument -- confirm against the macro.  */
4774 static void
4775 decode_coding_big5 (struct coding_system *coding)
4776 {
4777   const unsigned char *src = coding->source + coding->consumed;
4778   const unsigned char *src_end = coding->source + coding->src_bytes;
4779   const unsigned char *src_base;
4780   int *charbuf = coding->charbuf + coding->charbuf_used;
4781   /* We may produce one charset annotation in one loop and one more at
4782      the end.  */
4783   int *charbuf_end
4784     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4785   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4786   bool multibytep = coding->src_multibyte;
4787   struct charset *charset_roman, *charset_big5;
4788   Lisp_Object attrs, charset_list, val;
4789   ptrdiff_t char_offset = coding->produced_char;
4790   ptrdiff_t last_offset = char_offset;
4791   int last_id = charset_ascii;
4792   bool eol_dos
4793     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a CR when EOL_DOS is set, or -1 if none.  */
4794   int byte_after_cr = -1;
4795
4796   CODING_GET_INFO (coding, attrs, charset_list);
       /* The charset list is read positionally: roman first, then Big5.  */
4797   val = charset_list;
4798   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4799   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4800
4801   while (1)
4802     {
4803       int c, c1;
4804       struct charset *charset;
4805
           /* Mark the start of this character so that `invalid_code' and
              the exit code can rewind to it.  */
4806       src_base = src;
4807       consumed_chars_base = consumed_chars;
4808
4809       if (charbuf >= charbuf_end)
4810         {
               /* The look-ahead byte has been read but not decoded; step
                  back so that it is not reported as consumed.  */
4811           if (byte_after_cr >= 0)
4812             src_base--;
4813           break;
4814         }
4815
4816       if (byte_after_cr >= 0)
4817         c = byte_after_cr, byte_after_cr = -1;
4818       else
4819         ONE_MORE_BYTE (c);
4820
4821       if (c < 0)
4822         goto invalid_code;
4823       if (c < 0x80)
4824         {
               /* With DOS EOL conversion, read the byte following a CR
                  now; it is processed on the next iteration.  */
4825           if (eol_dos && c == '\r')
4826             ONE_MORE_BYTE (byte_after_cr);
4827           charset = charset_roman;
4828         }
4829       else
4830         {
4831           /* BIG5 -> Big5 */
               /* Lead byte must be 0xA1..0xFE; trail byte must be
                  0x40..0x7E or 0xA1..0xFE.  */
4832           if (c < 0xA1 || c > 0xFE)
4833             goto invalid_code;
4834           ONE_MORE_BYTE (c1);
4835           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4836             goto invalid_code;
4837           c = c << 8 | c1;
4838           charset = charset_big5;
4839         }
           /* When a non-ASCII charset differs from the one of the
              current run, close the previous non-ASCII run (if any)
              with an annotation and start a new run here.  */
4840       if (charset->id != charset_ascii
4841           && last_id != charset->id)
4842         {
4843           if (last_id != charset_ascii)
4844             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4845           last_id = charset->id;
4846           last_offset = char_offset;
4847         }
4848       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4849       *charbuf++ = c;
4850       char_offset++;
4851       continue;
4852
4853     invalid_code:
           /* Rewind to the start of the offending sequence and emit just
              its first byte as a raw byte; the remaining bytes are
              rescanned on the next iteration.  */
4854       src = src_base;
4855       consumed_chars = consumed_chars_base;
4856       ONE_MORE_BYTE (c);
4857       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4858       char_offset++;
4859       coding->errors++;
4860     }
4861
4862  no_more_source:
       /* Close the last non-ASCII run, then report progress up to the
          start of the last (possibly incomplete) character.  */
4863   if (last_id != charset_ascii)
4864     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4865   coding->consumed_char += consumed_chars_base;
4866   coding->consumed = src_base - coding->source;
4867   coding->charbuf_used = charbuf - coding->charbuf;
4868 }
4869
4870 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4871    This function can encode charsets `ascii', `katakana-jisx0201',
4872    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4873    are sure that all these charsets are registered as official charset
4874    (i.e. do not have extended leading-codes).  Characters of other
4875    charsets are produced without any encoding.

        Encode the characters in CODING->charbuf (CODING->charbuf_used
        of them) into CODING->destination starting at offset
        CODING->produced.  On the normal exit path the result is
        recorded as CODING_RESULT_SUCCESS, CODING->produced and
        CODING->produced_char are updated, and 0 is returned.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are
        defined elsewhere; presumably they use DST, DST_END, MULTIBYTEP
        and PRODUCED_CHARS and handle a full destination themselves --
        confirm against the macros.  */
4876
4877 static bool
4878 encode_coding_sjis (struct coding_system *coding)
4879 {
4880   bool multibytep = coding->dst_multibyte;
4881   int *charbuf = coding->charbuf;
4882   int *charbuf_end = charbuf + coding->charbuf_used;
4883   unsigned char *dst = coding->destination + coding->produced;
4884   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4885   int safe_room = 4;
4886   ptrdiff_t produced_chars = 0;
4887   Lisp_Object attrs, charset_list, val;
4888   bool ascii_compatible;
4889   struct charset *charset_kanji, *charset_kana;
4890   struct charset *charset_kanji2;
4891   int c;
4892
4893   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first charset of the list; the following entries are
          kana, kanji, and an optional fourth charset (kanji2).  */
4894   val = XCDR (charset_list);
4895   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4896   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4897   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4898
4899   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4900
4901   while (charbuf < charbuf_end)
4902     {
4903       ASSURE_DESTINATION (safe_room);
4904       c = *charbuf++;
4905       /* Now encode the character C.  */
4906       if (ASCII_CHAR_P (c) && ascii_compatible)
4907         EMIT_ONE_ASCII_BYTE (c);
4908       else if (CHAR_BYTE8_P (c))
4909         {
               /* A raw-byte character is written back as its byte.  */
4910           c = CHAR_TO_BYTE8 (c);
4911           EMIT_ONE_BYTE (c);
4912         }
4913       else
4914         {
4915           unsigned code;
4916           struct charset *charset;
4917           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4918                                &code, charset);
4919
               /* No charset in the list was found for C.  */
4920           if (!charset)
4921             {
4922               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4923                 {
4924                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4925                   charset = CHARSET_FROM_ID (charset_ascii);
4926                 }
4927               else
4928                 {
                       /* Substitute the coding system's default
                          character and look it up instead.  */
4929                   c = coding->default_char;
4930                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4931                                        charset_list, &code, charset);
4932                 }
4933             }
4934           if (code == CHARSET_INVALID_CODE (charset))
4935             emacs_abort ();
4936           if (charset == charset_kanji)
4937             {
4938               int c1, c2;
4939               JIS_TO_SJIS (code);
4940               c1 = code >> 8, c2 = code & 0xFF;
4941               EMIT_TWO_BYTES (c1, c2);
4942             }
4943           else if (charset == charset_kana)
4944             EMIT_ONE_BYTE (code | 0x80);
4945           else if (charset_kanji2 && charset == charset_kanji2)
4946             {
4947               int c1, c2;
4948
                   /* Only codes whose high byte is one of the values
                      tested below are converted with JIS_TO_SJIS2; any
                      other code is emitted as its low seven bits.  */
4949               c1 = code >> 8;
4950               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4951                   || c1 == 0x28
4952                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4953                 {
4954                   JIS_TO_SJIS2 (code);
4955                   c1 = code >> 8, c2 = code & 0xFF;
4956                   EMIT_TWO_BYTES (c1, c2);
4957                 }
4958               else
4959                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4960             }
4961           else
4962             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4963         }
4964     }
4965   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4966   coding->produced_char += produced_chars;
4967   coding->produced = dst - coding->destination;
4968   return 0;
4969 }
4970
/* Encode the characters in CODING->charbuf (CODING->charbuf_used of
   them) as BIG5 into CODING->destination starting at offset
   CODING->produced.  On the normal exit path the result is recorded
   as CODING_RESULT_SUCCESS, CODING->produced and
   CODING->produced_char are updated, and 0 is returned.

   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   elsewhere; presumably they use DST, DST_END, MULTIBYTEP and
   PRODUCED_CHARS and handle a full destination themselves -- confirm
   against the macros.  */
4971 static bool
4972 encode_coding_big5 (struct coding_system *coding)
4973 {
4974   bool multibytep = coding->dst_multibyte;
4975   int *charbuf = coding->charbuf;
4976   int *charbuf_end = charbuf + coding->charbuf_used;
4977   unsigned char *dst = coding->destination + coding->produced;
4978   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character.  */
4979   int safe_room = 4;
4980   ptrdiff_t produced_chars = 0;
4981   Lisp_Object attrs, charset_list, val;
4982   bool ascii_compatible;
4983   struct charset *charset_big5;
4984   int c;
4985
4986   CODING_GET_INFO (coding, attrs, charset_list);
       /* The second entry of the charset list is the Big5 charset.  */
4987   val = XCDR (charset_list);
4988   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4989   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4990
4991   while (charbuf < charbuf_end)
4992     {
4993       ASSURE_DESTINATION (safe_room);
4994       c = *charbuf++;
4995       /* Now encode the character C.  */
4996       if (ASCII_CHAR_P (c) && ascii_compatible)
4997         EMIT_ONE_ASCII_BYTE (c);
4998       else if (CHAR_BYTE8_P (c))
4999         {
               /* A raw-byte character is written back as its byte.  */
5000           c = CHAR_TO_BYTE8 (c);
5001           EMIT_ONE_BYTE (c);
5002         }
5003       else
5004         {
5005           unsigned code;
5006           struct charset *charset;
5007           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5008                                &code, charset);
5009
               /* No charset in the list was found for C.  */
5010           if (! charset)
5011             {
5012               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5013                 {
5014                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5015                   charset = CHARSET_FROM_ID (charset_ascii);
5016                 }
5017               else
5018                 {
                       /* Substitute the coding system's default
                          character and look it up instead.  */
5019                   c = coding->default_char;
5020                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5021                                        charset_list, &code, charset);
5022                 }
5023             }
5024           if (code == CHARSET_INVALID_CODE (charset))
5025             emacs_abort ();
5026           if (charset == charset_big5)
5027             {
5028               int c1, c2;
5029
                   /* The code point is emitted as is, high byte first.  */
5030               c1 = code >> 8, c2 = code & 0xFF;
5031               EMIT_TWO_BYTES (c1, c2);
5032             }
5033           else
5034             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5035         }
5036     }
5037   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5038   coding->produced_char += produced_chars;
5039   coding->produced = dst - coding->destination;
5040   return 0;
5041 }
5042
5043 \f
5044 /*** 10. CCL handlers ***/
5045
5046 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5047    Return true if a text is encoded in a coding system of which
5048    encoder/decoder are written in CCL program.

        The source pointers and head_ascii are taken from the CODING
        argument, but the table of valid bytes and the attributes come
        from coding_categories[coding_category_ccl].  A byte whose
        entry in the table is zero (or a negative value from
        ONE_MORE_BYTE) rejects the category and 0 is returned.  An
        entry greater than one marks the category as found.  When the
        source is exhausted, 1 is returned.

        NOTE(review): presumably ONE_MORE_BYTE (defined elsewhere)
        jumps to `no_more_source' when no byte is left -- confirm
        against the macro.  */
5049
5050 static bool
5051 detect_coding_ccl (struct coding_system *coding,
5052                    struct coding_detection_info *detect_info)
5053 {
5054   const unsigned char *src = coding->source, *src_base;
5055   const unsigned char *src_end = coding->source + coding->src_bytes;
5056   bool multibytep = coding->src_multibyte;
5057   ptrdiff_t consumed_chars = 0;
5058   int found = 0;
5059   unsigned char *valids;
       /* Saved now because CODING is redirected below.  */
5060   ptrdiff_t head_ascii = coding->head_ascii;
5061   Lisp_Object attrs;
5062
5063   detect_info->checked |= CATEGORY_MASK_CCL;
5064
       /* From here on CODING refers to the CCL category's coding
          system, not to the one passed by the caller.  */
5065   coding = &coding_categories[coding_category_ccl];
5066   valids = CODING_CCL_VALIDS (coding);
5067   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII run can be skipped only if the coding system
          is ASCII compatible.  */
5068   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5069     src += head_ascii;
5070
5071   while (1)
5072     {
5073       int c;
5074
5075       src_base = src;
5076       ONE_MORE_BYTE (c);
5077       if (c < 0 || ! valids[c])
5078         break;
5079       if ((valids[c] > 1))
5080         found = CATEGORY_MASK_CCL;
5081     }
       /* Reached only via `break': an invalid byte was seen.  */
5082   detect_info->rejected |= CATEGORY_MASK_CCL;
5083   return 0;
5084
5085  no_more_source:
5086   detect_info->found |= found;
5087   return 1;
5088 }
5089
/* Decode CODING->source, starting at offset CODING->consumed, by
   running the CCL program in CODING->spec.ccl->ccl, and store the
   output in CODING->charbuf starting at index CODING->charbuf_used.
   The source is fed to ccl_driver in batches of at most 1024
   elements.  On exit the CCL status is mapped to a conversion result
   and CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  */
5090 static void
5091 decode_coding_ccl (struct coding_system *coding)
5092 {
5093   const unsigned char *src = coding->source + coding->consumed;
5094   const unsigned char *src_end = coding->source + coding->src_bytes;
5095   int *charbuf = coding->charbuf + coding->charbuf_used;
5096   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5097   ptrdiff_t consumed_chars = 0;
5098   bool multibytep = coding->src_multibyte;
5099   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* One batch of input for ccl_driver.  */
5100   int source_charbuf[1024];
       /* For a multibyte source, SOURCE_BYTEIDX[I] is the byte offset
          from SRC of element I of SOURCE_CHARBUF; one extra slot holds
          the offset just past the last element.  */
5101   int source_byteidx[1025];
5102   Lisp_Object attrs, charset_list;
5103
5104   CODING_GET_INFO (coding, attrs, charset_list);
5105
5106   while (1)
5107     {
5108       const unsigned char *p = src;
5109       int i = 0;
5110
           /* Fill the batch: one character per element for a multibyte
              source, one byte per element otherwise.  */
5111       if (multibytep)
5112         {
5113           while (i < 1024 && p < src_end)
5114             {
5115               source_byteidx[i] = p - src;
5116               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5117             }
5118           source_byteidx[i] = p - src;
5119         }
5120       else
5121         while (i < 1024 && p < src_end)
5122           source_charbuf[i++] = *p++;
5123
           /* Tell the CCL program when this batch reaches the end of the
              final block.  */
5124       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5125         ccl->last_block = 1;
5126       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5127                   charset_list);
5128       charbuf += ccl->produced;
           /* Advance SRC by the bytes that correspond to the elements
              the program consumed.  */
5129       if (multibytep)
5130         src += source_byteidx[ccl->consumed];
5131       else
5132         src += ccl->consumed;
5133       consumed_chars += ccl->consumed;
           /* Continue only if the whole source has not been batched yet
              and the program stopped merely for lack of input.  */
5134       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5135         break;
5136     }
5137
5138   switch (ccl->status)
5139     {
5140     case CCL_STAT_SUSPEND_BY_SRC:
5141       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5142       break;
5143     case CCL_STAT_SUSPEND_BY_DST:
5144       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5145       break;
5146     case CCL_STAT_QUIT:
5147     case CCL_STAT_INVALID_CMD:
5148       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5149       break;
5150     default:
5151       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5152       break;
5153     }
5154   coding->consumed_char += consumed_chars;
5155   coding->consumed = src - coding->source;
5156   coding->charbuf_used = charbuf - coding->charbuf;
5157 }
5158
/* Encode the characters in CODING->charbuf (CODING->charbuf_used of
   them) by running the CCL program in CODING->spec.ccl->ccl, and
   write the low eight bits of each output element to
   CODING->destination starting at offset CODING->produced.  On the
   normal exit path the CCL status is mapped to a conversion result,
   CODING->produced and CODING->produced_char are updated, and 0 is
   returned.

   NOTE(review): ASSURE_DESTINATION and EMIT_ONE_BYTE are defined
   elsewhere; presumably they use DST, DST_END, MULTIBYTEP and
   PRODUCED_CHARS and handle a full destination themselves -- confirm
   against the macros.  */
5159 static bool
5160 encode_coding_ccl (struct coding_system *coding)
5161 {
5162   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5163   bool multibytep = coding->dst_multibyte;
5164   int *charbuf = coding->charbuf;
5165   int *charbuf_end = charbuf + coding->charbuf_used;
5166   unsigned char *dst = coding->destination + coding->produced;
5167   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Output of one ccl_driver call.  */
5168   int destination_charbuf[1024];
5169   ptrdiff_t produced_chars = 0;
5170   int i;
5171   Lisp_Object attrs, charset_list;
5172
5173   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when all source characters have been
          consumed and this is the final block.  */
5174   if (coding->consumed_char == coding->src_chars
5175       && coding->mode & CODING_MODE_LAST_BLOCK)
5176     ccl->last_block = 1;
5177
5178   do
5179     {
5180       ccl_driver (ccl, charbuf, destination_charbuf,
5181                   charbuf_end - charbuf, 1024, charset_list);
5182       if (multibytep)
5183         {
               /* Twice the element count is requested for a multibyte
                  destination.  */
5184           ASSURE_DESTINATION (ccl->produced * 2);
5185           for (i = 0; i < ccl->produced; i++)
5186             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5187         }
5188       else
5189         {
5190           ASSURE_DESTINATION (ccl->produced);
5191           for (i = 0; i < ccl->produced; i++)
5192             *dst++ = destination_charbuf[i] & 0xFF;
5193           produced_chars += ccl->produced;
5194         }
5195       charbuf += ccl->consumed;
           /* Stop at once if the program quit or hit an invalid
              command.  */
5196       if (ccl->status == CCL_STAT_QUIT
5197           || ccl->status == CCL_STAT_INVALID_CMD)
5198         break;
5199     }
5200   while (charbuf < charbuf_end);
5201
5202   switch (ccl->status)
5203     {
5204     case CCL_STAT_SUSPEND_BY_SRC:
5205       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5206       break;
5207     case CCL_STAT_SUSPEND_BY_DST:
5208       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5209       break;
5210     case CCL_STAT_QUIT:
5211     case CCL_STAT_INVALID_CMD:
5212       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5213       break;
5214     default:
5215       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5216       break;
5217     }
5218
5219   coding->produced_char += produced_chars;
5220   coding->produced = dst - coding->destination;
5221   return 0;
5222 }
5223
5224 \f
5225 /*** 10, 11. no-conversion handlers ***/
5226
5227 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        No bytes are converted here: CODING->chars_at_source is set and
        the whole source is reported as consumed.  The one exception is
        DOS EOL conversion with a source that ends in CR: that final CR
        is left unconsumed and CODING_RESULT_INSUFFICIENT_SRC is
        recorded.  An empty source is reported as a success.  */
5228
5229 static void
5230 decode_coding_raw_text (struct coding_system *coding)
5231 {
5232   bool eol_dos
5233     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5234
5235   coding->chars_at_source = 1;
5236   coding->consumed_char = coding->src_chars;
5237   coding->consumed = coding->src_bytes;
       /* The SRC_BYTES check keeps an empty source from reading the
          byte before CODING->source.  */
5238   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5239     {
5240       coding->consumed_char--;
5241       coding->consumed--;
5242       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5243     }
5244   else
5245     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5246 }
5247
/* Copy the characters in CODING->charbuf (CODING->charbuf_used of
   them) to CODING->destination starting at offset CODING->produced.
   The four combinations of CODING->dst_multibyte and
   CODING->src_multibyte are handled separately.  On the normal exit
   path the result is recorded as CODING_RESULT_SUCCESS,
   CODING->produced and CODING->produced_char are updated, and 0 is
   returned.

   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   elsewhere; presumably they use DST, DST_END, MULTIBYTEP and
   PRODUCED_CHARS and handle a full destination themselves -- confirm
   against the macros.  */
5248 static bool
5249 encode_coding_raw_text (struct coding_system *coding)
5250 {
5251   bool multibytep = coding->dst_multibyte;
5252   int *charbuf = coding->charbuf;
5253   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5254   unsigned char *dst = coding->destination + coding->produced;
5255   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5256   ptrdiff_t produced_chars = 0;
5257   int c;
5258
5259   if (multibytep)
5260     {
5261       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5262
5263       if (coding->src_multibyte)
5264         while (charbuf < charbuf_end)
5265           {
5266             ASSURE_DESTINATION (safe_room);
5267             c = *charbuf++;
5268             if (ASCII_CHAR_P (c))
5269               EMIT_ONE_ASCII_BYTE (c);
5270             else if (CHAR_BYTE8_P (c))
5271               {
5272                 c = CHAR_TO_BYTE8 (c);
5273                 EMIT_ONE_BYTE (c);
5274               }
5275             else
5276               {
                     /* Expand C into its multibyte form in STR, then
                        emit each of those bytes in turn.  */
5277                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5278
5279                 CHAR_STRING_ADVANCE (c, p1);
5280                 do
5281                   {
5282                     EMIT_ONE_BYTE (*p0);
5283                     p0++;
5284                   }
5285                 while (p0 < p1);
5286               }
5287           }
5288       else
             /* Unibyte source: every element is emitted as one byte.  */
5289         while (charbuf < charbuf_end)
5290           {
5291             ASSURE_DESTINATION (safe_room);
5292             c = *charbuf++;
5293             EMIT_ONE_BYTE (c);
5294           }
5295     }
5296   else
5297     {
5298       if (coding->src_multibyte)
5299         {
5300           int safe_room = MAX_MULTIBYTE_LENGTH;
5301
5302           while (charbuf < charbuf_end)
5303             {
5304               ASSURE_DESTINATION (safe_room);
5305               c = *charbuf++;
5306               if (ASCII_CHAR_P (c))
5307                 *dst++ = c;
5308               else if (CHAR_BYTE8_P (c))
5309                 *dst++ = CHAR_TO_BYTE8 (c);
5310               else
                     /* Any other character is written to DST in its
                        multibyte form.  */
5311                 CHAR_STRING_ADVANCE (c, dst);
5312             }
5313         }
5314       else
5315         {
               /* Unibyte to unibyte: request room for everything at
                  once, then copy element by element.  */
5316           ASSURE_DESTINATION (charbuf_end - charbuf);
5317           while (charbuf < charbuf_end && dst < dst_end)
5318             *dst++ = *charbuf++;
5319         }
           /* For a unibyte destination the count is the number of bytes
              written by this call.  */
5320       produced_chars = dst - (coding->destination + coding->produced);
5321     }
5322   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5323   coding->produced_char += produced_chars;
5324   coding->produced = dst - coding->destination;
5325   return 0;
5326 }
5327
5328 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5329    Return true if a text is encoded in a charset-based coding system.

        The source pointers and head_ascii are taken from the CODING
        argument, but the attributes, the name and the table of valid
        bytes come from coding_categories[coding_category_charset].
        When a byte or byte sequence does not fit, or the source ends
        where a further byte of a multi-byte code is required,
        CATEGORY_MASK_CHARSET is added to DETECT_INFO->rejected and 0
        is returned.  Otherwise 1 is returned, and the mask is added to
        DETECT_INFO->found if a byte of 0x80 or above was accepted.

        NOTE(review): presumably ONE_MORE_BYTE (defined elsewhere)
        jumps to `no_more_source' when no byte is left -- confirm
        against the macro.  */
5330
5331 static bool
5332 detect_coding_charset (struct coding_system *coding,
5333                        struct coding_detection_info *detect_info)
5334 {
5335   const unsigned char *src = coding->source, *src_base;
5336   const unsigned char *src_end = coding->source + coding->src_bytes;
5337   bool multibytep = coding->src_multibyte;
5338   ptrdiff_t consumed_chars = 0;
5339   Lisp_Object attrs, valids, name;
5340   int found = 0;
       /* Saved now because CODING is redirected below.  */
5341   ptrdiff_t head_ascii = coding->head_ascii;
5342   bool check_latin_extra = 0;
5343
5344   detect_info->checked |= CATEGORY_MASK_CHARSET;
5345
       /* From here on CODING refers to the charset category's coding
          system, not to the one passed by the caller.  */
5346   coding = &coding_categories[coding_category_charset];
5347   attrs = CODING_ID_ATTRS (coding->id);
5348   valids = AREF (attrs, coding_attr_charset_valids);
5349   name = CODING_ID_NAME (coding->id);
       /* Coding systems named "iso-8859-..." or "iso-latin-..." get the
          extra check on bytes 0x80..0x9F below.  */
5350   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5351                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5352       || strncmp (SSDATA (SYMBOL_NAME (name)),
5353                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5354     check_latin_extra = 1;
5355
5356   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5357     src += head_ascii;
5358
5359   while (1)
5360     {
5361       int c;
5362       Lisp_Object val;
5363       struct charset *charset;
5364       int dim, idx;
5365
5366       src_base = src;
5367       ONE_MORE_BYTE (c);
5368       if (c < 0)
5369         continue;
           /* VAL is nil for a byte that starts no character, a charset
              id, or a list of charset ids.  */
5370       val = AREF (valids, c);
5371       if (NILP (val))
5372         break;
5373       if (c >= 0x80)
5374         {
               /* For the Latin coding systems, a byte in 0x80..0x9F is
                  accepted only if Vlatin_extra_code_table is a vector
                  with a non-nil entry for it.  */
5375           if (c < 0xA0
5376               && check_latin_extra
5377               && (!VECTORP (Vlatin_extra_code_table)
5378                   || NILP (AREF (Vlatin_extra_code_table, c))))
5379             break;
5380           found = CATEGORY_MASK_CHARSET;
5381         }
5382       if (INTEGERP (val))
5383         {
               /* A single candidate charset: each remaining byte of the
                  code must lie within that charset's code space.  */
5384           charset = CHARSET_FROM_ID (XFASTINT (val));
5385           dim = CHARSET_DIMENSION (charset);
5386           for (idx = 1; idx < dim; idx++)
5387             {
5388               if (src == src_end)
5389                 goto too_short;
5390               ONE_MORE_BYTE (c);
5391               if (c < charset->code_space[(dim - 1 - idx) * 4]
5392                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5393                 break;
5394             }
5395           if (idx < dim)
5396             break;
5397         }
5398       else
5399         {
               /* Several candidate charsets: try them in list order.
                  IDX is not reset between candidates, so bytes already
                  accepted are not read again.  */
5400           idx = 1;
5401           for (; CONSP (val); val = XCDR (val))
5402             {
5403               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5404               dim = CHARSET_DIMENSION (charset);
5405               while (idx < dim)
5406                 {
5407                   if (src == src_end)
5408                     goto too_short;
5409                   ONE_MORE_BYTE (c);
5410                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5411                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5412                     break;
5413                   idx++;
5414                 }
5415               if (idx == dim)
5416                 {
                       /* A candidate matched; clear VAL so that the
                          test after the loop does not reject.  */
5417                   val = Qnil;
5418                   break;
5419                 }
5420             }
5421           if (CONSP (val))
5422             break;
5423         }
5424     }
       /* Reached by `break' (invalid byte) or by the jump taken when
          the source ends in the middle of a multi-byte code.  */
5425  too_short:
5426   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5427   return 0;
5428
5429  no_more_source:
5430   detect_info->found |= found;
5431   return 1;
5432 }
5433
5434 static void
5435 decode_coding_charset (struct coding_system *coding)
5436 {
5437   const unsigned char *src = coding->source + coding->consumed;
5438   const unsigned char *src_end = coding->source + coding->src_bytes;
5439   const unsigned char *src_base;
5440   int *charbuf = coding->charbuf + coding->charbuf_used;
5441   /* We may produce one charset annotation in one loop and one more at
5442      the end.  */
5443   int *charbuf_end
5444     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5445   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5446   bool multibytep = coding->src_multibyte;
5447   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5448   Lisp_Object valids;
5449   ptrdiff_t char_offset = coding->produced_char;
5450   ptrdiff_t last_offset = char_offset;
5451   int last_id = charset_ascii;
5452   bool eol_dos
5453     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5454   int byte_after_cr = -1;
5455
5456   valids = AREF (attrs, coding_attr_charset_valids);
5457
5458   while (1)
5459     {
5460       int c;
5461       Lisp_Object val;
5462       struct charset *charset;
5463       int dim;
5464       int len = 1;
5465       unsigned code;
5466
5467       src_base = src;
5468       consumed_chars_base = consumed_chars;
5469
5470       if (charbuf >= charbuf_end)
5471         {
5472           if (byte_after_cr >= 0)
5473             src_base--;
5474           break;
5475         }
5476
5477       if (byte_after_cr >= 0)
5478         {
5479           c = byte_after_cr;
5480           byte_after_cr = -1;
5481         }
5482       else
5483         {
5484           ONE_MORE_BYTE (c);
5485           if (eol_dos && c == '\r')
5486             ONE_MORE_BYTE (byte_after_cr);
5487         }
5488       if (c < 0)
5489         goto invalid_code;
5490       code = c;
5491
5492       val = AREF (valids, c);
5493       if (! INTEGERP (val) && ! CONSP (val))
5494         goto invalid_code;
5495       if (INTEGERP (val))
5496         {
5497           charset = CHARSET_FROM_ID (XFASTINT (val));
5498           dim = CHARSET_DIMENSION (charset);
5499           while (len < dim)
5500             {
5501               ONE_MORE_BYTE (c);
5502               code = (code << 8) | c;
5503               len++;
5504             }
5505           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5506                               charset, code, c);
5507         }
5508       else
5509         {
5510           /* VAL is a list of charset IDs.  It is assured that the
5511              list is sorted by charset dimensions (smaller one
5512              comes first).  */
5513           while (CONSP (val))
5514             {
5515               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5516               dim = CHARSET_DIMENSION (charset);
5517               while (len < dim)
5518                 {
5519                   ONE_MORE_BYTE (c);
5520                   code = (code << 8) | c;
5521                   len++;
5522                 }
5523               CODING_DECODE_CHAR (coding, src, src_base,
5524                                   src_end, charset, code, c);
5525               if (c >= 0)
5526                 break;
5527               val = XCDR (val);
5528             }
5529         }
5530       if (c < 0)
5531         goto invalid_code;
5532       if (charset->id != charset_ascii
5533           && last_id != charset->id)
5534         {
5535           if (last_id != charset_ascii)
5536             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5537           last_id = charset->id;
5538           last_offset = char_offset;
5539         }
5540
5541       *charbuf++ = c;
5542       char_offset++;
5543       continue;
5544
5545     invalid_code:
5546       src = src_base;
5547       consumed_chars = consumed_chars_base;
5548       ONE_MORE_BYTE (c);
5549       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5550       char_offset++;
5551       coding->errors++;
5552     }
5553
5554  no_more_source:
5555   if (last_id != charset_ascii)
5556     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5557   coding->consumed_char += consumed_chars_base;
5558   coding->consumed = src_base - coding->source;
5559   coding->charbuf_used = charbuf - coding->charbuf;
5560 }
5561
5562 static bool
5563 encode_coding_charset (struct coding_system *coding)
5564 {
5565   bool multibytep = coding->dst_multibyte;
5566   int *charbuf = coding->charbuf;
5567   int *charbuf_end = charbuf + coding->charbuf_used;
5568   unsigned char *dst = coding->destination + coding->produced;
5569   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5570   int safe_room = MAX_MULTIBYTE_LENGTH;
5571   ptrdiff_t produced_chars = 0;
5572   Lisp_Object attrs, charset_list;
5573   bool ascii_compatible;
5574   int c;
5575
5576   CODING_GET_INFO (coding, attrs, charset_list);
5577   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5578
5579   while (charbuf < charbuf_end)
5580     {
5581       struct charset *charset;
5582       unsigned code;
5583
5584       ASSURE_DESTINATION (safe_room);
5585       c = *charbuf++;
5586       if (ascii_compatible && ASCII_CHAR_P (c))
5587         EMIT_ONE_ASCII_BYTE (c);
5588       else if (CHAR_BYTE8_P (c))
5589         {
5590           c = CHAR_TO_BYTE8 (c);
5591           EMIT_ONE_BYTE (c);
5592         }
5593       else
5594         {
5595           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5596                                &code, charset);
5597
5598           if (charset)
5599             {
5600               if (CHARSET_DIMENSION (charset) == 1)
5601                 EMIT_ONE_BYTE (code);
5602               else if (CHARSET_DIMENSION (charset) == 2)
5603                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5604               else if (CHARSET_DIMENSION (charset) == 3)
5605                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5606               else
5607                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5608                                  (code >> 8) & 0xFF, code & 0xFF);
5609             }
5610           else
5611             {
5612               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5613                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5614               else
5615                 c = coding->default_char;
5616               EMIT_ONE_BYTE (c);
5617             }
5618         }
5619     }
5620
5621   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5622   coding->produced_char += produced_chars;
5623   coding->produced = dst - coding->destination;
5624   return 0;
5625 }
5626
5627 \f
5628 /*** 10. C library functions ***/
5629
5630 /* Setup coding context CODING from information about CODING_SYSTEM.
5631    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5632    CODING_SYSTEM is invalid, signal an error.  */
5633
5634 void
5635 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5636 {
5637   Lisp_Object attrs;
5638   Lisp_Object eol_type;
5639   Lisp_Object coding_type;
5640   Lisp_Object val;
5641
5642   if (NILP (coding_system))
5643     coding_system = Qundecided;
5644
5645   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5646
5647   attrs = CODING_ID_ATTRS (coding->id);
5648   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5649
5650   coding->mode = 0;
5651   coding->head_ascii = -1;
5652   if (VECTORP (eol_type))
5653     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5654                             | CODING_REQUIRE_DETECTION_MASK);
5655   else if (! EQ (eol_type, Qunix))
5656     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5657                             | CODING_REQUIRE_ENCODING_MASK);
5658   else
5659     coding->common_flags = 0;
5660   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5661     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5662   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5663     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5664   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5665     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5666
5667   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5668   coding->max_charset_id = SCHARS (val) - 1;
5669   coding->safe_charsets = SDATA (val);
5670   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5671   coding->carryover_bytes = 0;
5672
5673   coding_type = CODING_ATTR_TYPE (attrs);
5674   if (EQ (coding_type, Qundecided))
5675     {
5676       coding->detector = NULL;
5677       coding->decoder = decode_coding_raw_text;
5678       coding->encoder = encode_coding_raw_text;
5679       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5680     }
5681   else if (EQ (coding_type, Qiso_2022))
5682     {
5683       int i;
5684       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5685
5686       /* Invoke graphic register 0 to plane 0.  */
5687       CODING_ISO_INVOCATION (coding, 0) = 0;
5688       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5689       CODING_ISO_INVOCATION (coding, 1)
5690         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5691       /* Setup the initial status of designation.  */
5692       for (i = 0; i < 4; i++)
5693         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5694       /* Not single shifting initially.  */
5695       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5696       /* Beginning of buffer should also be regarded as bol. */
5697       CODING_ISO_BOL (coding) = 1;
5698       coding->detector = detect_coding_iso_2022;
5699       coding->decoder = decode_coding_iso_2022;
5700       coding->encoder = encode_coding_iso_2022;
5701       if (flags & CODING_ISO_FLAG_SAFE)
5702         coding->mode |= CODING_MODE_SAFE_ENCODING;
5703       coding->common_flags
5704         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5705             | CODING_REQUIRE_FLUSHING_MASK);
5706       if (flags & CODING_ISO_FLAG_COMPOSITION)
5707         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5708       if (flags & CODING_ISO_FLAG_DESIGNATION)
5709         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5710       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5711         {
5712           setup_iso_safe_charsets (attrs);
5713           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5714           coding->max_charset_id = SCHARS (val) - 1;
5715           coding->safe_charsets = SDATA (val);
5716         }
5717       CODING_ISO_FLAGS (coding) = flags;
5718       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5719       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5720       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5721       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5722     }
5723   else if (EQ (coding_type, Qcharset))
5724     {
5725       coding->detector = detect_coding_charset;
5726       coding->decoder = decode_coding_charset;
5727       coding->encoder = encode_coding_charset;
5728       coding->common_flags
5729         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5730     }
5731   else if (EQ (coding_type, Qutf_8))
5732     {
5733       val = AREF (attrs, coding_attr_utf_bom);
5734       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5735                                    : EQ (val, Qt) ? utf_with_bom
5736                                    : utf_without_bom);
5737       coding->detector = detect_coding_utf_8;
5738       coding->decoder = decode_coding_utf_8;
5739       coding->encoder = encode_coding_utf_8;
5740       coding->common_flags
5741         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5742       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5743         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5744     }
5745   else if (EQ (coding_type, Qutf_16))
5746     {
5747       val = AREF (attrs, coding_attr_utf_bom);
5748       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5749                                     : EQ (val, Qt) ? utf_with_bom
5750                                     : utf_without_bom);
5751       val = AREF (attrs, coding_attr_utf_16_endian);
5752       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5753                                        : utf_16_little_endian);
5754       CODING_UTF_16_SURROGATE (coding) = 0;
5755       coding->detector = detect_coding_utf_16;
5756       coding->decoder = decode_coding_utf_16;
5757       coding->encoder = encode_coding_utf_16;
5758       coding->common_flags
5759         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5760       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5761         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5762     }
5763   else if (EQ (coding_type, Qccl))
5764     {
5765       coding->detector = detect_coding_ccl;
5766       coding->decoder = decode_coding_ccl;
5767       coding->encoder = encode_coding_ccl;
5768       coding->common_flags
5769         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5770             | CODING_REQUIRE_FLUSHING_MASK);
5771     }
5772   else if (EQ (coding_type, Qemacs_mule))
5773     {
5774       coding->detector = detect_coding_emacs_mule;
5775       coding->decoder = decode_coding_emacs_mule;
5776       coding->encoder = encode_coding_emacs_mule;
5777       coding->common_flags
5778         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5779       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5780           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5781         {
5782           Lisp_Object tail, safe_charsets;
5783           int max_charset_id = 0;
5784
5785           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5786                tail = XCDR (tail))
5787             if (max_charset_id < XFASTINT (XCAR (tail)))
5788               max_charset_id = XFASTINT (XCAR (tail));
5789           safe_charsets = make_uninit_string (max_charset_id + 1);
5790           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5791           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5792                tail = XCDR (tail))
5793             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5794           coding->max_charset_id = max_charset_id;
5795           coding->safe_charsets = SDATA (safe_charsets);
5796         }
5797       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5798       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5799     }
5800   else if (EQ (coding_type, Qshift_jis))
5801     {
5802       coding->detector = detect_coding_sjis;
5803       coding->decoder = decode_coding_sjis;
5804       coding->encoder = encode_coding_sjis;
5805       coding->common_flags
5806         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5807     }
5808   else if (EQ (coding_type, Qbig5))
5809     {
5810       coding->detector = detect_coding_big5;
5811       coding->decoder = decode_coding_big5;
5812       coding->encoder = encode_coding_big5;
5813       coding->common_flags
5814         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5815     }
5816   else                          /* EQ (coding_type, Qraw_text) */
5817     {
5818       coding->detector = NULL;
5819       coding->decoder = decode_coding_raw_text;
5820       coding->encoder = encode_coding_raw_text;
5821       if (! EQ (eol_type, Qunix))
5822         {
5823           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5824           if (! VECTORP (eol_type))
5825             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5826         }
5827
5828     }
5829
5830   return;
5831 }
5832
5833 /* Return a list of charsets supported by CODING.  */
5834
5835 Lisp_Object
5836 coding_charset_list (struct coding_system *coding)
5837 {
5838   Lisp_Object attrs, charset_list;
5839
5840   CODING_GET_INFO (coding, attrs, charset_list);
5841   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5842     {
5843       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5844
5845       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5846         charset_list = Viso_2022_charset_list;
5847     }
5848   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5849     {
5850       charset_list = Vemacs_mule_charset_list;
5851     }
5852   return charset_list;
5853 }
5854
5855
5856 /* Return a list of charsets supported by CODING-SYSTEM.  */
5857
5858 Lisp_Object
5859 coding_system_charset_list (Lisp_Object coding_system)
5860 {
5861   ptrdiff_t id;
5862   Lisp_Object attrs, charset_list;
5863
5864   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5865   attrs = CODING_ID_ATTRS (id);
5866
5867   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5868     {
5869       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5870
5871       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5872         charset_list = Viso_2022_charset_list;
5873       else
5874         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5875     }
5876   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5877     {
5878       charset_list = Vemacs_mule_charset_list;
5879     }
5880   else
5881     {
5882       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5883     }
5884   return charset_list;
5885 }
5886
5887
5888 /* Return raw-text or one of its subsidiaries that has the same
5889    eol_type as CODING-SYSTEM.  */
5890
5891 Lisp_Object
5892 raw_text_coding_system (Lisp_Object coding_system)
5893 {
5894   Lisp_Object spec, attrs;
5895   Lisp_Object eol_type, raw_text_eol_type;
5896
5897   if (NILP (coding_system))
5898     return Qraw_text;
5899   spec = CODING_SYSTEM_SPEC (coding_system);
5900   attrs = AREF (spec, 0);
5901
5902   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5903     return coding_system;
5904
5905   eol_type = AREF (spec, 2);
5906   if (VECTORP (eol_type))
5907     return Qraw_text;
5908   spec = CODING_SYSTEM_SPEC (Qraw_text);
5909   raw_text_eol_type = AREF (spec, 2);
5910   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5911           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5912           : AREF (raw_text_eol_type, 2));
5913 }
5914
5915
5916 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5917    the subsidiary that has the same eol-spec as PARENT (if it is not
5918    nil and specifies end-of-line format) or the system's setting
5919    (system_eol_type).  */
5920
5921 Lisp_Object
5922 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5923 {
5924   Lisp_Object spec, eol_type;
5925
5926   if (NILP (coding_system))
5927     coding_system = Qraw_text;
5928   spec = CODING_SYSTEM_SPEC (coding_system);
5929   eol_type = AREF (spec, 2);
5930   if (VECTORP (eol_type))
5931     {
5932       Lisp_Object parent_eol_type;
5933
5934       if (! NILP (parent))
5935         {
5936           Lisp_Object parent_spec;
5937
5938           parent_spec = CODING_SYSTEM_SPEC (parent);
5939           parent_eol_type = AREF (parent_spec, 2);
5940           if (VECTORP (parent_eol_type))
5941             parent_eol_type = system_eol_type;
5942         }
5943       else
5944         parent_eol_type = system_eol_type;
5945       if (EQ (parent_eol_type, Qunix))
5946         coding_system = AREF (eol_type, 0);
5947       else if (EQ (parent_eol_type, Qdos))
5948         coding_system = AREF (eol_type, 1);
5949       else if (EQ (parent_eol_type, Qmac))
5950         coding_system = AREF (eol_type, 2);
5951     }
5952   return coding_system;
5953 }
5954
5955
5956 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5957    decided for writing to a process.  If not, complement them, and
5958    return a new coding system.  */
5959
5960 Lisp_Object
5961 complement_process_encoding_system (Lisp_Object coding_system)
5962 {
5963   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5964   Lisp_Object spec, attrs;
5965   int i;
5966
5967   for (i = 0; i < 3; i++)
5968     {
5969       if (i == 1)
5970         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5971       else if (i == 2)
5972         coding_system = preferred_coding_system ();
5973       spec = CODING_SYSTEM_SPEC (coding_system);
5974       if (NILP (spec))
5975         continue;
5976       attrs = AREF (spec, 0);
5977       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5978         coding_base = CODING_ATTR_BASE_NAME (attrs);
5979       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5980         eol_base = coding_system;
5981       if (! NILP (coding_base) && ! NILP (eol_base))
5982         break;
5983     }
5984
5985   if (i > 0)
5986     /* The original CODING_SYSTEM didn't specify text-conversion or
5987        eol-conversion.  Be sure that we return a fully complemented
5988        coding system.  */
5989     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5990   return coding_system;
5991 }
5992
5993
5994 /* Emacs has a mechanism to automatically detect a coding system if it
5995    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5996    it's impossible to distinguish some coding systems accurately
5997    because they use the same range of codes.  So, at first, coding
5998    systems are grouped into the following categories:
5999
6000    o coding-category-emacs-mule
6001
6002         The category for a coding system which has the same code range
6003         as Emacs' internal format.  Assigned the coding-system (Lisp
6004         symbol) `emacs-mule' by default.
6005
6006    o coding-category-sjis
6007
6008         The category for a coding system which has the same code range
6009         as SJIS.  Assigned the coding-system (Lisp
6010         symbol) `japanese-shift-jis' by default.
6011
6012    o coding-category-iso-7
6013
6014         The category for a coding system which has the same code range
6015         as ISO2022 of 7-bit environment.  This doesn't use any locking
6016         shift and single shift functions.  This can encode/decode all
6017         charsets.  Assigned the coding-system (Lisp symbol)
6018         `iso-2022-7bit' by default.
6019
6020    o coding-category-iso-7-tight
6021
6022         Same as coding-category-iso-7 except that this can
6023         encode/decode only the specified charsets.
6024
6025    o coding-category-iso-8-1
6026
6027         The category for a coding system which has the same code range
6028         as ISO2022 of 8-bit environment and graphic plane 1 used only
6029         for DIMENSION1 charset.  This doesn't use any locking shift
6030         and single shift functions.  Assigned the coding-system (Lisp
6031         symbol) `iso-latin-1' by default.
6032
6033    o coding-category-iso-8-2
6034
6035         The category for a coding system which has the same code range
6036         as ISO2022 of 8-bit environment and graphic plane 1 used only
6037         for DIMENSION2 charset.  This doesn't use any locking shift
6038         and single shift functions.  Assigned the coding-system (Lisp
6039         symbol) `japanese-iso-8bit' by default.
6040
6041    o coding-category-iso-7-else
6042
6043         The category for a coding system which has the same code range
6044         as ISO2022 of 7-bit environment but uses locking shift or
6045         single shift functions.  Assigned the coding-system (Lisp
6046         symbol) `iso-2022-7bit-lock' by default.
6047
6048    o coding-category-iso-8-else
6049
6050         The category for a coding system which has the same code range
6051         as ISO2022 of 8-bit environment but uses locking shift or
6052         single shift functions.  Assigned the coding-system (Lisp
6053         symbol) `iso-2022-8bit-ss2' by default.
6054
6055    o coding-category-big5
6056
6057         The category for a coding system which has the same code range
6058         as BIG5.  Assigned the coding-system (Lisp symbol)
6059         `cn-big5' by default.
6060
6061    o coding-category-utf-8
6062
6063         The category for a coding system which has the same code range
6064         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6065         symbol) `utf-8' by default.
6066
6067    o coding-category-utf-16-be
6068
6069         The category for a coding system in which a text has an
6070         Unicode signature (cf. Unicode Standard) in the order of BIG
6071         endian at the head.  Assigned the coding-system (Lisp symbol)
6072         `utf-16-be' by default.
6073
6074    o coding-category-utf-16-le
6075
6076         The category for a coding system in which a text has an
6077         Unicode signature (cf. Unicode Standard) in the order of
6078         LITTLE endian at the head.  Assigned the coding-system (Lisp
6079         symbol) `utf-16-le' by default.
6080
6081    o coding-category-ccl
6082
6083         The category for a coding system of which encoder/decoder is
6084         written in CCL programs.  The default value is nil, i.e., no
6085         coding system is assigned.
6086
6087    o coding-category-binary
6088
6089         The category for a coding system not categorized in any of the
6090         above.  Assigned the coding-system (Lisp symbol)
6091         `no-conversion' by default.
6092
6093    Each of them is a Lisp symbol and the value is an actual
6094    `coding-system's (this is also a Lisp symbol) assigned by a user.
6095    What Emacs does actually is to detect a category of coding system.
6096    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6097    decide only one possible category, it selects a category of the
6098    highest priority.  Priorities of categories are also specified by a
6099    user in a Lisp variable `coding-category-list'.
6100
6101 */
6102
6103 #define EOL_SEEN_NONE   0
6104 #define EOL_SEEN_LF     1
6105 #define EOL_SEEN_CR     2
6106 #define EOL_SEEN_CRLF   4
6107
6108 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6109    SOURCE is encoded.  If CATEGORY is one of
6110    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6111    two-byte, else they are encoded by one-byte.
6112
6113    Return one of EOL_SEEN_XXX.  */
6114
6115 #define MAX_EOL_CHECK_COUNT 3
6116
6117 static int
6118 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6119             enum coding_category category)
6120 {
6121   const unsigned char *src = source, *src_end = src + src_bytes;
6122   unsigned char c;
6123   int total  = 0;
6124   int eol_seen = EOL_SEEN_NONE;
6125
6126   if ((1 << category) & CATEGORY_MASK_UTF_16)
6127     {
6128       bool msb = category == (coding_category_utf_16_le
6129                               | coding_category_utf_16_le_nosig);
6130       bool lsb = !msb;
6131
6132       while (src + 1 < src_end)
6133         {
6134           c = src[lsb];
6135           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6136             {
6137               int this_eol;
6138
6139               if (c == '\n')
6140                 this_eol = EOL_SEEN_LF;
6141               else if (src + 3 >= src_end
6142                        || src[msb + 2] != 0
6143                        || src[lsb + 2] != '\n')
6144                 this_eol = EOL_SEEN_CR;
6145               else
6146                 {
6147                   this_eol = EOL_SEEN_CRLF;
6148                   src += 2;
6149                 }
6150
6151               if (eol_seen == EOL_SEEN_NONE)
6152                 /* This is the first end-of-line.  */
6153                 eol_seen = this_eol;
6154               else if (eol_seen != this_eol)
6155                 {
6156                   /* The found type is different from what found before.
6157                      Allow for stray ^M characters in DOS EOL files.  */
6158                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6159                       || (eol_seen == EOL_SEEN_CRLF
6160                           && this_eol == EOL_SEEN_CR))
6161                     eol_seen = EOL_SEEN_CRLF;
6162                   else
6163                     {
6164                       eol_seen = EOL_SEEN_LF;
6165                       break;
6166                     }
6167                 }
6168               if (++total == MAX_EOL_CHECK_COUNT)
6169                 break;
6170             }
6171           src += 2;
6172         }
6173     }
6174   else
6175     while (src < src_end)
6176       {
6177         c = *src++;
6178         if (c == '\n' || c == '\r')
6179           {
6180             int this_eol;
6181
6182             if (c == '\n')
6183               this_eol = EOL_SEEN_LF;
6184             else if (src >= src_end || *src != '\n')
6185               this_eol = EOL_SEEN_CR;
6186             else
6187               this_eol = EOL_SEEN_CRLF, src++;
6188
6189             if (eol_seen == EOL_SEEN_NONE)
6190               /* This is the first end-of-line.  */
6191               eol_seen = this_eol;
6192             else if (eol_seen != this_eol)
6193               {
6194                 /* The found type is different from what found before.
6195                    Allow for stray ^M characters in DOS EOL files.  */
6196                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6197                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6198                   eol_seen = EOL_SEEN_CRLF;
6199                 else
6200                   {
6201                     eol_seen = EOL_SEEN_LF;
6202                     break;
6203                   }
6204               }
6205             if (++total == MAX_EOL_CHECK_COUNT)
6206               break;
6207           }
6208       }
6209   return eol_seen;
6210 }
6211
6212
6213 static Lisp_Object
6214 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6215 {
6216   Lisp_Object eol_type;
6217
6218   eol_type = CODING_ID_EOL_TYPE (coding->id);
6219   if (eol_seen & EOL_SEEN_LF)
6220     {
6221       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6222       eol_type = Qunix;
6223     }
6224   else if (eol_seen & EOL_SEEN_CRLF)
6225     {
6226       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6227       eol_type = Qdos;
6228     }
6229   else if (eol_seen & EOL_SEEN_CR)
6230     {
6231       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6232       eol_type = Qmac;
6233     }
6234   return eol_type;
6235 }
6236
6237 /* Detect how a text specified in CODING is encoded.  If a coding
6238    system is detected, update fields of CODING by the detected coding
6239    system.  */
6240
6241 static void
6242 detect_coding (struct coding_system *coding)
6243 {
6244   const unsigned char *src, *src_end;
6245   unsigned int saved_mode = coding->mode;
6246
6247   coding->consumed = coding->consumed_char = 0;
6248   coding->produced = coding->produced_char = 0;
6249   coding_set_source (coding);
6250
6251   src_end = coding->source + coding->src_bytes;
6252   coding->head_ascii = 0;
6253
6254   /* If we have not yet decided the text encoding type, detect it
6255      now.  */
6256   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6257     {
6258       int c, i;
6259       struct coding_detection_info detect_info;
6260       bool null_byte_found = 0, eight_bit_found = 0;
6261
6262       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6263       for (src = coding->source; src < src_end; src++)
6264         {
6265           c = *src;
6266           if (c & 0x80)
6267             {
6268               eight_bit_found = 1;
6269               if (null_byte_found)
6270                 break;
6271             }
6272           else if (c < 0x20)
6273             {
6274               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6275                   && ! inhibit_iso_escape_detection
6276                   && ! detect_info.checked)
6277                 {
6278                   if (detect_coding_iso_2022 (coding, &detect_info))
6279                     {
6280                       /* We have scanned the whole data.  */
6281                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6282                         {
6283                           /* We didn't find an 8-bit code.  We may
6284                              have found a null-byte, but it's very
6285                              rare that a binary file conforms to
6286                              ISO-2022.  */
6287                           src = src_end;
6288                           coding->head_ascii = src - coding->source;
6289                         }
6290                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6291                       break;
6292                     }
6293                 }
6294               else if (! c && !inhibit_null_byte_detection)
6295                 {
6296                   null_byte_found = 1;
6297                   if (eight_bit_found)
6298                     break;
6299                 }
6300               if (! eight_bit_found)
6301                 coding->head_ascii++;
6302             }
6303           else if (! eight_bit_found)
6304             coding->head_ascii++;
6305         }
6306
6307       if (null_byte_found || eight_bit_found
6308           || coding->head_ascii < coding->src_bytes
6309           || detect_info.found)
6310         {
6311           enum coding_category category;
6312           struct coding_system *this;
6313
6314           if (coding->head_ascii == coding->src_bytes)
6315             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6316             for (i = 0; i < coding_category_raw_text; i++)
6317               {
6318                 category = coding_priorities[i];
6319                 this = coding_categories + category;
6320                 if (detect_info.found & (1 << category))
6321                   break;
6322               }
6323           else
6324             {
6325               if (null_byte_found)
6326                 {
6327                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6328                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6329                 }
6330               for (i = 0; i < coding_category_raw_text; i++)
6331                 {
6332                   category = coding_priorities[i];
6333                   this = coding_categories + category;
6334                   if (this->id < 0)
6335                     {
6336                       /* No coding system of this category is defined.  */
6337                       detect_info.rejected |= (1 << category);
6338                     }
6339                   else if (category >= coding_category_raw_text)
6340                     continue;
6341                   else if (detect_info.checked & (1 << category))
6342                     {
6343                       if (detect_info.found & (1 << category))
6344                         break;
6345                     }
6346                   else if ((*(this->detector)) (coding, &detect_info)
6347                            && detect_info.found & (1 << category))
6348                     {
6349                       if (category == coding_category_utf_16_auto)
6350                         {
6351                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6352                             category = coding_category_utf_16_le;
6353                           else
6354                             category = coding_category_utf_16_be;
6355                         }
6356                       break;
6357                     }
6358                 }
6359             }
6360
6361           if (i < coding_category_raw_text)
6362             setup_coding_system (CODING_ID_NAME (this->id), coding);
6363           else if (null_byte_found)
6364             setup_coding_system (Qno_conversion, coding);
6365           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6366                    == CATEGORY_MASK_ANY)
6367             setup_coding_system (Qraw_text, coding);
6368           else if (detect_info.rejected)
6369             for (i = 0; i < coding_category_raw_text; i++)
6370               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6371                 {
6372                   this = coding_categories + coding_priorities[i];
6373                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6374                   break;
6375                 }
6376         }
6377     }
6378   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6379            == coding_category_utf_8_auto)
6380     {
6381       Lisp_Object coding_systems;
6382       struct coding_detection_info detect_info;
6383
6384       coding_systems
6385         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6386       detect_info.found = detect_info.rejected = 0;
6387       coding->head_ascii = 0;
6388       if (CONSP (coding_systems)
6389           && detect_coding_utf_8 (coding, &detect_info))
6390         {
6391           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6392             setup_coding_system (XCAR (coding_systems), coding);
6393           else
6394             setup_coding_system (XCDR (coding_systems), coding);
6395         }
6396     }
6397   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6398            == coding_category_utf_16_auto)
6399     {
6400       Lisp_Object coding_systems;
6401       struct coding_detection_info detect_info;
6402
6403       coding_systems
6404         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6405       detect_info.found = detect_info.rejected = 0;
6406       coding->head_ascii = 0;
6407       if (CONSP (coding_systems)
6408           && detect_coding_utf_16 (coding, &detect_info))
6409         {
6410           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6411             setup_coding_system (XCAR (coding_systems), coding);
6412           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6413             setup_coding_system (XCDR (coding_systems), coding);
6414         }
6415     }
6416   coding->mode = saved_mode;
6417 }
6418
6419
6420 static void
6421 decode_eol (struct coding_system *coding)
6422 {
6423   Lisp_Object eol_type;
6424   unsigned char *p, *pbeg, *pend;
6425
6426   eol_type = CODING_ID_EOL_TYPE (coding->id);
6427   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6428     return;
6429
6430   if (NILP (coding->dst_object))
6431     pbeg = coding->destination;
6432   else
6433     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6434   pend = pbeg + coding->produced;
6435
6436   if (VECTORP (eol_type))
6437     {
6438       int eol_seen = EOL_SEEN_NONE;
6439
6440       for (p = pbeg; p < pend; p++)
6441         {
6442           if (*p == '\n')
6443             eol_seen |= EOL_SEEN_LF;
6444           else if (*p == '\r')
6445             {
6446               if (p + 1 < pend && *(p + 1) == '\n')
6447                 {
6448                   eol_seen |= EOL_SEEN_CRLF;
6449                   p++;
6450                 }
6451               else
6452                 eol_seen |= EOL_SEEN_CR;
6453             }
6454         }
6455       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6456       if ((eol_seen & EOL_SEEN_CRLF) != 0
6457           && (eol_seen & EOL_SEEN_CR) != 0
6458           && (eol_seen & EOL_SEEN_LF) == 0)
6459         eol_seen = EOL_SEEN_CRLF;
6460       else if (eol_seen != EOL_SEEN_NONE
6461           && eol_seen != EOL_SEEN_LF
6462           && eol_seen != EOL_SEEN_CRLF
6463           && eol_seen != EOL_SEEN_CR)
6464         eol_seen = EOL_SEEN_LF;
6465       if (eol_seen != EOL_SEEN_NONE)
6466         eol_type = adjust_coding_eol_type (coding, eol_seen);
6467     }
6468
6469   if (EQ (eol_type, Qmac))
6470     {
6471       for (p = pbeg; p < pend; p++)
6472         if (*p == '\r')
6473           *p = '\n';
6474     }
6475   else if (EQ (eol_type, Qdos))
6476     {
6477       ptrdiff_t n = 0;
6478
6479       if (NILP (coding->dst_object))
6480         {
6481           /* Start deleting '\r' from the tail to minimize the memory
6482              movement.  */
6483           for (p = pend - 2; p >= pbeg; p--)
6484             if (*p == '\r')
6485               {
6486                 memmove (p, p + 1, pend-- - p - 1);
6487                 n++;
6488               }
6489         }
6490       else
6491         {
6492           ptrdiff_t pos_byte = coding->dst_pos_byte;
6493           ptrdiff_t pos = coding->dst_pos;
6494           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6495
6496           while (pos < pos_end)
6497             {
6498               p = BYTE_POS_ADDR (pos_byte);
6499               if (*p == '\r' && p[1] == '\n')
6500                 {
6501                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6502                   n++;
6503                   pos_end--;
6504                 }
6505               pos++;
6506               if (coding->dst_multibyte)
6507                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6508               else
6509                 pos_byte++;
6510             }
6511         }
6512       coding->produced -= n;
6513       coding->produced_char -= n;
6514     }
6515 }
6516
6517
6518 /* Return a translation table (or list of them) from coding system
6519    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6520    not ENCODEP). */
6521
6522 static Lisp_Object
6523 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6524 {
6525   Lisp_Object standard, translation_table;
6526   Lisp_Object val;
6527
6528   if (NILP (Venable_character_translation))
6529     {
6530       if (max_lookup)
6531         *max_lookup = 0;
6532       return Qnil;
6533     }
6534   if (encodep)
6535     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6536       standard = Vstandard_translation_table_for_encode;
6537   else
6538     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6539       standard = Vstandard_translation_table_for_decode;
6540   if (NILP (translation_table))
6541     translation_table = standard;
6542   else
6543     {
6544       if (SYMBOLP (translation_table))
6545         translation_table = Fget (translation_table, Qtranslation_table);
6546       else if (CONSP (translation_table))
6547         {
6548           translation_table = Fcopy_sequence (translation_table);
6549           for (val = translation_table; CONSP (val); val = XCDR (val))
6550             if (SYMBOLP (XCAR (val)))
6551               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6552         }
6553       if (CHAR_TABLE_P (standard))
6554         {
6555           if (CONSP (translation_table))
6556             translation_table = nconc2 (translation_table,
6557                                         Fcons (standard, Qnil));
6558           else
6559             translation_table = Fcons (translation_table,
6560                                        Fcons (standard, Qnil));
6561         }
6562     }
6563
6564   if (max_lookup)
6565     {
6566       *max_lookup = 1;
6567       if (CHAR_TABLE_P (translation_table)
6568           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6569         {
6570           val = XCHAR_TABLE (translation_table)->extras[1];
6571           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6572             *max_lookup = XFASTINT (val);
6573         }
6574       else if (CONSP (translation_table))
6575         {
6576           Lisp_Object tail;
6577
6578           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6579             if (CHAR_TABLE_P (XCAR (tail))
6580                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6581               {
6582                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6583                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6584                   *max_lookup = XFASTINT (tailval);
6585               }
6586         }
6587     }
6588   return translation_table;
6589 }
6590
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables; any other TABLE yields no translation.

   When an entry is itself a character, C is replaced by that
   character and TRANS is set to Qnil; for a list, the lookup then
   goes on in the following tables with the new C.  Any other entry is
   stored in TRANS (Qnil when there is none); for a list, the lookup
   stops at the first table that yields such a non-nil entry.

   C and TRANS must be lvalues.  TABLE is evaluated exactly once, and
   the names of the local variables end in `_' so that they cannot
   capture identifiers appearing in the arguments.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    {                                                                   \
      Lisp_Object lookup_table_ = (table);                              \
                                                                        \
      if (CHAR_TABLE_P (lookup_table_))                                 \
        {                                                               \
          (trans) = CHAR_TABLE_REF (lookup_table_, (c));                \
          if (CHARACTERP (trans))                                       \
            {                                                           \
              (c) = XFASTINT (trans);                                   \
              (trans) = Qnil;                                           \
            }                                                           \
        }                                                               \
      else                                                              \
        {                                                               \
          Lisp_Object lookup_tail_;                                     \
                                                                        \
          for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);      \
               lookup_tail_ = XCDR (lookup_tail_))                      \
            if (CHAR_TABLE_P (XCAR (lookup_tail_)))                     \
              {                                                         \
                (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));    \
                if (CHARACTERP (trans))                                 \
                  {                                                     \
                    (c) = XFASTINT (trans);                             \
                    (trans) = Qnil;                                     \
                  }                                                     \
                else if (! NILP (trans))                                \
                  break;                                                \
              }                                                         \
        }                                                               \
    }                                                                   \
  } while (0)
6615
6616
6617 /* Return a translation of character(s) at BUF according to TRANS.
6618    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6619    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6620    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6621    translation is found, and Qnil if not found..
6622    If BUF is too short to lookup characters in FROM, return Qt.  */
6623
6624 static Lisp_Object
6625 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6626 {
6627
6628   if (INTEGERP (trans))
6629     return trans;
6630   for (; CONSP (trans); trans = XCDR (trans))
6631     {
6632       Lisp_Object val = XCAR (trans);
6633       Lisp_Object from = XCAR (val);
6634       ptrdiff_t len = ASIZE (from);
6635       ptrdiff_t i;
6636
6637       for (i = 0; i < len; i++)
6638         {
6639           if (buf + i == buf_end)
6640             return Qt;
6641           if (XINT (AREF (from, i)) != buf[i])
6642             break;
6643         }
6644       if (i == len)
6645         return val;
6646     }
6647   return Qnil;
6648 }
6649
6650
6651 static int
6652 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6653                bool last_block)
6654 {
6655   unsigned char *dst = coding->destination + coding->produced;
6656   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6657   ptrdiff_t produced;
6658   ptrdiff_t produced_chars = 0;
6659   int carryover = 0;
6660
6661   if (! coding->chars_at_source)
6662     {
6663       /* Source characters are in coding->charbuf.  */
6664       int *buf = coding->charbuf;
6665       int *buf_end = buf + coding->charbuf_used;
6666
6667       if (EQ (coding->src_object, coding->dst_object))
6668         {
6669           coding_set_source (coding);
6670           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6671         }
6672
6673       while (buf < buf_end)
6674         {
6675           int c = *buf;
6676           ptrdiff_t i;
6677
6678           if (c >= 0)
6679             {
6680               ptrdiff_t from_nchars = 1, to_nchars = 1;
6681               Lisp_Object trans = Qnil;
6682
6683               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6684               if (! NILP (trans))
6685                 {
6686                   trans = get_translation (trans, buf, buf_end);
6687                   if (INTEGERP (trans))
6688                     c = XINT (trans);
6689                   else if (CONSP (trans))
6690                     {
6691                       from_nchars = ASIZE (XCAR (trans));
6692                       trans = XCDR (trans);
6693                       if (INTEGERP (trans))
6694                         c = XINT (trans);
6695                       else
6696                         {
6697                           to_nchars = ASIZE (trans);
6698                           c = XINT (AREF (trans, 0));
6699                         }
6700                     }
6701                   else if (EQ (trans, Qt) && ! last_block)
6702                     break;
6703                 }
6704
6705               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6706                 {
6707                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6708                        / MAX_MULTIBYTE_LENGTH)
6709                       < to_nchars)
6710                     memory_full (SIZE_MAX);
6711                   dst = alloc_destination (coding,
6712                                            buf_end - buf
6713                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6714                                            dst);
6715                   if (EQ (coding->src_object, coding->dst_object))
6716                     {
6717                       coding_set_source (coding);
6718                       dst_end = (((unsigned char *) coding->source)
6719                                  + coding->consumed);
6720                     }
6721                   else
6722                     dst_end = coding->destination + coding->dst_bytes;
6723                 }
6724
6725               for (i = 0; i < to_nchars; i++)
6726                 {
6727                   if (i > 0)
6728                     c = XINT (AREF (trans, i));
6729                   if (coding->dst_multibyte
6730                       || ! CHAR_BYTE8_P (c))
6731                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6732                   else
6733                     *dst++ = CHAR_TO_BYTE8 (c);
6734                 }
6735               produced_chars += to_nchars;
6736               buf += from_nchars;
6737             }
6738           else
6739             /* This is an annotation datum.  (-C) is the length.  */
6740             buf += -c;
6741         }
6742       carryover = buf_end - buf;
6743     }
6744   else
6745     {
6746       /* Source characters are at coding->source.  */
6747       const unsigned char *src = coding->source;
6748       const unsigned char *src_end = src + coding->consumed;
6749
6750       if (EQ (coding->dst_object, coding->src_object))
6751         dst_end = (unsigned char *) src;
6752       if (coding->src_multibyte != coding->dst_multibyte)
6753         {
6754           if (coding->src_multibyte)
6755             {
6756               bool multibytep = 1;
6757               ptrdiff_t consumed_chars = 0;
6758
6759               while (1)
6760                 {
6761                   const unsigned char *src_base = src;
6762                   int c;
6763
6764                   ONE_MORE_BYTE (c);
6765                   if (dst == dst_end)
6766                     {
6767                       if (EQ (coding->src_object, coding->dst_object))
6768                         dst_end = (unsigned char *) src;
6769                       if (dst == dst_end)
6770                         {
6771                           ptrdiff_t offset = src - coding->source;
6772
6773                           dst = alloc_destination (coding, src_end - src + 1,
6774                                                    dst);
6775                           dst_end = coding->destination + coding->dst_bytes;
6776                           coding_set_source (coding);
6777                           src = coding->source + offset;
6778                           src_end = coding->source + coding->consumed;
6779                           if (EQ (coding->src_object, coding->dst_object))
6780                             dst_end = (unsigned char *) src;
6781                         }
6782                     }
6783                   *dst++ = c;
6784                   produced_chars++;
6785                 }
6786             no_more_source:
6787               ;
6788             }
6789           else
6790             while (src < src_end)
6791               {
6792                 bool multibytep = 1;
6793                 int c = *src++;
6794
6795                 if (dst >= dst_end - 1)
6796                   {
6797                     if (EQ (coding->src_object, coding->dst_object))
6798                       dst_end = (unsigned char *) src;
6799                     if (dst >= dst_end - 1)
6800                       {
6801                         ptrdiff_t offset = src - coding->source;
6802                         ptrdiff_t more_bytes;
6803
6804                         if (EQ (coding->src_object, coding->dst_object))
6805                           more_bytes = ((src_end - src) / 2) + 2;
6806                         else
6807                           more_bytes = src_end - src + 2;
6808                         dst = alloc_destination (coding, more_bytes, dst);
6809                         dst_end = coding->destination + coding->dst_bytes;
6810                         coding_set_source (coding);
6811                         src = coding->source + offset;
6812                         src_end = coding->source + coding->consumed;
6813                         if (EQ (coding->src_object, coding->dst_object))
6814                           dst_end = (unsigned char *) src;
6815                       }
6816                   }
6817                 EMIT_ONE_BYTE (c);
6818               }
6819         }
6820       else
6821         {
6822           if (!EQ (coding->src_object, coding->dst_object))
6823             {
6824               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6825
6826               if (require > 0)
6827                 {
6828                   ptrdiff_t offset = src - coding->source;
6829
6830                   dst = alloc_destination (coding, require, dst);
6831                   coding_set_source (coding);
6832                   src = coding->source + offset;
6833                   src_end = coding->source + coding->consumed;
6834                 }
6835             }
6836           produced_chars = coding->consumed_char;
6837           while (src < src_end)
6838             *dst++ = *src++;
6839         }
6840     }
6841
6842   produced = dst - (coding->destination + coding->produced);
6843   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6844     insert_from_gap (produced_chars, produced);
6845   coding->produced += produced;
6846   coding->produced_char += produced_chars;
6847   return carryover;
6848 }
6849
6850 /* Compose text in CODING->object according to the annotation data at
6851    CHARBUF.  CHARBUF is an array:
6852      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6853  */
6854
6855 static inline void
6856 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6857 {
6858   int len;
6859   ptrdiff_t to;
6860   enum composition_method method;
6861   Lisp_Object components;
6862
6863   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6864   to = pos + charbuf[2];
6865   method = (enum composition_method) (charbuf[4]);
6866
6867   if (method == COMPOSITION_RELATIVE)
6868     components = Qnil;
6869   else
6870     {
6871       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6872       int i, j;
6873
6874       if (method == COMPOSITION_WITH_RULE)
6875         len = charbuf[2] * 3 - 2;
6876       charbuf += MAX_ANNOTATION_LENGTH;
6877       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6878       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6879         {
6880           if (charbuf[i] >= 0)
6881             args[j] = make_number (charbuf[i]);
6882           else
6883             {
6884               i++;
6885               args[j] = make_number (charbuf[i] % 0x100);
6886             }
6887         }
6888       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6889     }
6890   compose_text (pos, to, components, Qnil, coding->dst_object);
6891 }
6892
6893
6894 /* Put `charset' property on text in CODING->object according to
6895    the annotation data at CHARBUF.  CHARBUF is an array:
6896      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6897  */
6898
6899 static inline void
6900 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6901 {
6902   ptrdiff_t from = pos - charbuf[2];
6903   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6904
6905   Fput_text_property (make_number (from), make_number (pos),
6906                       Qcharset, CHARSET_NAME (charset),
6907                       coding->dst_object);
6908 }
6909
6910
/* Number of `int' elements in the work area of a conversion.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the work area CODING->charbuf, CHARBUF_SIZE elements long,
   and record its size in CODING->charbuf_size.  The area is obtained
   with alloca, so it lives on the stack of the calling function and
   disappears when that function returns.

   alloca does not report failure by returning a null pointer, so
   there is nothing to check here and this macro never returns from
   the caller.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = alloca (sizeof (int) * CHARBUF_SIZE);           \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
6932
6933
6934 static void
6935 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6936 {
6937   int *charbuf = coding->charbuf;
6938   int *charbuf_end = charbuf + coding->charbuf_used;
6939
6940   if (NILP (coding->dst_object))
6941     return;
6942
6943   while (charbuf < charbuf_end)
6944     {
6945       if (*charbuf >= 0)
6946         pos++, charbuf++;
6947       else
6948         {
6949           int len = -*charbuf;
6950
6951           if (len > 2)
6952             switch (charbuf[1])
6953               {
6954               case CODING_ANNOTATE_COMPOSITION_MASK:
6955                 produce_composition (coding, charbuf, pos);
6956                 break;
6957               case CODING_ANNOTATE_CHARSET_MASK:
6958                 produce_charset (coding, charbuf, pos);
6959                 break;
6960               }
6961           charbuf += len;
6962         }
6963     }
6964 }
6965
6966 /* Decode the data at CODING->src_object into CODING->dst_object.
6967    CODING->src_object is a buffer, a string, or nil.
6968    CODING->dst_object is a buffer.
6969
6970    If CODING->src_object is a buffer, it must be the current buffer.
6971    In this case, if CODING->src_pos is positive, it is a position of
6972    the source text in the buffer, otherwise, the source text is in the
6973    gap area of the buffer, and CODING->src_pos specifies the offset of
6974    the text from GPT (which must be the same as PT).  If this is the
6975    same buffer as CODING->dst_object, CODING->src_pos must be
6976    negative.
6977
6978    If CODING->src_object is a string, CODING->src_pos is an index to
6979    that string.
6980
6981    If CODING->src_object is nil, CODING->source must already point to
6982    the non-relocatable memory area.  In this case, CODING->src_pos is
6983    an offset from CODING->source.
6984
6985    The decoded data is inserted at the current point of the buffer
6986    CODING->dst_object.
6987 */
6988
6989 static void
6990 decode_coding (struct coding_system *coding)
6991 {
6992   Lisp_Object attrs;
6993   Lisp_Object undo_list;
6994   Lisp_Object translation_table;
6995   struct ccl_spec cclspec;
6996   int carryover;
6997   int i;
6998
6999   if (BUFFERP (coding->src_object)
7000       && coding->src_pos > 0
7001       && coding->src_pos < GPT
7002       && coding->src_pos + coding->src_chars > GPT)
7003     move_gap_both (coding->src_pos, coding->src_pos_byte);
7004
7005   undo_list = Qt;
7006   if (BUFFERP (coding->dst_object))
7007     {
7008       set_buffer_internal (XBUFFER (coding->dst_object));
7009       if (GPT != PT)
7010         move_gap_both (PT, PT_BYTE);
7011
7012       /* We must disable undo_list in order to record the whole insert
7013          transaction via record_insert at the end.  But doing so also
7014          disables the recording of the first change to the undo_list.
7015          Therefore we check for first change here and record it via
7016          record_first_change if needed.  */
7017       if (MODIFF <= SAVE_MODIFF)
7018         record_first_change ();
7019
7020       undo_list = BVAR (current_buffer, undo_list);
7021       bset_undo_list (current_buffer, Qt);
7022     }
7023
7024   coding->consumed = coding->consumed_char = 0;
7025   coding->produced = coding->produced_char = 0;
7026   coding->chars_at_source = 0;
7027   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7028   coding->errors = 0;
7029
7030   ALLOC_CONVERSION_WORK_AREA (coding);
7031
7032   attrs = CODING_ID_ATTRS (coding->id);
7033   translation_table = get_translation_table (attrs, 0, NULL);
7034
7035   carryover = 0;
7036   if (coding->decoder == decode_coding_ccl)
7037     {
7038       coding->spec.ccl = &cclspec;
7039       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7040     }
7041   do
7042     {
7043       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7044
7045       coding_set_source (coding);
7046       coding->annotated = 0;
7047       coding->charbuf_used = carryover;
7048       (*(coding->decoder)) (coding);
7049       coding_set_destination (coding);
7050       carryover = produce_chars (coding, translation_table, 0);
7051       if (coding->annotated)
7052         produce_annotation (coding, pos);
7053       for (i = 0; i < carryover; i++)
7054         coding->charbuf[i]
7055           = coding->charbuf[coding->charbuf_used - carryover + i];
7056     }
7057   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7058          || (coding->consumed < coding->src_bytes
7059              && (coding->result == CODING_RESULT_SUCCESS
7060                  || coding->result == CODING_RESULT_INVALID_SRC)));
7061
7062   if (carryover > 0)
7063     {
7064       coding_set_destination (coding);
7065       coding->charbuf_used = carryover;
7066       produce_chars (coding, translation_table, 1);
7067     }
7068
7069   coding->carryover_bytes = 0;
7070   if (coding->consumed < coding->src_bytes)
7071     {
7072       int nbytes = coding->src_bytes - coding->consumed;
7073       const unsigned char *src;
7074
7075       coding_set_source (coding);
7076       coding_set_destination (coding);
7077       src = coding->source + coding->consumed;
7078
7079       if (coding->mode & CODING_MODE_LAST_BLOCK)
7080         {
7081           /* Flush out unprocessed data as binary chars.  We are sure
7082              that the number of data is less than the size of
7083              coding->charbuf.  */
7084           coding->charbuf_used = 0;
7085           coding->chars_at_source = 0;
7086
7087           while (nbytes-- > 0)
7088             {
7089               int c = *src++;
7090
7091               if (c & 0x80)
7092                 c = BYTE8_TO_CHAR (c);
7093               coding->charbuf[coding->charbuf_used++] = c;
7094             }
7095           produce_chars (coding, Qnil, 1);
7096         }
7097       else
7098         {
7099           /* Record unprocessed bytes in coding->carryover.  We are
7100              sure that the number of data is less than the size of
7101              coding->carryover.  */
7102           unsigned char *p = coding->carryover;
7103
7104           if (nbytes > sizeof coding->carryover)
7105             nbytes = sizeof coding->carryover;
7106           coding->carryover_bytes = nbytes;
7107           while (nbytes-- > 0)
7108             *p++ = *src++;
7109         }
7110       coding->consumed = coding->src_bytes;
7111     }
7112
7113   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7114       && !inhibit_eol_conversion)
7115     decode_eol (coding);
7116   if (BUFFERP (coding->dst_object))
7117     {
7118       bset_undo_list (current_buffer, undo_list);
7119       record_insert (coding->dst_pos, coding->produced_char);
7120     }
7121 }
7122
7123
7124 /* Extract an annotation datum from a composition starting at POS and
7125    ending before LIMIT of CODING->src_object (buffer or string), store
7126    the data in BUF, set *STOP to a starting position of the next
7127    composition (if any) or to LIMIT, and return the address of the
7128    next element of BUF.
7129
7130    If such an annotation is not found, set *STOP to a starting
7131    position of a composition after POS (if any) or to LIMIT, and
7132    return BUF.  */
7133
7134 static inline int *
7135 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7136                                struct coding_system *coding, int *buf,
7137                                ptrdiff_t *stop)
7138 {
7139   ptrdiff_t start, end;
7140   Lisp_Object prop;
7141
7142   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7143       || end > limit)
7144     *stop = limit;
7145   else if (start > pos)
7146     *stop = start;
7147   else
7148     {
7149       if (start == pos)
7150         {
7151           /* We found a composition.  Store the corresponding
7152              annotation data in BUF.  */
7153           int *head = buf;
7154           enum composition_method method = COMPOSITION_METHOD (prop);
7155           int nchars = COMPOSITION_LENGTH (prop);
7156
7157           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7158           if (method != COMPOSITION_RELATIVE)
7159             {
7160               Lisp_Object components;
7161               ptrdiff_t i, len, i_byte;
7162
7163               components = COMPOSITION_COMPONENTS (prop);
7164               if (VECTORP (components))
7165                 {
7166                   len = ASIZE (components);
7167                   for (i = 0; i < len; i++)
7168                     *buf++ = XINT (AREF (components, i));
7169                 }
7170               else if (STRINGP (components))
7171                 {
7172                   len = SCHARS (components);
7173                   i = i_byte = 0;
7174                   while (i < len)
7175                     {
7176                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7177                       buf++;
7178                     }
7179                 }
7180               else if (INTEGERP (components))
7181                 {
7182                   len = 1;
7183                   *buf++ = XINT (components);
7184                 }
7185               else if (CONSP (components))
7186                 {
7187                   for (len = 0; CONSP (components);
7188                        len++, components = XCDR (components))
7189                     *buf++ = XINT (XCAR (components));
7190                 }
7191               else
7192                 emacs_abort ();
7193               *head -= len;
7194             }
7195         }
7196
7197       if (find_composition (end, limit, &start, &end, &prop,
7198                             coding->src_object)
7199           && end <= limit)
7200         *stop = start;
7201       else
7202         *stop = limit;
7203     }
7204   return buf;
7205 }
7206
7207
7208 /* Extract an annotation datum from a text property `charset' at POS of
7209    CODING->src_object (buffer of string), store the data in BUF, set
7210    *STOP to the position where the value of `charset' property changes
7211    (limiting by LIMIT), and return the address of the next element of
7212    BUF.
7213
7214    If the property value is nil, set *STOP to the position where the
7215    property value is non-nil (limiting by LIMIT), and return BUF.  */
7216
7217 static inline int *
7218 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7219                            struct coding_system *coding, int *buf,
7220                            ptrdiff_t *stop)
7221 {
7222   Lisp_Object val, next;
7223   int id;
7224
7225   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7226   if (! NILP (val) && CHARSETP (val))
7227     id = XINT (CHARSET_SYMBOL_ID (val));
7228   else
7229     id = -1;
7230   ADD_CHARSET_DATA (buf, 0, id);
7231   next = Fnext_single_property_change (make_number (pos), Qcharset,
7232                                        coding->src_object,
7233                                        make_number (limit));
7234   *stop = XINT (next);
7235   return buf;
7236 }
7237
7238
/* Fetch characters from the source text of CODING, starting after the
   CODING->consumed bytes (CODING->consumed_char characters) already
   handled, and store them as character codes in CODING->charbuf.

   While fetching, convert '\n' according to the end-of-line type of
   CODING (unless inhibit_eol_conversion is set), apply
   TRANSLATION_TABLE if it is non-nil, and add annotation data for
   `charset' text properties when CODING requests them.  MAX_LOOKUP is
   the number of characters that may be examined at once when looking
   up a translation; it is used only when TRANSLATION_TABLE is
   non-nil.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect what was fetched, and
   CODING->chars_at_source is cleared.  */

static void
consume_chars (struct coding_system *coding, Lisp_Object translation_table,
               int max_lookup)
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  ptrdiff_t pos = coding->src_pos + coding->consumed_char;
  ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
  bool multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  ptrdiff_t stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The look-ahead buffer is needed only when translating.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* A vector EOL type is handled like unix, i.e. '\n' is left alone.
     NOTE(review): presumably a vector means the EOL type is not yet
     decided -- confirm.  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  /* STOP is the next position at which annotations must be checked;
     it is the smaller of STOP_COMPOSITION and STOP_CHARSET.  Without
     a source object there are no text properties to look at.  */
  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          /* Reaching the end of the source is the normal way out of
             this loop.  */
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character C and advance SRC and POS.  */
      if (! multibytep)
        {
          int bytes;

          /* Unibyte source.  The raw-text and CCL encoders take each
             byte as is.  Otherwise, a byte sequence that forms a
             multibyte character is read as that character (POS then
             advances by its byte length), and any other byte is
             turned into a character by BYTE8_TO_CHAR.  */
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      /* With selective display, '\r' is treated like '\n'.  */
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* For dos, emit '\r' in front of the '\n'; for any
                 other non-unix type, replace the '\n' by '\r'.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* FROM_NCHARS source characters are replaced by TO_NCHARS
             characters.  */
          ptrdiff_t from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters
             (without consuming them) as the lookup key.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              /* TRANS is (FROM . TO), where FROM is a vector and TO
                 is either a single character or a vector of
                 characters.  */
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* Not enough room left for the whole replacement.
                     NOTE(review): SRC and POS have already advanced
                     past C here, so C looks as if it is dropped --
                     confirm.  */
                  if (buf_end - buf < to_nchars)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters covered by FROM.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7373
7374
7375 /* Encode the text at CODING->src_object into CODING->dst_object.
7376    CODING->src_object is a buffer or a string.
7377    CODING->dst_object is a buffer or nil.
7378
7379    If CODING->src_object is a buffer, it must be the current buffer.
7380    In this case, if CODING->src_pos is positive, it is a position of
7381    the source text in the buffer, otherwise. the source text is in the
7382    gap area of the buffer, and coding->src_pos specifies the offset of
7383    the text from GPT (which must be the same as PT).  If this is the
7384    same buffer as CODING->dst_object, CODING->src_pos must be
7385    negative and CODING should not have `pre-write-conversion'.
7386
7387    If CODING->src_object is a string, CODING should not have
7388    `pre-write-conversion'.
7389
7390    If CODING->dst_object is a buffer, the encoded data is inserted at
7391    the current point of that buffer.
7392
7393    If CODING->dst_object is nil, the encoded data is placed at the
7394    memory area specified by CODING->destination.  */
7395
7396 static void
7397 encode_coding (struct coding_system *coding)
7398 {
7399   Lisp_Object attrs;
7400   Lisp_Object translation_table;
7401   int max_lookup;
7402   struct ccl_spec cclspec;
7403
7404   attrs = CODING_ID_ATTRS (coding->id);
7405   if (coding->encoder == encode_coding_raw_text)
7406     translation_table = Qnil, max_lookup = 0;
7407   else
7408     translation_table = get_translation_table (attrs, 1, &max_lookup);
7409
7410   if (BUFFERP (coding->dst_object))
7411     {
7412       set_buffer_internal (XBUFFER (coding->dst_object));
7413       coding->dst_multibyte
7414         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7415     }
7416
7417   coding->consumed = coding->consumed_char = 0;
7418   coding->produced = coding->produced_char = 0;
7419   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7420   coding->errors = 0;
7421
7422   ALLOC_CONVERSION_WORK_AREA (coding);
7423
7424   if (coding->encoder == encode_coding_ccl)
7425     {
7426       coding->spec.ccl = &cclspec;
7427       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7428     }
7429   do {
7430     coding_set_source (coding);
7431     consume_chars (coding, translation_table, max_lookup);
7432     coding_set_destination (coding);
7433     (*(coding->encoder)) (coding);
7434   } while (coding->consumed_char < coding->src_chars);
7435
7436   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7437     insert_from_gap (coding->produced_char, coding->produced);
7438 }
7439
7440
/* Name (or base name) of work buffer for code conversion.  The
   reusable work buffer is created under exactly this name; any
   additional work buffer gets a new name generated from it.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed by the conversion code (and it is
   created anew if it is found not to be live).  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   make_conversion_work_buffer and cleared by
   code_conversion_restore.  */
static bool reused_workbuf_in_use;
7453
7454
7455 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7456    multibyteness of returning buffer.  */
7457
7458 static Lisp_Object
7459 make_conversion_work_buffer (bool multibyte)
7460 {
7461   Lisp_Object name, workbuf;
7462   struct buffer *current;
7463
7464   if (reused_workbuf_in_use)
7465     {
7466       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7467       workbuf = Fget_buffer_create (name);
7468     }
7469   else
7470     {
7471       reused_workbuf_in_use = 1;
7472       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7473         Vcode_conversion_reused_workbuf
7474           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7475       workbuf = Vcode_conversion_reused_workbuf;
7476     }
7477   current = current_buffer;
7478   set_buffer_internal (XBUFFER (workbuf));
7479   /* We can't allow modification hooks to run in the work buffer.  For
7480      instance, directory_files_internal assumes that file decoding
7481      doesn't compile new regexps.  */
7482   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7483   Ferase_buffer ();
7484   bset_undo_list (current_buffer, Qt);
7485   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7486   set_buffer_internal (current);
7487   return workbuf;
7488 }
7489
7490
7491 static Lisp_Object
7492 code_conversion_restore (Lisp_Object arg)
7493 {
7494   Lisp_Object current, workbuf;
7495   struct gcpro gcpro1;
7496
7497   GCPRO1 (arg);
7498   current = XCAR (arg);
7499   workbuf = XCDR (arg);
7500   if (! NILP (workbuf))
7501     {
7502       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7503         reused_workbuf_in_use = 0;
7504       else
7505         Fkill_buffer (workbuf);
7506     }
7507   set_buffer_internal (XBUFFER (current));
7508   UNGCPRO;
7509   return Qnil;
7510 }
7511
7512 Lisp_Object
7513 code_conversion_save (bool with_work_buf, bool multibyte)
7514 {
7515   Lisp_Object workbuf = Qnil;
7516
7517   if (with_work_buf)
7518     workbuf = make_conversion_work_buffer (multibyte);
7519   record_unwind_protect (code_conversion_restore,
7520                          Fcons (Fcurrent_buffer (), workbuf));
7521   return workbuf;
7522 }
7523
7524 void
7525 decode_coding_gap (struct coding_system *coding,
7526                    ptrdiff_t chars, ptrdiff_t bytes)
7527 {
7528   ptrdiff_t count = SPECPDL_INDEX ();
7529   Lisp_Object attrs;
7530
7531   code_conversion_save (0, 0);
7532
7533   coding->src_object = Fcurrent_buffer ();
7534   coding->src_chars = chars;
7535   coding->src_bytes = bytes;
7536   coding->src_pos = -chars;
7537   coding->src_pos_byte = -bytes;
7538   coding->src_multibyte = chars < bytes;
7539   coding->dst_object = coding->src_object;
7540   coding->dst_pos = PT;
7541   coding->dst_pos_byte = PT_BYTE;
7542   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7543
7544   if (CODING_REQUIRE_DETECTION (coding))
7545     detect_coding (coding);
7546
7547   coding->mode |= CODING_MODE_LAST_BLOCK;
7548   current_buffer->text->inhibit_shrinking = 1;
7549   decode_coding (coding);
7550   current_buffer->text->inhibit_shrinking = 0;
7551
7552   attrs = CODING_ID_ATTRS (coding->id);
7553   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7554     {
7555       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7556       Lisp_Object val;
7557
7558       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7559       val = call1 (CODING_ATTR_POST_READ (attrs),
7560                    make_number (coding->produced_char));
7561       CHECK_NATNUM (val);
7562       coding->produced_char += Z - prev_Z;
7563       coding->produced += Z_BYTE - prev_Z_BYTE;
7564     }
7565
7566   unbind_to (count, Qnil);
7567 }
7568
7569
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, the text is at point of the buffer.  FROM and TO
   are positions in the buffer.

   If it is a string, the text is at the beginning of the string.
   FROM and TO are indices to the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices to coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.

   The value of Vdeactivate_mark on entry is restored before
   returning.  */

void
decode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  unsigned char *destination IF_LINT (= NULL);
  ptrdiff_t dst_bytes IF_LINT (= 0);
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless the text is replaced in place.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
  bool need_marker_adjustment = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  /* Remember the memory area supplied by the caller; it is filled
     after decoding if a work buffer had to be used.  */
  if (NILP (dst_object))
    {
      destination = coding->destination;
      dst_bytes = coding->dst_bytes;
    }

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  /* More bytes than characters means the source is multibyte.  */
  coding->src_multibyte = chars < bytes;

  if (STRINGP (src_object))
    {
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      set_buffer_internal (XBUFFER (src_object));
      if (from != GPT)
        move_gap_both (from, from_byte);
      if (EQ (src_object, dst_object))
        {
          struct Lisp_Marker *tail;

          /* Flag the markers that sit exactly at the edge of the
             region (at FROM for insertion-type markers, at TO for the
             others), so that they can be repositioned once the text
             has been replaced.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            {
              tail->need_adjustment
                = tail->charpos == (tail->insertion_type ? from : to);
              need_marker_adjustment |= tail->need_adjustment;
            }
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          TEMP_SET_PT_BOTH (from, from_byte);
          /* Delete the source text from the buffer and describe it by
             negative positions instead.  Shrinking of the buffer text
             stays inhibited until point is restored below.  */
          current_buffer->text->inhibit_shrinking = 1;
          del_range_both (from, from_byte, to, to_byte, 1);
          coding->src_pos = -chars;
          coding->src_pos_byte = -bytes;
        }
      else
        {
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }

  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);

  /* Decode into a work buffer when a string is wanted, or when the
     result goes to memory but a post-read-conversion function has to
     run on it first.  */
  if (EQ (dst_object, Qt)
      || (! NILP (CODING_ATTR_POST_READ (attrs))
          && NILP (dst_object)))
    {
      coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
      coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
      coding->dst_pos = BEG;
      coding->dst_pos_byte = BEG_BYTE;
    }
  else if (BUFFERP (dst_object))
    {
      code_conversion_save (0, 0);
      coding->dst_object = dst_object;
      coding->dst_pos = BUF_PT (XBUFFER (dst_object));
      coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else
    {
      code_conversion_save (0, 0);
      coding->dst_object = Qnil;
      /* Most callers presume this will return a multibyte result, and they
         won't use `binary' or `raw-text' anyway, so let's not worry about
         CODING_FOR_UNIBYTE.  */
      coding->dst_multibyte = 1;
    }

  decode_coding (coding);

  if (BUFFERP (coding->dst_object))
    set_buffer_internal (XBUFFER (coding->dst_object));

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;

      /* Call the post-read-conversion function with point at the
         start of the decoded text and the number of decoded
         characters as its argument.  */
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
              old_deactivate_mark);
      val = safe_call1 (CODING_ATTR_POST_READ (attrs),
                        make_number (coding->produced_char));
      UNGCPRO;
      CHECK_NATNUM (val);
      /* Account for whatever the function inserted or deleted.  */
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
    }

  if (EQ (dst_object, Qt))
    {
      coding->dst_object = Fbuffer_string ();
    }
  else if (NILP (dst_object) && BUFFERP (coding->dst_object))
    {
      /* The text was decoded into a work buffer; move it to the
         memory area of the caller.
         NOTE(review): the copy below happens only when the area has
         to grow; if DST_BYTES already suffices nothing is copied
         here -- confirm that this is intended.  */
      set_buffer_internal (XBUFFER (coding->dst_object));
      if (dst_bytes < coding->produced)
        {
          destination = xrealloc (destination, coding->produced);
          if (! destination)
            {
              record_conversion_result (coding,
                                        CODING_RESULT_INSUFFICIENT_MEM);
              unbind_to (count, Qnil);
              return;
            }
          /* Make sure the gap does not lie inside the text that is
             about to be copied.  */
          if (BEGV < GPT && GPT < BEGV + coding->produced_char)
            move_gap_both (BEGV, BEGV_BYTE);
          memcpy (destination, BEGV_ADDR, coding->produced);
          coding->destination = destination;
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      current_buffer->text->inhibit_shrinking = 0;
      /* Point before the region is unaffected; point inside the
         region goes to FROM; point after the region is shifted by the
         change in size of the text.  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the markers flagged above at the start or at the end
             of the decoded text, depending on their insertion
             type.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, coding->dst_object);
}
7787
7788
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer or a string; any other value is handled as
   text at CODING->source.

   If CODING has a pre-write-conversion function, the source text is
   first copied into a work buffer, the function is called on that
   buffer, and the buffer that is current afterwards is encoded in
   full.

   DST_OBJECT is a buffer, Qt, or something else.

   If it is a buffer that is the same as SRC_OBJECT, the source text
   is deleted and the destination position is FROM.  If it is another
   buffer, the destination position is point of that buffer.

   If it is Qt, a unibyte string is made from the encoded bytes and
   set in CODING->dst_object.

   Otherwise CODING->dst_object is set to Qnil before encoding.

   The value of Vdeactivate_mark on entry is restored before
   returning.  */

void
encode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless the text is replaced in place.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
  bool need_marker_adjustment = 0;
  bool kill_src_buffer = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  /* More bytes than characters means the source is multibyte.  */
  coding->src_multibyte = chars < bytes;

  attrs = CODING_ID_ATTRS (coding->id);

  if (EQ (src_object, dst_object))
    {
      struct Lisp_Marker *tail;

      /* Flag the markers that sit exactly at the edge of the region
         (at FROM for insertion-type markers, at TO for the others),
         so that they can be repositioned once the text has been
         replaced.  */
      for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
        {
          tail->need_adjustment
            = tail->charpos == (tail->insertion_type ? from : to);
          need_marker_adjustment |= tail->need_adjustment;
        }
    }

  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    {
      /* Copy the source text into a work buffer for the
         pre-write-conversion function to operate on.  */
      coding->src_object = code_conversion_save (1, coding->src_multibyte);
      set_buffer_internal (XBUFFER (coding->src_object));
      if (STRINGP (src_object))
        insert_from_string (src_object, from, from_byte, chars, bytes, 0);
      else if (BUFFERP (src_object))
        insert_from_buffer (XBUFFER (src_object), from, chars, 0);
      else
        insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);

      if (EQ (src_object, dst_object))
        {
          /* Replacing in place: the original text is no longer
             needed, as a copy of it is in the work buffer.  */
          set_buffer_internal (XBUFFER (src_object));
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          del_range_both (from, from_byte, to, to_byte, 1);
          set_buffer_internal (XBUFFER (coding->src_object));
        }

      {
        struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;

        GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
                old_deactivate_mark);
        safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
                    make_number (BEG), make_number (Z));
        UNGCPRO;
      }
      /* If the function left another buffer current, that buffer
         becomes the source and is killed when encoding is done.  */
      if (XBUFFER (coding->src_object) != current_buffer)
        kill_src_buffer = 1;
      coding->src_object = Fcurrent_buffer ();
      if (BEG != GPT)
        move_gap_both (BEG, BEG_BYTE);
      /* Encode the whole buffer.  */
      coding->src_chars = Z - BEG;
      coding->src_bytes = Z_BYTE - BEG_BYTE;
      coding->src_pos = BEG;
      coding->src_pos_byte = BEG_BYTE;
      coding->src_multibyte = Z < Z_BYTE;
    }
  else if (STRINGP (src_object))
    {
      code_conversion_save (0, 0);
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      code_conversion_save (0, 0);
      set_buffer_internal (XBUFFER (src_object));
      if (EQ (src_object, dst_object))
        {
          /* Replacing in place: delete the region and encode from
             the object returned by del_range_1.
             NOTE(review): presumably a string holding the deleted
             text -- confirm.  */
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          coding->src_object = del_range_1 (from, to, 1, 1);
          coding->src_pos = 0;
          coding->src_pos_byte = 0;
        }
      else
        {
          /* Move the gap out of the region if it lies inside or at
             the end of it.  */
          if (from < GPT && to >= GPT)
            move_gap_both (from, from_byte);
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }
  else
    code_conversion_save (0, 0);

  if (BUFFERP (dst_object))
    {
      coding->dst_object = dst_object;
      if (EQ (src_object, dst_object))
        {
          coding->dst_pos = from;
          coding->dst_pos_byte = from_byte;
        }
      else
        {
          struct buffer *current = current_buffer;

          /* Use point of the destination buffer, and move the gap of
             that buffer there.  */
          set_buffer_temp (XBUFFER (dst_object));
          coding->dst_pos = PT;
          coding->dst_pos_byte = PT_BYTE;
          move_gap_both (coding->dst_pos, coding->dst_pos_byte);
          set_buffer_temp (current);
        }
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else if (EQ (dst_object, Qt))
    {
      /* Encode into a memory area allocated here; it is turned into
         a string and freed after encoding.  The initial size is one
         byte per source character, but at least one byte.  */
      ptrdiff_t dst_bytes = max (1, coding->src_chars);
      coding->dst_object = Qnil;
      coding->destination = xmalloc (dst_bytes);
      coding->dst_bytes = dst_bytes;
      coding->dst_multibyte = 0;
    }
  else
    {
      coding->dst_object = Qnil;
      coding->dst_multibyte = 0;
    }

  encode_coding (coding);

  if (EQ (dst_object, Qt))
    {
      if (BUFFERP (coding->dst_object))
        coding->dst_object = Fbuffer_string ();
      else
        {
          coding->dst_object
            = make_unibyte_string ((char *) coding->destination,
                                   coding->produced);
          xfree (coding->destination);
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      /* Point before the region is unaffected; point inside the
         region goes to FROM; point after the region is shifted by the
         change in size of the text.  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the markers flagged above at the start or at the end
             of the encoded text, depending on their insertion
             type.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  if (kill_src_buffer)
    Fkill_buffer (coding->src_object);

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, Qnil);
}
7991
7992
7993 Lisp_Object
7994 preferred_coding_system (void)
7995 {
7996   int id = coding_categories[coding_priorities[0]].id;
7997
7998   return CODING_ID_NAME (id);
7999 }
8000
8001 \f
8002 #ifdef emacs
8003 /*** 11. Emacs Lisp library functions ***/
8004
8005 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8006        doc: /* Return t if OBJECT is nil or a coding-system.
8007 See the documentation of `define-coding-system' for information
8008 about coding-system objects.  */)
8009   (Lisp_Object object)
8010 {
8011   if (NILP (object)
8012       || CODING_SYSTEM_ID (object) >= 0)
8013     return Qt;
8014   if (! SYMBOLP (object)
8015       || NILP (Fget (object, Qcoding_system_define_form)))
8016     return Qnil;
8017   return Qt;
8018 }
8019
8020 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8021        Sread_non_nil_coding_system, 1, 1, 0,
8022        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8023   (Lisp_Object prompt)
8024 {
8025   Lisp_Object val;
8026   do
8027     {
8028       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8029                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8030     }
8031   while (SCHARS (val) == 0);
8032   return (Fintern (val, Qnil));
8033 }
8034
8035 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8036        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8037 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8038 Ignores case when completing coding systems (all Emacs coding systems
8039 are lower-case).  */)
8040   (Lisp_Object prompt, Lisp_Object default_coding_system)
8041 {
8042   Lisp_Object val;
8043   ptrdiff_t count = SPECPDL_INDEX ();
8044
8045   if (SYMBOLP (default_coding_system))
8046     default_coding_system = SYMBOL_NAME (default_coding_system);
8047   specbind (Qcompletion_ignore_case, Qt);
8048   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8049                           Qt, Qnil, Qcoding_system_history,
8050                           default_coding_system, Qnil);
8051   unbind_to (count, Qnil);
8052   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8053 }
8054
8055 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8056        1, 1, 0,
8057        doc: /* Check validity of CODING-SYSTEM.
8058 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8059 It is valid if it is nil or a symbol defined as a coding system by the
8060 function `define-coding-system'.  */)
8061   (Lisp_Object coding_system)
8062 {
8063   Lisp_Object define_form;
8064
8065   define_form = Fget (coding_system, Qcoding_system_define_form);
8066   if (! NILP (define_form))
8067     {
8068       Fput (coding_system, Qcoding_system_define_form, Qnil);
8069       safe_eval (define_form);
8070     }
8071   if (!NILP (Fcoding_system_p (coding_system)))
8072     return coding_system;
8073   xsignal1 (Qcoding_system_error, coding_system);
8074 }
8075
8076 \f
8077 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8078    HIGHEST, return the coding system of the highest
8079    priority among the detected coding systems.  Otherwise return a
8080    list of detected coding systems sorted by their priorities.  If
8081    MULTIBYTEP, it is assumed that the bytes are in correct
8082    multibyte form but contains only ASCII and eight-bit chars.
8083    Otherwise, the bytes are raw bytes.
8084
8085    CODING-SYSTEM controls the detection as below:
8086
8087    If it is nil, detect both text-format and eol-format.  If the
8088    text-format part of CODING-SYSTEM is already specified
8089    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8090    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8091    detect only text-format.  */
8092
     /* SRC_CHARS is only recorded in the local coding_system structure
        that is handed to the detectors.  With HIGHEST the value is nil
        when no coding system ends up in the result list.  */
8093 Lisp_Object
8094 detect_coding_system (const unsigned char *src,
8095                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8096                       bool highest, bool multibytep,
8097                       Lisp_Object coding_system)
8098 {
8099   const unsigned char *src_end = src + src_bytes;
8100   Lisp_Object attrs, eol_type;
8101   Lisp_Object val = Qnil;
8102   struct coding_system coding;
8103   ptrdiff_t id;
8104   struct coding_detection_info detect_info;
8105   enum coding_category base_category;
8106   bool null_byte_found = 0, eight_bit_found = 0;
8107
       /* A nil CODING-SYSTEM is treated as Qundecided.  */
8108   if (NILP (coding_system))
8109     coding_system = Qundecided;
8110   setup_coding_system (coding_system, &coding);
8111   attrs = CODING_ID_ATTRS (coding.id);
8112   eol_type = CODING_ID_EOL_TYPE (coding.id);
8113   coding_system = CODING_ATTR_BASE_NAME (attrs);
8114
       /* Describe the input to the detectors.  */
8115   coding.source = src;
8116   coding.src_chars = src_chars;
8117   coding.src_bytes = src_bytes;
8118   coding.src_multibyte = multibytep;
8119   coding.consumed = 0;
8120   coding.mode |= CODING_MODE_LAST_BLOCK;
8121   coding.head_ascii = 0;
8122
       /* No category has been checked, found or rejected yet.  */
8123   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8124
8125   /* At first, detect text-format if necessary.  */
8126   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8127   if (base_category == coding_category_undecided)
8128     {
8129       enum coding_category category IF_LINT (= 0);
8130       struct coding_system *this IF_LINT (= NULL);
8131       int c, i;
8132
8133       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8134       for (; src < src_end; src++)
8135         {
8136           c = *src;
               /* High bit set: an 8-bit byte.  The scan stops as soon as
                  both an 8-bit byte and a null byte have been seen.  */
8137           if (c & 0x80)
8138             {
8139               eight_bit_found = 1;
8140               if (null_byte_found)
8141                 break;
8142             }
8143           else if (c < 0x20)
8144             {
                   /* ESC, SI and SO trigger the ISO-2022 detector, but only
                      while no category has been marked as checked and
                      escape detection is not inhibited.  */
8145               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8146                   && ! inhibit_iso_escape_detection
8147                   && ! detect_info.checked)
8148                 {
8149                   if (detect_coding_iso_2022 (&coding, &detect_info))
8150                     {
8151                       /* We have scanned the whole data.  */
8152                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8153                         {
8154                           /* We didn't find an 8-bit code.  We may
8155                              have found a null-byte, but it's very
8156                              rare that a binary file conform to
8157                              ISO-2022.  */
8158                           src = src_end;
8159                           coding.head_ascii = src - coding.source;
8160                         }
8161                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8162                       break;
8163                     }
8164                 }
8165               else if (! c && !inhibit_null_byte_detection)
8166                 {
8167                   null_byte_found = 1;
8168                   if (eight_bit_found)
8169                     break;
8170                 }
                   /* head_ascii only grows until the first 8-bit byte.  */
8171               if (! eight_bit_found)
8172                 coding.head_ascii++;
8173             }
8174           else if (! eight_bit_found)
8175             coding.head_ascii++;
8176         }
8177
           /* Consult the per-category data unless the scan above saw
              neither a null nor an 8-bit byte, counted every byte in
              head_ascii, and found no category.  */
8178       if (null_byte_found || eight_bit_found
8179           || coding.head_ascii < coding.src_bytes
8180           || detect_info.found)
8181         {
8182           if (coding.head_ascii == coding.src_bytes)
8183             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8184             for (i = 0; i < coding_category_raw_text; i++)
8185               {
8186                 category = coding_priorities[i];
8187                 this = coding_categories + category;
8188                 if (detect_info.found & (1 << category))
8189                   break;
8190               }
8191           else
8192             {
                   /* A null byte was seen: mark every category except the
                      UTF-16 ones as already checked and rejected.  */
8193               if (null_byte_found)
8194                 {
8195                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8196                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8197                 }
                   /* Try the categories in priority order.  CATEGORY and
                      THIS keep the values of the iteration at which the
                      loop stops; they are read again further below.  */
8198               for (i = 0; i < coding_category_raw_text; i++)
8199                 {
8200                   category = coding_priorities[i];
8201                   this = coding_categories + category;
8202
8203                   if (this->id < 0)
8204                     {
8205                       /* No coding system of this category is defined.  */
8206                       detect_info.rejected |= (1 << category);
8207                     }
                       /* Categories at or beyond raw-text are not probed.  */
8208                   else if (category >= coding_category_raw_text)
8209                     continue;
                       /* Already checked: with HIGHEST, stop at the first
                          such category that was found.  */
8210                   else if (detect_info.checked & (1 << category))
8211                     {
8212                       if (highest
8213                           && (detect_info.found & (1 << category)))
8214                         break;
8215                     }
                       /* Not checked yet: call this category's detector.  */
8216                   else if ((*(this->detector)) (&coding, &detect_info)
8217                            && highest
8218                            && (detect_info.found & (1 << category)))
8219                     {
                           /* For the auto UTF-16 category, narrow CATEGORY
                              to the LE or BE one according to what was
                              found.  THIS is left unchanged.  */
8220                       if (category == coding_category_utf_16_auto)
8221                         {
8222                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8223                             category = coding_category_utf_16_le;
8224                           else
8225                             category = coding_category_utf_16_be;
8226                         }
8227                       break;
8228                     }
8229                 }
8230             }
8231         }
8232
           /* Every category rejected, or a null byte present: the result
              is the single coding system Qno_conversion.  */
8233       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8234           || null_byte_found)
8235         {
8236           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8237           id = CODING_SYSTEM_ID (Qno_conversion);
8238           val = Fcons (make_number (id), Qnil);
8239         }
           /* Nothing rejected and nothing found: answer with the coding
              system of the undecided category.  */
8240       else if (! detect_info.rejected && ! detect_info.found)
8241         {
8242           detect_info.found = CATEGORY_MASK_ANY;
8243           id = coding_categories[coding_category_undecided].id;
8244           val = Fcons (make_number (id), Qnil);
8245         }
8246       else if (highest)
8247         {
               /* Something was found: use CATEGORY and THIS as left by
                  the loops above.  */
8248           if (detect_info.found)
8249             {
8250               detect_info.found = 1 << category;
8251               val = Fcons (make_number (this->id), Qnil);
8252             }
               /* Nothing found: take the first category in priority order
                  that was not rejected.  */
8253           else
8254             for (i = 0; i < coding_category_raw_text; i++)
8255               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8256                 {
8257                   detect_info.found = 1 << coding_priorities[i];
8258                   id = coding_categories[coding_priorities[i]].id;
8259                   val = Fcons (make_number (id), Qnil);
8260                   break;
8261                 }
8262         }
8263       else
8264         {
8265           int mask = detect_info.rejected | detect_info.found;
8266           int found = 0;
8267
               /* Both loops walk the priorities backwards while consing
                  onto the front of VAL, so each group ends up in priority
                  order.  First the categories that were neither rejected
                  nor found...  */
8268           for (i = coding_category_raw_text - 1; i >= 0; i--)
8269             {
8270               category = coding_priorities[i];
8271               if (! (mask & (1 << category)))
8272                 {
8273                   found |= 1 << category;
8274                   id = coding_categories[category].id;
8275                   if (id >= 0)
8276                     val = Fcons (make_number (id), val);
8277                 }
8278             }
               /* ...then the found ones, which thereby precede them.  */
8279           for (i = coding_category_raw_text - 1; i >= 0; i--)
8280             {
8281               category = coding_priorities[i];
8282               if (detect_info.found & (1 << category))
8283                 {
8284                   id = coding_categories[category].id;
8285                   val = Fcons (make_number (id), val);
8286                 }
8287             }
8288           detect_info.found |= found;
8289         }
8290     }
       /* The base category is the auto UTF-8 one: only choose between the
          utf_8_sig and utf_8_nosig categories.  VAL stays nil when the
          detector returns zero.  */
8291   else if (base_category == coding_category_utf_8_auto)
8292     {
8293       if (detect_coding_utf_8 (&coding, &detect_info))
8294         {
8295           struct coding_system *this;
8296
8297           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8298             this = coding_categories + coding_category_utf_8_sig;
8299           else
8300             this = coding_categories + coding_category_utf_8_nosig;
8301           val = Fcons (make_number (this->id), Qnil);
8302         }
8303     }
       /* Likewise for the auto UTF-16 category: choose among the LE, BE
          and the two *_nosig categories.  */
8304   else if (base_category == coding_category_utf_16_auto)
8305     {
8306       if (detect_coding_utf_16 (&coding, &detect_info))
8307         {
8308           struct coding_system *this;
8309
8310           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8311             this = coding_categories + coding_category_utf_16_le;
8312           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8313             this = coding_categories + coding_category_utf_16_be;
8314           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8315             this = coding_categories + coding_category_utf_16_be_nosig;
8316           else
8317             this = coding_categories + coding_category_utf_16_le_nosig;
8318           val = Fcons (make_number (this->id), Qnil);
8319         }
8320     }
       /* Any other base category: the text format is taken from
          CODING-SYSTEM as is.  */
8321   else
8322     {
8323       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8324       val = Fcons (make_number (coding.id), Qnil);
8325     }
8326
8327   /* Then, detect eol-format if necessary.  */
8328   {
8329     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8330     Lisp_Object tail;
8331
         /* When EOL_TYPE is a vector the eol format is detected here,
            separately for non-UTF-16, UTF-16 BE and UTF-16 LE data.  When
            a null byte was found, LF is assumed for the non-UTF-16 case
            without scanning.  */
8332     if (VECTORP (eol_type))
8333       {
8334         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8335           {
8336             if (null_byte_found)
8337               normal_eol = EOL_SEEN_LF;
8338             else
8339               normal_eol = detect_eol (coding.source, src_bytes,
8340                                        coding_category_raw_text);
8341           }
8342         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8343                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8344           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8345                                       coding_category_utf_16_be);
8346         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8347                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8348           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8349                                       coding_category_utf_16_le);
8350       }
         /* Otherwise EOL_TYPE already names the format: Qunix means LF,
            Qdos means CRLF, and anything else is treated as CR.  */
8351     else
8352       {
8353         if (EQ (eol_type, Qunix))
8354           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8355         else if (EQ (eol_type, Qdos))
8356           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8357         else
8358           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8359       }
8360
         /* Replace each coding-system id in VAL, in place, by a name.  If
            the id's own EOL_TYPE is a vector and an eol format was seen,
            use the vector element for it (0 for LF, 1 for CRLF, 2 for
            CR); in every other case use CODING_ID_NAME.  */
8361     for (tail = val; CONSP (tail); tail = XCDR (tail))
8362       {
8363         enum coding_category category;
8364         int this_eol;
8365
8366         id = XINT (XCAR (tail));
8367         attrs = CODING_ID_ATTRS (id);
8368         category = XINT (CODING_ATTR_CATEGORY (attrs));
8369         eol_type = CODING_ID_EOL_TYPE (id);
8370         if (VECTORP (eol_type))
8371           {
8372             if (category == coding_category_utf_16_be
8373                 || category == coding_category_utf_16_be_nosig)
8374               this_eol = utf_16_be_eol;
8375             else if (category == coding_category_utf_16_le
8376                      || category == coding_category_utf_16_le_nosig)
8377               this_eol = utf_16_le_eol;
8378             else
8379               this_eol = normal_eol;
8380
8381             if (this_eol == EOL_SEEN_LF)
8382               XSETCAR (tail, AREF (eol_type, 0));
8383             else if (this_eol == EOL_SEEN_CRLF)
8384               XSETCAR (tail, AREF (eol_type, 1));
8385             else if (this_eol == EOL_SEEN_CR)
8386               XSETCAR (tail, AREF (eol_type, 2));
8387             else
8388               XSETCAR (tail, CODING_ID_NAME (id));
8389           }
8390         else
8391           XSETCAR (tail, CODING_ID_NAME (id));
8392       }
8393   }
8394
       /* With HIGHEST only the first element is returned (nil if VAL is
          empty); otherwise the whole list.  */
8395   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8396 }
8397
8398
8399 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8400        2, 3, 0,
8401        doc: /* Detect coding system of the text in the region between START and END.
8402 Return a list of possible coding systems ordered by priority.
8403 The coding systems to try and their priorities follows what
8404 the function `coding-system-priority-list' (which see) returns.
8405
8406 If only ASCII characters are found (except for such ISO-2022 control
8407 characters as ESC), it returns a list of single element `undecided'
8408 or its subsidiary coding system according to a detected end-of-line
8409 format.
8410
8411 If optional argument HIGHEST is non-nil, return the coding system of
8412 highest priority.  */)
8413   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8414 {
8415   ptrdiff_t from, to;
8416   ptrdiff_t from_byte, to_byte;
8417
8418   CHECK_NUMBER_COERCE_MARKER (start);
8419   CHECK_NUMBER_COERCE_MARKER (end);
8420
8421   validate_region (&start, &end);
8422   from = XINT (start), to = XINT (end);
8423   from_byte = CHAR_TO_BYTE (from);
8424   to_byte = CHAR_TO_BYTE (to);
8425
8426   if (from < GPT && to >= GPT)
8427     move_gap_both (to, to_byte);
8428
8429   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8430                                to - from, to_byte - from_byte,
8431                                !NILP (highest),
8432                                !NILP (BVAR (current_buffer
8433                                       , enable_multibyte_characters)),
8434                                Qnil);
8435 }
8436
8437 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8438        1, 2, 0,
8439        doc: /* Detect coding system of the text in STRING.
8440 Return a list of possible coding systems ordered by priority.
8441 The coding systems to try and their priorities follows what
8442 the function `coding-system-priority-list' (which see) returns.
8443
8444 If only ASCII characters are found (except for such ISO-2022 control
8445 characters as ESC), it returns a list of single element `undecided'
8446 or its subsidiary coding system according to a detected end-of-line
8447 format.
8448
8449 If optional argument HIGHEST is non-nil, return the coding system of
8450 highest priority.  */)
8451   (Lisp_Object string, Lisp_Object highest)
8452 {
8453   CHECK_STRING (string);
8454
8455   return detect_coding_system (SDATA (string),
8456                                SCHARS (string), SBYTES (string),
8457                                !NILP (highest), STRING_MULTIBYTE (string),
8458                                Qnil);
8459 }
8460
8461
8462 static inline bool
8463 char_encodable_p (int c, Lisp_Object attrs)
8464 {
8465   Lisp_Object tail;
8466   struct charset *charset;
8467   Lisp_Object translation_table;
8468
8469   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8470   if (! NILP (translation_table))
8471     c = translate_char (translation_table, c);
8472   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8473        CONSP (tail); tail = XCDR (tail))
8474     {
8475       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8476       if (CHAR_CHARSET_P (c, charset))
8477         break;
8478     }
8479   return (! NILP (tail));
8480 }
8481
8482
8483 /* Return a list of coding systems that safely encode the text between
8484    START and END.  If EXCLUDE is non-nil, it is a list of coding
8485    systems not to check.  The returned list doesn't contain any such
8486    coding systems.  In any case, if the text contains only ASCII or is
8487    unibyte, return t.  */
8488
8489 DEFUN ("find-coding-systems-region-internal",
8490        Ffind_coding_systems_region_internal,
8491        Sfind_coding_systems_region_internal, 2, 3, 0,
8492        doc: /* Internal use only.  */)
8493   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8494 {
8495   Lisp_Object coding_attrs_list, safe_codings;
8496   ptrdiff_t start_byte, end_byte;
8497   const unsigned char *p, *pbeg, *pend;
8498   int c;
8499   Lisp_Object tail, elt, work_table;
8500
8501   if (STRINGP (start))
8502     {
8503       if (!STRING_MULTIBYTE (start)
8504           || SCHARS (start) == SBYTES (start))
8505         return Qt;
8506       start_byte = 0;
8507       end_byte = SBYTES (start);
8508     }
8509   else
8510     {
8511       CHECK_NUMBER_COERCE_MARKER (start);
8512       CHECK_NUMBER_COERCE_MARKER (end);
8513       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8514         args_out_of_range (start, end);
8515       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8516         return Qt;
8517       start_byte = CHAR_TO_BYTE (XINT (start));
8518       end_byte = CHAR_TO_BYTE (XINT (end));
8519       if (XINT (end) - XINT (start) == end_byte - start_byte)
8520         return Qt;
8521
8522       if (XINT (start) < GPT && XINT (end) > GPT)
8523         {
8524           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8525             move_gap_both (XINT (start), start_byte);
8526           else
8527             move_gap_both (XINT (end), end_byte);
8528         }
8529     }
8530
8531   coding_attrs_list = Qnil;
8532   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8533     if (NILP (exclude)
8534         || NILP (Fmemq (XCAR (tail), exclude)))
8535       {
8536         Lisp_Object attrs;
8537
8538         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8539         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8540             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8541           {
8542             ASET (attrs, coding_attr_trans_tbl,
8543                   get_translation_table (attrs, 1, NULL));
8544             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8545           }
8546       }
8547
8548   if (STRINGP (start))
8549     p = pbeg = SDATA (start);
8550   else
8551     p = pbeg = BYTE_POS_ADDR (start_byte);
8552   pend = p + (end_byte - start_byte);
8553
8554   while (p < pend && ASCII_BYTE_P (*p)) p++;
8555   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8556
8557   work_table = Fmake_char_table (Qnil, Qnil);
8558   while (p < pend)
8559     {
8560       if (ASCII_BYTE_P (*p))
8561         p++;
8562       else
8563         {
8564           c = STRING_CHAR_ADVANCE (p);
8565           if (!NILP (char_table_ref (work_table, c)))
8566             /* This character was already checked.  Ignore it.  */
8567             continue;
8568
8569           charset_map_loaded = 0;
8570           for (tail = coding_attrs_list; CONSP (tail);)
8571             {
8572               elt = XCAR (tail);
8573               if (NILP (elt))
8574                 tail = XCDR (tail);
8575               else if (char_encodable_p (c, elt))
8576                 tail = XCDR (tail);
8577               else if (CONSP (XCDR (tail)))
8578                 {
8579                   XSETCAR (tail, XCAR (XCDR (tail)));
8580                   XSETCDR (tail, XCDR (XCDR (tail)));
8581                 }
8582               else
8583                 {
8584                   XSETCAR (tail, Qnil);
8585                   tail = XCDR (tail);
8586                 }
8587             }
8588           if (charset_map_loaded)
8589             {
8590               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8591
8592               if (STRINGP (start))
8593                 pbeg = SDATA (start);
8594               else
8595                 pbeg = BYTE_POS_ADDR (start_byte);
8596               p = pbeg + p_offset;
8597               pend = pbeg + pend_offset;
8598             }
8599           char_table_set (work_table, c, Qt);
8600         }
8601     }
8602
8603   safe_codings = list2 (Qraw_text, Qno_conversion);
8604   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8605     if (! NILP (XCAR (tail)))
8606       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8607
8608   return safe_codings;
8609 }
8610
8611
8612 DEFUN ("unencodable-char-position", Funencodable_char_position,
8613        Sunencodable_char_position, 3, 5, 0,
8614        doc: /*
8615 Return position of first un-encodable character in a region.
8616 START and END specify the region and CODING-SYSTEM specifies the
8617 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8618
8619 If optional 4th argument COUNT is non-nil, it specifies at most how
8620 many un-encodable characters to search.  In this case, the value is a
8621 list of positions.
8622
8623 If optional 5th argument STRING is non-nil, it is a string to search
8624 for un-encodable characters.  In that case, START and END are indexes
8625 to the string.  */)
8626   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8627 {
8628   EMACS_INT n;
8629   struct coding_system coding;
8630   Lisp_Object attrs, charset_list, translation_table;
8631   Lisp_Object positions;
8632   ptrdiff_t from, to;
8633   const unsigned char *p, *stop, *pend;
8634   bool ascii_compatible;
8635
8636   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8637   attrs = CODING_ID_ATTRS (coding.id);
8638   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8639     return Qnil;
8640   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8641   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8642   translation_table = get_translation_table (attrs, 1, NULL);
8643
8644   if (NILP (string))
8645     {
8646       validate_region (&start, &end);
8647       from = XINT (start);
8648       to = XINT (end);
8649       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8650           || (ascii_compatible
8651               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8652         return Qnil;
8653       p = CHAR_POS_ADDR (from);
8654       pend = CHAR_POS_ADDR (to);
8655       if (from < GPT && to >= GPT)
8656         stop = GPT_ADDR;
8657       else
8658         stop = pend;
8659     }
8660   else
8661     {
8662       CHECK_STRING (string);
8663       CHECK_NATNUM (start);
8664       CHECK_NATNUM (end);
8665       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8666         args_out_of_range_3 (string, start, end);
8667       from = XINT (start);
8668       to = XINT (end);
8669       if (! STRING_MULTIBYTE (string))
8670         return Qnil;
8671       p = SDATA (string) + string_char_to_byte (string, from);
8672       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8673       if (ascii_compatible && (to - from) == (pend - p))
8674         return Qnil;
8675     }
8676
8677   if (NILP (count))
8678     n = 1;
8679   else
8680     {
8681       CHECK_NATNUM (count);
8682       n = XINT (count);
8683     }
8684
8685   positions = Qnil;
8686   charset_map_loaded = 0;
8687   while (1)
8688     {
8689       int c;
8690
8691       if (ascii_compatible)
8692         while (p < stop && ASCII_BYTE_P (*p))
8693           p++, from++;
8694       if (p >= stop)
8695         {
8696           if (p >= pend)
8697             break;
8698           stop = pend;
8699           p = GAP_END_ADDR;
8700         }
8701
8702       c = STRING_CHAR_ADVANCE (p);
8703       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8704           && ! char_charset (translate_char (translation_table, c),
8705                              charset_list, NULL))
8706         {
8707           positions = Fcons (make_number (from), positions);
8708           n--;
8709           if (n == 0)
8710             break;
8711         }
8712
8713       from++;
8714       if (charset_map_loaded && NILP (string))
8715         {
8716           p = CHAR_POS_ADDR (from);
8717           pend = CHAR_POS_ADDR (to);
8718           if (from < GPT && to >= GPT)
8719             stop = GPT_ADDR;
8720           else
8721             stop = pend;
8722           charset_map_loaded = 0;
8723         }
8724     }
8725
8726   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8727 }
8728
8729
8730 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8731        Scheck_coding_systems_region, 3, 3, 0,
8732        doc: /* Check if the region is encodable by coding systems.
8733
8734 START and END are buffer positions specifying the region.
8735 CODING-SYSTEM-LIST is a list of coding systems to check.
8736
8737 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8738 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8739 whole region, POS0, POS1, ... are buffer positions where non-encodable
8740 characters are found.
8741
8742 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8743 value is nil.
8744
8745 START may be a string.  In that case, check if the string is
8746 encodable, and the value contains indices to the string instead of
8747 buffer positions.  END is ignored.
8748
8749 If the current buffer (or START if it is a string) is unibyte, the value
8750 is nil.  */)
8751   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8752 {
8753   Lisp_Object list;
8754   ptrdiff_t start_byte, end_byte;
8755   ptrdiff_t pos;
8756   const unsigned char *p, *pbeg, *pend;
8757   int c;
8758   Lisp_Object tail, elt, attrs;
8759
8760   if (STRINGP (start))
8761     {
8762       if (!STRING_MULTIBYTE (start)
8763           || SCHARS (start) == SBYTES (start))
8764         return Qnil;
8765       start_byte = 0;
8766       end_byte = SBYTES (start);
8767       pos = 0;
8768     }
8769   else
8770     {
8771       CHECK_NUMBER_COERCE_MARKER (start);
8772       CHECK_NUMBER_COERCE_MARKER (end);
8773       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8774         args_out_of_range (start, end);
8775       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8776         return Qnil;
8777       start_byte = CHAR_TO_BYTE (XINT (start));
8778       end_byte = CHAR_TO_BYTE (XINT (end));
8779       if (XINT (end) - XINT (start) == end_byte - start_byte)
8780         return Qnil;
8781
8782       if (XINT (start) < GPT && XINT (end) > GPT)
8783         {
8784           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8785             move_gap_both (XINT (start), start_byte);
8786           else
8787             move_gap_both (XINT (end), end_byte);
8788         }
8789       pos = XINT (start);
8790     }
8791
8792   list = Qnil;
8793   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8794     {
8795       elt = XCAR (tail);
8796       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8797       ASET (attrs, coding_attr_trans_tbl,
8798             get_translation_table (attrs, 1, NULL));
8799       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8800     }
8801
8802   if (STRINGP (start))
8803     p = pbeg = SDATA (start);
8804   else
8805     p = pbeg = BYTE_POS_ADDR (start_byte);
8806   pend = p + (end_byte - start_byte);
8807
8808   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8809   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8810
8811   while (p < pend)
8812     {
8813       if (ASCII_BYTE_P (*p))
8814         p++;
8815       else
8816         {
8817           c = STRING_CHAR_ADVANCE (p);
8818
8819           charset_map_loaded = 0;
8820           for (tail = list; CONSP (tail); tail = XCDR (tail))
8821             {
8822               elt = XCDR (XCAR (tail));
8823               if (! char_encodable_p (c, XCAR (elt)))
8824                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8825             }
8826           if (charset_map_loaded)
8827             {
8828               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8829
8830               if (STRINGP (start))
8831                 pbeg = SDATA (start);
8832               else
8833                 pbeg = BYTE_POS_ADDR (start_byte);
8834               p = pbeg + p_offset;
8835               pend = pbeg + pend_offset;
8836             }
8837         }
8838       pos++;
8839     }
8840
8841   tail = list;
8842   list = Qnil;
8843   for (; CONSP (tail); tail = XCDR (tail))
8844     {
8845       elt = XCAR (tail);
8846       if (CONSP (XCDR (XCDR (elt))))
8847         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8848                       list);
8849     }
8850
8851   return list;
8852 }
8853
8854
8855 static Lisp_Object
8856 code_convert_region (Lisp_Object start, Lisp_Object end,
8857                      Lisp_Object coding_system, Lisp_Object dst_object,
8858                      bool encodep, bool norecord)
8859 {
8860   struct coding_system coding;
8861   ptrdiff_t from, from_byte, to, to_byte;
8862   Lisp_Object src_object;
8863
8864   CHECK_NUMBER_COERCE_MARKER (start);
8865   CHECK_NUMBER_COERCE_MARKER (end);
8866   if (NILP (coding_system))
8867     coding_system = Qno_conversion;
8868   else
8869     CHECK_CODING_SYSTEM (coding_system);
8870   src_object = Fcurrent_buffer ();
8871   if (NILP (dst_object))
8872     dst_object = src_object;
8873   else if (! EQ (dst_object, Qt))
8874     CHECK_BUFFER (dst_object);
8875
8876   validate_region (&start, &end);
8877   from = XFASTINT (start);
8878   from_byte = CHAR_TO_BYTE (from);
8879   to = XFASTINT (end);
8880   to_byte = CHAR_TO_BYTE (to);
8881
8882   setup_coding_system (coding_system, &coding);
8883   coding.mode |= CODING_MODE_LAST_BLOCK;
8884
8885   if (encodep)
8886     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8887                           dst_object);
8888   else
8889     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8890                           dst_object);
8891   if (! norecord)
8892     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8893
8894   return (BUFFERP (dst_object)
8895           ? make_number (coding.produced_char)
8896           : coding.dst_object);
8897 }
8898
8899
8900 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8901        3, 4, "r\nzCoding system: ",
8902        doc: /* Decode the current region from the specified coding system.
8903 When called from a program, takes four arguments:
8904         START, END, CODING-SYSTEM, and DESTINATION.
8905 START and END are buffer positions.
8906
8907 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8908 If nil, the region between START and END is replaced by the decoded text.
8909 If buffer, the decoded text is inserted in that buffer after point (point
8910 does not move).
8911 In those cases, the length of the decoded text is returned.
8912 If DESTINATION is t, the decoded text is returned.
8913
8914 This function sets `last-coding-system-used' to the precise coding system
8915 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8916 not fully specified.)  */)
8917   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8918 {
8919   return code_convert_region (start, end, coding_system, destination, 0, 0);
8920 }
8921
8922 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8923        3, 4, "r\nzCoding system: ",
8924        doc: /* Encode the current region by specified coding system.
8925 When called from a program, takes four arguments:
8926         START, END, CODING-SYSTEM and DESTINATION.
8927 START and END are buffer positions.
8928
8929 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8930 If nil, the region between START and END is replace by the encoded text.
8931 If buffer, the encoded text is inserted in that buffer after point (point
8932 does not move).
8933 In those cases, the length of the encoded text is returned.
8934 If DESTINATION is t, the encoded text is returned.
8935
8936 This function sets `last-coding-system-used' to the precise coding system
8937 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8938 not fully specified.)  */)
8939   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8940 {
8941   return code_convert_region (start, end, coding_system, destination, 1, 0);
8942 }
8943
8944 Lisp_Object
8945 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8946                      Lisp_Object dst_object, bool encodep, bool nocopy,
8947                      bool norecord)
8948 {
8949   struct coding_system coding;
8950   ptrdiff_t chars, bytes;
8951
8952   CHECK_STRING (string);
8953   if (NILP (coding_system))
8954     {
8955       if (! norecord)
8956         Vlast_coding_system_used = Qno_conversion;
8957       if (NILP (dst_object))
8958         return (nocopy ? Fcopy_sequence (string) : string);
8959     }
8960
8961   if (NILP (coding_system))
8962     coding_system = Qno_conversion;
8963   else
8964     CHECK_CODING_SYSTEM (coding_system);
8965   if (NILP (dst_object))
8966     dst_object = Qt;
8967   else if (! EQ (dst_object, Qt))
8968     CHECK_BUFFER (dst_object);
8969
8970   setup_coding_system (coding_system, &coding);
8971   coding.mode |= CODING_MODE_LAST_BLOCK;
8972   chars = SCHARS (string);
8973   bytes = SBYTES (string);
8974   if (encodep)
8975     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8976   else
8977     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8978   if (! norecord)
8979     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8980
8981   return (BUFFERP (dst_object)
8982           ? make_number (coding.produced_char)
8983           : coding.dst_object);
8984 }
8985
8986
8987 /* Encode or decode STRING according to CODING_SYSTEM.
8988    Do not set Vlast_coding_system_used.
8989
8990    This function is called only from macros DECODE_FILE and
8991    ENCODE_FILE, thus we ignore character composition.  */
8992
8993 Lisp_Object
8994 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8995                               bool encodep)
8996 {
8997   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8998 }
8999
9000
9001 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9002        2, 4, 0,
9003        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9004
9005 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9006 if the decoding operation is trivial.
9007
9008 Optional fourth arg BUFFER non-nil means that the decoded text is
9009 inserted in that buffer after point (point does not move).  In this
9010 case, the return value is the length of the decoded text.
9011
9012 This function sets `last-coding-system-used' to the precise coding system
9013 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9014 not fully specified.)  */)
9015   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9016 {
9017   return code_convert_string (string, coding_system, buffer,
9018                               0, ! NILP (nocopy), 0);
9019 }
9020
9021 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9022        2, 4, 0,
9023        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9024
9025 Optional third arg NOCOPY non-nil means it is OK to return STRING
9026 itself if the encoding operation is trivial.
9027
9028 Optional fourth arg BUFFER non-nil means that the encoded text is
9029 inserted in that buffer after point (point does not move).  In this
9030 case, the return value is the length of the encoded text.
9031
9032 This function sets `last-coding-system-used' to the precise coding system
9033 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9034 not fully specified.)  */)
9035   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9036 {
9037   return code_convert_string (string, coding_system, buffer,
9038                               1, ! NILP (nocopy), 0);
9039 }
9040
9041 \f
9042 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9043        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9044 Return the corresponding character.  */)
9045   (Lisp_Object code)
9046 {
9047   Lisp_Object spec, attrs, val;
9048   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9049   EMACS_INT ch;
9050   int c;
9051
9052   CHECK_NATNUM (code);
9053   ch = XFASTINT (code);
9054   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9055   attrs = AREF (spec, 0);
9056
9057   if (ASCII_BYTE_P (ch)
9058       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9059     return code;
9060
9061   val = CODING_ATTR_CHARSET_LIST (attrs);
9062   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9063   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9064   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9065
9066   if (ch <= 0x7F)
9067     {
9068       c = ch;
9069       charset = charset_roman;
9070     }
9071   else if (ch >= 0xA0 && ch < 0xDF)
9072     {
9073       c = ch - 0x80;
9074       charset = charset_kana;
9075     }
9076   else
9077     {
9078       EMACS_INT c1 = ch >> 8;
9079       int c2 = ch & 0xFF;
9080
9081       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9082           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9083         error ("Invalid code: %"pI"d", ch);
9084       c = ch;
9085       SJIS_TO_JIS (c);
9086       charset = charset_kanji;
9087     }
9088   c = DECODE_CHAR (charset, c);
9089   if (c < 0)
9090     error ("Invalid code: %"pI"d", ch);
9091   return make_number (c);
9092 }
9093
9094
9095 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9096        doc: /* Encode a Japanese character CH to shift_jis encoding.
9097 Return the corresponding code in SJIS.  */)
9098   (Lisp_Object ch)
9099 {
9100   Lisp_Object spec, attrs, charset_list;
9101   int c;
9102   struct charset *charset;
9103   unsigned code;
9104
9105   CHECK_CHARACTER (ch);
9106   c = XFASTINT (ch);
9107   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9108   attrs = AREF (spec, 0);
9109
9110   if (ASCII_CHAR_P (c)
9111       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9112     return ch;
9113
9114   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9115   charset = char_charset (c, charset_list, &code);
9116   if (code == CHARSET_INVALID_CODE (charset))
9117     error ("Can't encode by shift_jis encoding: %c", c);
9118   JIS_TO_SJIS (code);
9119
9120   return make_number (code);
9121 }
9122
9123 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9124        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9125 Return the corresponding character.  */)
9126   (Lisp_Object code)
9127 {
9128   Lisp_Object spec, attrs, val;
9129   struct charset *charset_roman, *charset_big5, *charset;
9130   EMACS_INT ch;
9131   int c;
9132
9133   CHECK_NATNUM (code);
9134   ch = XFASTINT (code);
9135   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9136   attrs = AREF (spec, 0);
9137
9138   if (ASCII_BYTE_P (ch)
9139       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9140     return code;
9141
9142   val = CODING_ATTR_CHARSET_LIST (attrs);
9143   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9144   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9145
9146   if (ch <= 0x7F)
9147     {
9148       c = ch;
9149       charset = charset_roman;
9150     }
9151   else
9152     {
9153       EMACS_INT b1 = ch >> 8;
9154       int b2 = ch & 0x7F;
9155       if (b1 < 0xA1 || b1 > 0xFE
9156           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9157         error ("Invalid code: %"pI"d", ch);
9158       c = ch;
9159       charset = charset_big5;
9160     }
9161   c = DECODE_CHAR (charset, c);
9162   if (c < 0)
9163     error ("Invalid code: %"pI"d", ch);
9164   return make_number (c);
9165 }
9166
9167 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9168        doc: /* Encode the Big5 character CH to BIG5 coding system.
9169 Return the corresponding character code in Big5.  */)
9170   (Lisp_Object ch)
9171 {
9172   Lisp_Object spec, attrs, charset_list;
9173   struct charset *charset;
9174   int c;
9175   unsigned code;
9176
9177   CHECK_CHARACTER (ch);
9178   c = XFASTINT (ch);
9179   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9180   attrs = AREF (spec, 0);
9181   if (ASCII_CHAR_P (c)
9182       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9183     return ch;
9184
9185   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9186   charset = char_charset (c, charset_list, &code);
9187   if (code == CHARSET_INVALID_CODE (charset))
9188     error ("Can't encode by Big5 encoding: %c", c);
9189
9190   return make_number (code);
9191 }
9192
9193 \f
9194 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9195        Sset_terminal_coding_system_internal, 1, 2, 0,
9196        doc: /* Internal use only.  */)
9197   (Lisp_Object coding_system, Lisp_Object terminal)
9198 {
9199   struct terminal *term = get_terminal (terminal, 1);
9200   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9201   CHECK_SYMBOL (coding_system);
9202   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9203   /* We had better not send unsafe characters to terminal.  */
9204   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9205   /* Character composition should be disabled.  */
9206   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9207   terminal_coding->src_multibyte = 1;
9208   terminal_coding->dst_multibyte = 0;
9209   tset_charset_list
9210     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9211             ? coding_charset_list (terminal_coding)
9212             : Fcons (make_number (charset_ascii), Qnil)));
9213   return Qnil;
9214 }
9215
9216 DEFUN ("set-safe-terminal-coding-system-internal",
9217        Fset_safe_terminal_coding_system_internal,
9218        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9219        doc: /* Internal use only.  */)
9220   (Lisp_Object coding_system)
9221 {
9222   CHECK_SYMBOL (coding_system);
9223   setup_coding_system (Fcheck_coding_system (coding_system),
9224                        &safe_terminal_coding);
9225   /* Character composition should be disabled.  */
9226   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9227   safe_terminal_coding.src_multibyte = 1;
9228   safe_terminal_coding.dst_multibyte = 0;
9229   return Qnil;
9230 }
9231
9232 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9233        Sterminal_coding_system, 0, 1, 0,
9234        doc: /* Return coding system specified for terminal output on the given terminal.
9235 TERMINAL may be a terminal object, a frame, or nil for the selected
9236 frame's terminal device.  */)
9237   (Lisp_Object terminal)
9238 {
9239   struct coding_system *terminal_coding
9240     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9241   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9242
9243   /* For backward compatibility, return nil if it is `undecided'.  */
9244   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9245 }
9246
9247 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9248        Sset_keyboard_coding_system_internal, 1, 2, 0,
9249        doc: /* Internal use only.  */)
9250   (Lisp_Object coding_system, Lisp_Object terminal)
9251 {
9252   struct terminal *t = get_terminal (terminal, 1);
9253   CHECK_SYMBOL (coding_system);
9254   if (NILP (coding_system))
9255     coding_system = Qno_conversion;
9256   else
9257     Fcheck_coding_system (coding_system);
9258   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9259   /* Character composition should be disabled.  */
9260   TERMINAL_KEYBOARD_CODING (t)->common_flags
9261     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9262   return Qnil;
9263 }
9264
9265 DEFUN ("keyboard-coding-system",
9266        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9267        doc: /* Return coding system specified for decoding keyboard input.  */)
9268   (Lisp_Object terminal)
9269 {
9270   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9271                          (get_terminal (terminal, 1))->id);
9272 }
9273
9274 \f
9275 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9276        Sfind_operation_coding_system,  1, MANY, 0,
9277        doc: /* Choose a coding system for an operation based on the target name.
9278 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9279 DECODING-SYSTEM is the coding system to use for decoding
9280 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9281 for encoding (in case OPERATION does encoding).
9282
9283 The first argument OPERATION specifies an I/O primitive:
9284   For file I/O, `insert-file-contents' or `write-region'.
9285   For process I/O, `call-process', `call-process-region', or `start-process'.
9286   For network I/O, `open-network-stream'.
9287
9288 The remaining arguments should be the same arguments that were passed
9289 to the primitive.  Depending on which primitive, one of those arguments
9290 is selected as the TARGET.  For example, if OPERATION does file I/O,
9291 whichever argument specifies the file name is TARGET.
9292
9293 TARGET has a meaning which depends on OPERATION:
9294   For file I/O, TARGET is a file name (except for the special case below).
9295   For process I/O, TARGET is a process name.
9296   For network I/O, TARGET is a service name or a port number.
9297
9298 This function looks up what is specified for TARGET in
9299 `file-coding-system-alist', `process-coding-system-alist',
9300 or `network-coding-system-alist' depending on OPERATION.
9301 They may specify a coding system, a cons of coding systems,
9302 or a function symbol to call.
9303 In the last case, we call the function with one argument,
9304 which is a list of all the arguments given to this function.
9305 If the function can't decide a coding system, it can return
9306 `undecided' so that the normal code-detection is performed.
9307
9308 If OPERATION is `insert-file-contents', the argument corresponding to
9309 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9310 file name to look up, and BUFFER is a buffer that contains the file's
9311 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9312 function to call for FILENAME, that function should examine the
9313 contents of BUFFER instead of reading the file.
9314
9315 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9316   (ptrdiff_t nargs, Lisp_Object *args)
9317 {
9318   Lisp_Object operation, target_idx, target, val;
9319   register Lisp_Object chain;
9320
9321   if (nargs < 2)
9322     error ("Too few arguments");
9323   operation = args[0];
9324   if (!SYMBOLP (operation)
9325       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9326     error ("Invalid first argument");
9327   if (nargs <= 1 + XFASTINT (target_idx))
9328     error ("Too few arguments for operation `%s'",
9329            SDATA (SYMBOL_NAME (operation)));
9330   target = args[XFASTINT (target_idx) + 1];
9331   if (!(STRINGP (target)
9332         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9333             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9334         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9335     error ("Invalid argument %"pI"d of operation `%s'",
9336            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9337   if (CONSP (target))
9338     target = XCAR (target);
9339
9340   chain = ((EQ (operation, Qinsert_file_contents)
9341             || EQ (operation, Qwrite_region))
9342            ? Vfile_coding_system_alist
9343            : (EQ (operation, Qopen_network_stream)
9344               ? Vnetwork_coding_system_alist
9345               : Vprocess_coding_system_alist));
9346   if (NILP (chain))
9347     return Qnil;
9348
9349   for (; CONSP (chain); chain = XCDR (chain))
9350     {
9351       Lisp_Object elt;
9352
9353       elt = XCAR (chain);
9354       if (CONSP (elt)
9355           && ((STRINGP (target)
9356                && STRINGP (XCAR (elt))
9357                && fast_string_match (XCAR (elt), target) >= 0)
9358               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9359         {
9360           val = XCDR (elt);
9361           /* Here, if VAL is both a valid coding system and a valid
9362              function symbol, we return VAL as a coding system.  */
9363           if (CONSP (val))
9364             return val;
9365           if (! SYMBOLP (val))
9366             return Qnil;
9367           if (! NILP (Fcoding_system_p (val)))
9368             return Fcons (val, val);
9369           if (! NILP (Ffboundp (val)))
9370             {
9371               /* We use call1 rather than safe_call1
9372                  so as to get bug reports about functions called here
9373                  which don't handle the current interface.  */
9374               val = call1 (val, Flist (nargs, args));
9375               if (CONSP (val))
9376                 return val;
9377               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9378                 return Fcons (val, val);
9379             }
9380           return Qnil;
9381         }
9382     }
9383   return Qnil;
9384 }
9385
9386 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9387        Sset_coding_system_priority, 0, MANY, 0,
9388        doc: /* Assign higher priority to the coding systems given as arguments.
9389 If multiple coding systems belong to the same category,
9390 all but the first one are ignored.
9391
9392 usage: (set-coding-system-priority &rest coding-systems)  */)
9393   (ptrdiff_t nargs, Lisp_Object *args)
9394 {
9395   ptrdiff_t i, j;
9396   bool changed[coding_category_max];
9397   enum coding_category priorities[coding_category_max];
9398
9399   memset (changed, 0, sizeof changed);
9400
9401   for (i = j = 0; i < nargs; i++)
9402     {
9403       enum coding_category category;
9404       Lisp_Object spec, attrs;
9405
9406       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9407       attrs = AREF (spec, 0);
9408       category = XINT (CODING_ATTR_CATEGORY (attrs));
9409       if (changed[category])
9410         /* Ignore this coding system because a coding system of the
9411            same category already had a higher priority.  */
9412         continue;
9413       changed[category] = 1;
9414       priorities[j++] = category;
9415       if (coding_categories[category].id >= 0
9416           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9417         setup_coding_system (args[i], &coding_categories[category]);
9418       Fset (AREF (Vcoding_category_table, category), args[i]);
9419     }
9420
9421   /* Now we have decided top J priorities.  Reflect the order of the
9422      original priorities to the remaining priorities.  */
9423
9424   for (i = j, j = 0; i < coding_category_max; i++, j++)
9425     {
9426       while (j < coding_category_max
9427              && changed[coding_priorities[j]])
9428         j++;
9429       if (j == coding_category_max)
9430         emacs_abort ();
9431       priorities[i] = coding_priorities[j];
9432     }
9433
9434   memcpy (coding_priorities, priorities, sizeof priorities);
9435
9436   /* Update `coding-category-list'.  */
9437   Vcoding_category_list = Qnil;
9438   for (i = coding_category_max; i-- > 0; )
9439     Vcoding_category_list
9440       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9441                Vcoding_category_list);
9442
9443   return Qnil;
9444 }
9445
9446 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9447        Scoding_system_priority_list, 0, 1, 0,
9448        doc: /* Return a list of coding systems ordered by their priorities.
9449 The list contains a subset of coding systems; i.e. coding systems
9450 assigned to each coding category (see `coding-category-list').
9451
9452 HIGHESTP non-nil means just return the highest priority one.  */)
9453   (Lisp_Object highestp)
9454 {
9455   int i;
9456   Lisp_Object val;
9457
9458   for (i = 0, val = Qnil; i < coding_category_max; i++)
9459     {
9460       enum coding_category category = coding_priorities[i];
9461       int id = coding_categories[category].id;
9462       Lisp_Object attrs;
9463
9464       if (id < 0)
9465         continue;
9466       attrs = CODING_ID_ATTRS (id);
9467       if (! NILP (highestp))
9468         return CODING_ATTR_BASE_NAME (attrs);
9469       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9470     }
9471   return Fnreverse (val);
9472 }
9473
9474 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9475
9476 static Lisp_Object
9477 make_subsidiaries (Lisp_Object base)
9478 {
9479   Lisp_Object subsidiaries;
9480   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9481   char *buf = alloca (base_name_len + 6);
9482   int i;
9483
9484   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9485   subsidiaries = Fmake_vector (make_number (3), Qnil);
9486   for (i = 0; i < 3; i++)
9487     {
9488       strcpy (buf + base_name_len, suffixes[i]);
9489       ASET (subsidiaries, i, intern (buf));
9490     }
9491   return subsidiaries;
9492 }
9493
9494
9495 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9496        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9497        doc: /* For internal use only.
9498 usage: (define-coding-system-internal ...)  */)
9499   (ptrdiff_t nargs, Lisp_Object *args)
9500 {
9501   Lisp_Object name;
9502   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9503   Lisp_Object attrs;            /* Vector of attributes.  */
9504   Lisp_Object eol_type;
9505   Lisp_Object aliases;
9506   Lisp_Object coding_type, charset_list, safe_charsets;
9507   enum coding_category category;
9508   Lisp_Object tail, val;
9509   int max_charset_id = 0;
9510   int i;
9511
9512   if (nargs < coding_arg_max)
9513     goto short_args;
9514
9515   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9516
9517   name = args[coding_arg_name];
9518   CHECK_SYMBOL (name);
9519   ASET (attrs, coding_attr_base_name, name);
9520
9521   val = args[coding_arg_mnemonic];
9522   if (! STRINGP (val))
9523     CHECK_CHARACTER (val);
9524   ASET (attrs, coding_attr_mnemonic, val);
9525
9526   coding_type = args[coding_arg_coding_type];
9527   CHECK_SYMBOL (coding_type);
9528   ASET (attrs, coding_attr_type, coding_type);
9529
9530   charset_list = args[coding_arg_charset_list];
9531   if (SYMBOLP (charset_list))
9532     {
9533       if (EQ (charset_list, Qiso_2022))
9534         {
9535           if (! EQ (coding_type, Qiso_2022))
9536             error ("Invalid charset-list");
9537           charset_list = Viso_2022_charset_list;
9538         }
9539       else if (EQ (charset_list, Qemacs_mule))
9540         {
9541           if (! EQ (coding_type, Qemacs_mule))
9542             error ("Invalid charset-list");
9543           charset_list = Vemacs_mule_charset_list;
9544         }
9545       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9546         {
9547           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9548             error ("Invalid charset-list");
9549           if (max_charset_id < XFASTINT (XCAR (tail)))
9550             max_charset_id = XFASTINT (XCAR (tail));
9551         }
9552     }
9553   else
9554     {
9555       charset_list = Fcopy_sequence (charset_list);
9556       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9557         {
9558           struct charset *charset;
9559
9560           val = XCAR (tail);
9561           CHECK_CHARSET_GET_CHARSET (val, charset);
9562           if (EQ (coding_type, Qiso_2022)
9563               ? CHARSET_ISO_FINAL (charset) < 0
9564               : EQ (coding_type, Qemacs_mule)
9565               ? CHARSET_EMACS_MULE_ID (charset) < 0
9566               : 0)
9567             error ("Can't handle charset `%s'",
9568                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9569
9570           XSETCAR (tail, make_number (charset->id));
9571           if (max_charset_id < charset->id)
9572             max_charset_id = charset->id;
9573         }
9574     }
9575   ASET (attrs, coding_attr_charset_list, charset_list);
9576
9577   safe_charsets = make_uninit_string (max_charset_id + 1);
9578   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9579   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9580     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9581   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9582
9583   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9584
9585   val = args[coding_arg_decode_translation_table];
9586   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9587     CHECK_SYMBOL (val);
9588   ASET (attrs, coding_attr_decode_tbl, val);
9589
9590   val = args[coding_arg_encode_translation_table];
9591   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9592     CHECK_SYMBOL (val);
9593   ASET (attrs, coding_attr_encode_tbl, val);
9594
9595   val = args[coding_arg_post_read_conversion];
9596   CHECK_SYMBOL (val);
9597   ASET (attrs, coding_attr_post_read, val);
9598
9599   val = args[coding_arg_pre_write_conversion];
9600   CHECK_SYMBOL (val);
9601   ASET (attrs, coding_attr_pre_write, val);
9602
9603   val = args[coding_arg_default_char];
9604   if (NILP (val))
9605     ASET (attrs, coding_attr_default_char, make_number (' '));
9606   else
9607     {
9608       CHECK_CHARACTER (val);
9609       ASET (attrs, coding_attr_default_char, val);
9610     }
9611
9612   val = args[coding_arg_for_unibyte];
9613   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9614
9615   val = args[coding_arg_plist];
9616   CHECK_LIST (val);
9617   ASET (attrs, coding_attr_plist, val);
9618
9619   if (EQ (coding_type, Qcharset))
9620     {
9621       /* Generate a lisp vector of 256 elements.  Each element is nil,
9622          integer, or a list of charset IDs.
9623
9624          If Nth element is nil, the byte code N is invalid in this
9625          coding system.
9626
9627          If Nth element is a number NUM, N is the first byte of a
9628          charset whose ID is NUM.
9629
9630          If Nth element is a list of charset IDs, N is the first byte
9631          of one of them.  The list is sorted by dimensions of the
9632          charsets.  A charset of smaller dimension comes first. */
9633       val = Fmake_vector (make_number (256), Qnil);
9634
9635       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9636         {
9637           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9638           int dim = CHARSET_DIMENSION (charset);
9639           int idx = (dim - 1) * 4;
9640
9641           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9642             ASET (attrs, coding_attr_ascii_compat, Qt);
9643
9644           for (i = charset->code_space[idx];
9645                i <= charset->code_space[idx + 1]; i++)
9646             {
9647               Lisp_Object tmp, tmp2;
9648               int dim2;
9649
9650               tmp = AREF (val, i);
9651               if (NILP (tmp))
9652                 tmp = XCAR (tail);
9653               else if (NUMBERP (tmp))
9654                 {
9655                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9656                   if (dim < dim2)
9657                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9658                   else
9659                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9660                 }
9661               else
9662                 {
9663                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9664                     {
9665                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9666                       if (dim < dim2)
9667                         break;
9668                     }
9669                   if (NILP (tmp2))
9670                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9671                   else
9672                     {
9673                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9674                       XSETCAR (tmp2, XCAR (tail));
9675                     }
9676                 }
9677               ASET (val, i, tmp);
9678             }
9679         }
9680       ASET (attrs, coding_attr_charset_valids, val);
9681       category = coding_category_charset;
9682     }
9683   else if (EQ (coding_type, Qccl))
9684     {
9685       Lisp_Object valids;
9686
9687       if (nargs < coding_arg_ccl_max)
9688         goto short_args;
9689
9690       val = args[coding_arg_ccl_decoder];
9691       CHECK_CCL_PROGRAM (val);
9692       if (VECTORP (val))
9693         val = Fcopy_sequence (val);
9694       ASET (attrs, coding_attr_ccl_decoder, val);
9695
9696       val = args[coding_arg_ccl_encoder];
9697       CHECK_CCL_PROGRAM (val);
9698       if (VECTORP (val))
9699         val = Fcopy_sequence (val);
9700       ASET (attrs, coding_attr_ccl_encoder, val);
9701
9702       val = args[coding_arg_ccl_valids];
9703       valids = Fmake_string (make_number (256), make_number (0));
9704       for (tail = val; CONSP (tail); tail = XCDR (tail))
9705         {
9706           int from, to;
9707
9708           val = XCAR (tail);
9709           if (INTEGERP (val))
9710             {
9711               if (! (0 <= XINT (val) && XINT (val) <= 255))
9712                 args_out_of_range_3 (val, make_number (0), make_number (255));
9713               from = to = XINT (val);
9714             }
9715           else
9716             {
9717               CHECK_CONS (val);
9718               CHECK_NATNUM_CAR (val);
9719               CHECK_NUMBER_CDR (val);
9720               if (XINT (XCAR (val)) > 255)
9721                 args_out_of_range_3 (XCAR (val),
9722                                      make_number (0), make_number (255));
9723               from = XINT (XCAR (val));
9724               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9725                 args_out_of_range_3 (XCDR (val),
9726                                      XCAR (val), make_number (255));
9727               to = XINT (XCDR (val));
9728             }
9729           for (i = from; i <= to; i++)
9730             SSET (valids, i, 1);
9731         }
9732       ASET (attrs, coding_attr_ccl_valids, valids);
9733
9734       category = coding_category_ccl;
9735     }
9736   else if (EQ (coding_type, Qutf_16))
9737     {
9738       Lisp_Object bom, endian;
9739
9740       ASET (attrs, coding_attr_ascii_compat, Qnil);
9741
9742       if (nargs < coding_arg_utf16_max)
9743         goto short_args;
9744
9745       bom = args[coding_arg_utf16_bom];
9746       if (! NILP (bom) && ! EQ (bom, Qt))
9747         {
9748           CHECK_CONS (bom);
9749           val = XCAR (bom);
9750           CHECK_CODING_SYSTEM (val);
9751           val = XCDR (bom);
9752           CHECK_CODING_SYSTEM (val);
9753         }
9754       ASET (attrs, coding_attr_utf_bom, bom);
9755
9756       endian = args[coding_arg_utf16_endian];
9757       CHECK_SYMBOL (endian);
9758       if (NILP (endian))
9759         endian = Qbig;
9760       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9761         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9762       ASET (attrs, coding_attr_utf_16_endian, endian);
9763
9764       category = (CONSP (bom)
9765                   ? coding_category_utf_16_auto
9766                   : NILP (bom)
9767                   ? (EQ (endian, Qbig)
9768                      ? coding_category_utf_16_be_nosig
9769                      : coding_category_utf_16_le_nosig)
9770                   : (EQ (endian, Qbig)
9771                      ? coding_category_utf_16_be
9772                      : coding_category_utf_16_le));
9773     }
9774   else if (EQ (coding_type, Qiso_2022))
9775     {
9776       Lisp_Object initial, reg_usage, request, flags;
9777
9778       if (nargs < coding_arg_iso2022_max)
9779         goto short_args;
9780
9781       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9782       CHECK_VECTOR (initial);
9783       for (i = 0; i < 4; i++)
9784         {
9785           val = Faref (initial, make_number (i));
9786           if (! NILP (val))
9787             {
9788               struct charset *charset;
9789
9790               CHECK_CHARSET_GET_CHARSET (val, charset);
9791               ASET (initial, i, make_number (CHARSET_ID (charset)));
9792               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9793                 ASET (attrs, coding_attr_ascii_compat, Qt);
9794             }
9795           else
9796             ASET (initial, i, make_number (-1));
9797         }
9798
9799       reg_usage = args[coding_arg_iso2022_reg_usage];
9800       CHECK_CONS (reg_usage);
9801       CHECK_NUMBER_CAR (reg_usage);
9802       CHECK_NUMBER_CDR (reg_usage);
9803
9804       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9805       for (tail = request; CONSP (tail); tail = XCDR (tail))
9806         {
9807           int id;
9808           Lisp_Object tmp1;
9809
9810           val = XCAR (tail);
9811           CHECK_CONS (val);
9812           tmp1 = XCAR (val);
9813           CHECK_CHARSET_GET_ID (tmp1, id);
9814           CHECK_NATNUM_CDR (val);
9815           if (XINT (XCDR (val)) >= 4)
9816             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9817           XSETCAR (val, make_number (id));
9818         }
9819
9820       flags = args[coding_arg_iso2022_flags];
9821       CHECK_NATNUM (flags);
9822       i = XINT (flags) & INT_MAX;
9823       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9824         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9825       flags = make_number (i);
9826
9827       ASET (attrs, coding_attr_iso_initial, initial);
9828       ASET (attrs, coding_attr_iso_usage, reg_usage);
9829       ASET (attrs, coding_attr_iso_request, request);
9830       ASET (attrs, coding_attr_iso_flags, flags);
9831       setup_iso_safe_charsets (attrs);
9832
9833       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9834         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9835                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9836                     ? coding_category_iso_7_else
9837                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9838                     ? coding_category_iso_7
9839                     : coding_category_iso_7_tight);
9840       else
9841         {
9842           int id = XINT (AREF (initial, 1));
9843
9844           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9845                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9846                        || id < 0)
9847                       ? coding_category_iso_8_else
9848                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9849                       ? coding_category_iso_8_1
9850                       : coding_category_iso_8_2);
9851         }
9852       if (category != coding_category_iso_8_1
9853           && category != coding_category_iso_8_2)
9854         ASET (attrs, coding_attr_ascii_compat, Qnil);
9855     }
9856   else if (EQ (coding_type, Qemacs_mule))
9857     {
9858       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9859         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9860       ASET (attrs, coding_attr_ascii_compat, Qt);
9861       category = coding_category_emacs_mule;
9862     }
9863   else if (EQ (coding_type, Qshift_jis))
9864     {
9865
9866       struct charset *charset;
9867
9868       if (XINT (Flength (charset_list)) != 3
9869           && XINT (Flength (charset_list)) != 4)
9870         error ("There should be three or four charsets");
9871
9872       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9873       if (CHARSET_DIMENSION (charset) != 1)
9874         error ("Dimension of charset %s is not one",
9875                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9876       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9877         ASET (attrs, coding_attr_ascii_compat, Qt);
9878
9879       charset_list = XCDR (charset_list);
9880       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9881       if (CHARSET_DIMENSION (charset) != 1)
9882         error ("Dimension of charset %s is not one",
9883                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9884
9885       charset_list = XCDR (charset_list);
9886       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9887       if (CHARSET_DIMENSION (charset) != 2)
9888         error ("Dimension of charset %s is not two",
9889                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9890
9891       charset_list = XCDR (charset_list);
9892       if (! NILP (charset_list))
9893         {
9894           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9895           if (CHARSET_DIMENSION (charset) != 2)
9896             error ("Dimension of charset %s is not two",
9897                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9898         }
9899
9900       category = coding_category_sjis;
9901       Vsjis_coding_system = name;
9902     }
9903   else if (EQ (coding_type, Qbig5))
9904     {
9905       struct charset *charset;
9906
9907       if (XINT (Flength (charset_list)) != 2)
9908         error ("There should be just two charsets");
9909
9910       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9911       if (CHARSET_DIMENSION (charset) != 1)
9912         error ("Dimension of charset %s is not one",
9913                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9914       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9915         ASET (attrs, coding_attr_ascii_compat, Qt);
9916
9917       charset_list = XCDR (charset_list);
9918       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9919       if (CHARSET_DIMENSION (charset) != 2)
9920         error ("Dimension of charset %s is not two",
9921                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9922
9923       category = coding_category_big5;
9924       Vbig5_coding_system = name;
9925     }
9926   else if (EQ (coding_type, Qraw_text))
9927     {
9928       category = coding_category_raw_text;
9929       ASET (attrs, coding_attr_ascii_compat, Qt);
9930     }
9931   else if (EQ (coding_type, Qutf_8))
9932     {
9933       Lisp_Object bom;
9934
9935       if (nargs < coding_arg_utf8_max)
9936         goto short_args;
9937
9938       bom = args[coding_arg_utf8_bom];
9939       if (! NILP (bom) && ! EQ (bom, Qt))
9940         {
9941           CHECK_CONS (bom);
9942           val = XCAR (bom);
9943           CHECK_CODING_SYSTEM (val);
9944           val = XCDR (bom);
9945           CHECK_CODING_SYSTEM (val);
9946         }
9947       ASET (attrs, coding_attr_utf_bom, bom);
9948       if (NILP (bom))
9949         ASET (attrs, coding_attr_ascii_compat, Qt);
9950
9951       category = (CONSP (bom) ? coding_category_utf_8_auto
9952                   : NILP (bom) ? coding_category_utf_8_nosig
9953                   : coding_category_utf_8_sig);
9954     }
9955   else if (EQ (coding_type, Qundecided))
9956     category = coding_category_undecided;
9957   else
9958     error ("Invalid coding system type: %s",
9959            SDATA (SYMBOL_NAME (coding_type)));
9960
9961   ASET (attrs, coding_attr_category, make_number (category));
9962   ASET (attrs, coding_attr_plist,
9963         Fcons (QCcategory,
9964                Fcons (AREF (Vcoding_category_table, category),
9965                       CODING_ATTR_PLIST (attrs))));
9966   ASET (attrs, coding_attr_plist,
9967         Fcons (QCascii_compatible_p,
9968                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9969                       CODING_ATTR_PLIST (attrs))));
9970
9971   eol_type = args[coding_arg_eol_type];
9972   if (! NILP (eol_type)
9973       && ! EQ (eol_type, Qunix)
9974       && ! EQ (eol_type, Qdos)
9975       && ! EQ (eol_type, Qmac))
9976     error ("Invalid eol-type");
9977
9978   aliases = Fcons (name, Qnil);
9979
9980   if (NILP (eol_type))
9981     {
9982       eol_type = make_subsidiaries (name);
9983       for (i = 0; i < 3; i++)
9984         {
9985           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9986
9987           this_name = AREF (eol_type, i);
9988           this_aliases = Fcons (this_name, Qnil);
9989           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9990           this_spec = Fmake_vector (make_number (3), attrs);
9991           ASET (this_spec, 1, this_aliases);
9992           ASET (this_spec, 2, this_eol_type);
9993           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9994           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9995           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9996           if (NILP (val))
9997             Vcoding_system_alist
9998               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9999                        Vcoding_system_alist);
10000         }
10001     }
10002
10003   spec_vec = Fmake_vector (make_number (3), attrs);
10004   ASET (spec_vec, 1, aliases);
10005   ASET (spec_vec, 2, eol_type);
10006
10007   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10008   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10009   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10010   if (NILP (val))
10011     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10012                                   Vcoding_system_alist);
10013
10014   {
10015     int id = coding_categories[category].id;
10016
10017     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10018       setup_coding_system (name, &coding_categories[category]);
10019   }
10020
10021   return Qnil;
10022
10023  short_args:
10024   return Fsignal (Qwrong_number_of_arguments,
10025                   Fcons (intern ("define-coding-system-internal"),
10026                          make_number (nargs)));
10027 }
10028
10029
10030 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10031        3, 3, 0,
10032        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10033   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10034 {
10035   Lisp_Object spec, attrs;
10036
10037   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10038   attrs = AREF (spec, 0);
10039   if (EQ (prop, QCmnemonic))
10040     {
10041       if (! STRINGP (val))
10042         CHECK_CHARACTER (val);
10043       ASET (attrs, coding_attr_mnemonic, val);
10044     }
10045   else if (EQ (prop, QCdefault_char))
10046     {
10047       if (NILP (val))
10048         val = make_number (' ');
10049       else
10050         CHECK_CHARACTER (val);
10051       ASET (attrs, coding_attr_default_char, val);
10052     }
10053   else if (EQ (prop, QCdecode_translation_table))
10054     {
10055       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10056         CHECK_SYMBOL (val);
10057       ASET (attrs, coding_attr_decode_tbl, val);
10058     }
10059   else if (EQ (prop, QCencode_translation_table))
10060     {
10061       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10062         CHECK_SYMBOL (val);
10063       ASET (attrs, coding_attr_encode_tbl, val);
10064     }
10065   else if (EQ (prop, QCpost_read_conversion))
10066     {
10067       CHECK_SYMBOL (val);
10068       ASET (attrs, coding_attr_post_read, val);
10069     }
10070   else if (EQ (prop, QCpre_write_conversion))
10071     {
10072       CHECK_SYMBOL (val);
10073       ASET (attrs, coding_attr_pre_write, val);
10074     }
10075   else if (EQ (prop, QCascii_compatible_p))
10076     {
10077       ASET (attrs, coding_attr_ascii_compat, val);
10078     }
10079
10080   ASET (attrs, coding_attr_plist,
10081         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10082   return val;
10083 }
10084
10085
10086 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10087        Sdefine_coding_system_alias, 2, 2, 0,
10088        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10089   (Lisp_Object alias, Lisp_Object coding_system)
10090 {
10091   Lisp_Object spec, aliases, eol_type, val;
10092
10093   CHECK_SYMBOL (alias);
10094   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10095   aliases = AREF (spec, 1);
10096   /* ALIASES should be a list of length more than zero, and the first
10097      element is a base coding system.  Append ALIAS at the tail of the
10098      list.  */
10099   while (!NILP (XCDR (aliases)))
10100     aliases = XCDR (aliases);
10101   XSETCDR (aliases, Fcons (alias, Qnil));
10102
10103   eol_type = AREF (spec, 2);
10104   if (VECTORP (eol_type))
10105     {
10106       Lisp_Object subsidiaries;
10107       int i;
10108
10109       subsidiaries = make_subsidiaries (alias);
10110       for (i = 0; i < 3; i++)
10111         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10112                                      AREF (eol_type, i));
10113     }
10114
10115   Fputhash (alias, spec, Vcoding_system_hash_table);
10116   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10117   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10118   if (NILP (val))
10119     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10120                                   Vcoding_system_alist);
10121
10122   return Qnil;
10123 }
10124
10125 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10126        1, 1, 0,
10127        doc: /* Return the base of CODING-SYSTEM.
10128 Any alias or subsidiary coding system is not a base coding system.  */)
10129   (Lisp_Object coding_system)
10130 {
10131   Lisp_Object spec, attrs;
10132
10133   if (NILP (coding_system))
10134     return (Qno_conversion);
10135   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10136   attrs = AREF (spec, 0);
10137   return CODING_ATTR_BASE_NAME (attrs);
10138 }
10139
10140 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10141        1, 1, 0,
10142        doc: "Return the property list of CODING-SYSTEM.")
10143   (Lisp_Object coding_system)
10144 {
10145   Lisp_Object spec, attrs;
10146
10147   if (NILP (coding_system))
10148     coding_system = Qno_conversion;
10149   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10150   attrs = AREF (spec, 0);
10151   return CODING_ATTR_PLIST (attrs);
10152 }
10153
10154
10155 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10156        1, 1, 0,
10157        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10158   (Lisp_Object coding_system)
10159 {
10160   Lisp_Object spec;
10161
10162   if (NILP (coding_system))
10163     coding_system = Qno_conversion;
10164   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10165   return AREF (spec, 1);
10166 }
10167
10168 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10169        Scoding_system_eol_type, 1, 1, 0,
10170        doc: /* Return eol-type of CODING-SYSTEM.
10171 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10172
10173 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10174 and CR respectively.
10175
10176 A vector value indicates that a format of end-of-line should be
10177 detected automatically.  Nth element of the vector is the subsidiary
10178 coding system whose eol-type is N.  */)
10179   (Lisp_Object coding_system)
10180 {
10181   Lisp_Object spec, eol_type;
10182   int n;
10183
10184   if (NILP (coding_system))
10185     coding_system = Qno_conversion;
10186   if (! CODING_SYSTEM_P (coding_system))
10187     return Qnil;
10188   spec = CODING_SYSTEM_SPEC (coding_system);
10189   eol_type = AREF (spec, 2);
10190   if (VECTORP (eol_type))
10191     return Fcopy_sequence (eol_type);
10192   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10193   return make_number (n);
10194 }
10195
10196 #endif /* emacs */
10197
10198 \f
/*** 12. Postamble ***/
10200
10201 void
10202 init_coding_once (void)
10203 {
10204   int i;
10205
10206   for (i = 0; i < coding_category_max; i++)
10207     {
10208       coding_categories[i].id = -1;
10209       coding_priorities[i] = i;
10210     }
10211
10212   /* ISO2022 specific initialize routine.  */
10213   for (i = 0; i < 0x20; i++)
10214     iso_code_class[i] = ISO_control_0;
10215   for (i = 0x21; i < 0x7F; i++)
10216     iso_code_class[i] = ISO_graphic_plane_0;
10217   for (i = 0x80; i < 0xA0; i++)
10218     iso_code_class[i] = ISO_control_1;
10219   for (i = 0xA1; i < 0xFF; i++)
10220     iso_code_class[i] = ISO_graphic_plane_1;
10221   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10222   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10223   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10224   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10225   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10226   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10227   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10228   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10229   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10230
10231   for (i = 0; i < 256; i++)
10232     {
10233       emacs_mule_bytes[i] = 1;
10234     }
10235   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10236   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10237   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10238   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10239 }
10240
10241 #ifdef emacs
10242
10243 void
10244 syms_of_coding (void)
10245 {
10246   staticpro (&Vcoding_system_hash_table);
10247   {
10248     Lisp_Object args[2];
10249     args[0] = QCtest;
10250     args[1] = Qeq;
10251     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10252   }
10253
10254   staticpro (&Vsjis_coding_system);
10255   Vsjis_coding_system = Qnil;
10256
10257   staticpro (&Vbig5_coding_system);
10258   Vbig5_coding_system = Qnil;
10259
10260   staticpro (&Vcode_conversion_reused_workbuf);
10261   Vcode_conversion_reused_workbuf = Qnil;
10262
10263   staticpro (&Vcode_conversion_workbuf_name);
10264   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10265
10266   reused_workbuf_in_use = 0;
10267
10268   DEFSYM (Qcharset, "charset");
10269   DEFSYM (Qtarget_idx, "target-idx");
10270   DEFSYM (Qcoding_system_history, "coding-system-history");
10271   Fset (Qcoding_system_history, Qnil);
10272
10273   /* Target FILENAME is the first argument.  */
10274   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10275   /* Target FILENAME is the third argument.  */
10276   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10277
10278   DEFSYM (Qcall_process, "call-process");
10279   /* Target PROGRAM is the first argument.  */
10280   Fput (Qcall_process, Qtarget_idx, make_number (0));
10281
10282   DEFSYM (Qcall_process_region, "call-process-region");
10283   /* Target PROGRAM is the third argument.  */
10284   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10285
10286   DEFSYM (Qstart_process, "start-process");
10287   /* Target PROGRAM is the third argument.  */
10288   Fput (Qstart_process, Qtarget_idx, make_number (2));
10289
10290   DEFSYM (Qopen_network_stream, "open-network-stream");
10291   /* Target SERVICE is the fourth argument.  */
10292   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10293
10294   DEFSYM (Qcoding_system, "coding-system");
10295   DEFSYM (Qcoding_aliases, "coding-aliases");
10296
10297   DEFSYM (Qeol_type, "eol-type");
10298   DEFSYM (Qunix, "unix");
10299   DEFSYM (Qdos, "dos");
10300
10301   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10302   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10303   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10304   DEFSYM (Qdefault_char, "default-char");
10305   DEFSYM (Qundecided, "undecided");
10306   DEFSYM (Qno_conversion, "no-conversion");
10307   DEFSYM (Qraw_text, "raw-text");
10308
10309   DEFSYM (Qiso_2022, "iso-2022");
10310
10311   DEFSYM (Qutf_8, "utf-8");
10312   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10313
10314   DEFSYM (Qutf_16, "utf-16");
10315   DEFSYM (Qbig, "big");
10316   DEFSYM (Qlittle, "little");
10317
10318   DEFSYM (Qshift_jis, "shift-jis");
10319   DEFSYM (Qbig5, "big5");
10320
10321   DEFSYM (Qcoding_system_p, "coding-system-p");
10322
10323   DEFSYM (Qcoding_system_error, "coding-system-error");
10324   Fput (Qcoding_system_error, Qerror_conditions,
10325         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10326   Fput (Qcoding_system_error, Qerror_message,
10327         build_pure_c_string ("Invalid coding system"));
10328
10329   /* Intern this now in case it isn't already done.
10330      Setting this variable twice is harmless.
10331      But don't staticpro it here--that is done in alloc.c.  */
10332   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10333
10334   DEFSYM (Qtranslation_table, "translation-table");
10335   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10336   DEFSYM (Qtranslation_table_id, "translation-table-id");
10337   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10338   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10339
10340   DEFSYM (Qvalid_codes, "valid-codes");
10341
10342   DEFSYM (Qemacs_mule, "emacs-mule");
10343
10344   DEFSYM (QCcategory, ":category");
10345   DEFSYM (QCmnemonic, ":mnemonic");
10346   DEFSYM (QCdefault_char, ":default-char");
10347   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10348   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10349   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10350   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10351   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10352
10353   Vcoding_category_table
10354     = Fmake_vector (make_number (coding_category_max), Qnil);
10355   staticpro (&Vcoding_category_table);
10356   /* Followings are target of code detection.  */
10357   ASET (Vcoding_category_table, coding_category_iso_7,
10358         intern_c_string ("coding-category-iso-7"));
10359   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10360         intern_c_string ("coding-category-iso-7-tight"));
10361   ASET (Vcoding_category_table, coding_category_iso_8_1,
10362         intern_c_string ("coding-category-iso-8-1"));
10363   ASET (Vcoding_category_table, coding_category_iso_8_2,
10364         intern_c_string ("coding-category-iso-8-2"));
10365   ASET (Vcoding_category_table, coding_category_iso_7_else,
10366         intern_c_string ("coding-category-iso-7-else"));
10367   ASET (Vcoding_category_table, coding_category_iso_8_else,
10368         intern_c_string ("coding-category-iso-8-else"));
10369   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10370         intern_c_string ("coding-category-utf-8-auto"));
10371   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10372         intern_c_string ("coding-category-utf-8"));
10373   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10374         intern_c_string ("coding-category-utf-8-sig"));
10375   ASET (Vcoding_category_table, coding_category_utf_16_be,
10376         intern_c_string ("coding-category-utf-16-be"));
10377   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10378         intern_c_string ("coding-category-utf-16-auto"));
10379   ASET (Vcoding_category_table, coding_category_utf_16_le,
10380         intern_c_string ("coding-category-utf-16-le"));
10381   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10382         intern_c_string ("coding-category-utf-16-be-nosig"));
10383   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10384         intern_c_string ("coding-category-utf-16-le-nosig"));
10385   ASET (Vcoding_category_table, coding_category_charset,
10386         intern_c_string ("coding-category-charset"));
10387   ASET (Vcoding_category_table, coding_category_sjis,
10388         intern_c_string ("coding-category-sjis"));
10389   ASET (Vcoding_category_table, coding_category_big5,
10390         intern_c_string ("coding-category-big5"));
10391   ASET (Vcoding_category_table, coding_category_ccl,
10392         intern_c_string ("coding-category-ccl"));
10393   ASET (Vcoding_category_table, coding_category_emacs_mule,
10394         intern_c_string ("coding-category-emacs-mule"));
10395   /* Followings are NOT target of code detection.  */
10396   ASET (Vcoding_category_table, coding_category_raw_text,
10397         intern_c_string ("coding-category-raw-text"));
10398   ASET (Vcoding_category_table, coding_category_undecided,
10399         intern_c_string ("coding-category-undecided"));
10400
10401   DEFSYM (Qinsufficient_source, "insufficient-source");
10402   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10403   DEFSYM (Qinvalid_source, "invalid-source");
10404   DEFSYM (Qinterrupted, "interrupted");
10405   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10406   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10407
10408   defsubr (&Scoding_system_p);
10409   defsubr (&Sread_coding_system);
10410   defsubr (&Sread_non_nil_coding_system);
10411   defsubr (&Scheck_coding_system);
10412   defsubr (&Sdetect_coding_region);
10413   defsubr (&Sdetect_coding_string);
10414   defsubr (&Sfind_coding_systems_region_internal);
10415   defsubr (&Sunencodable_char_position);
10416   defsubr (&Scheck_coding_systems_region);
10417   defsubr (&Sdecode_coding_region);
10418   defsubr (&Sencode_coding_region);
10419   defsubr (&Sdecode_coding_string);
10420   defsubr (&Sencode_coding_string);
10421   defsubr (&Sdecode_sjis_char);
10422   defsubr (&Sencode_sjis_char);
10423   defsubr (&Sdecode_big5_char);
10424   defsubr (&Sencode_big5_char);
10425   defsubr (&Sset_terminal_coding_system_internal);
10426   defsubr (&Sset_safe_terminal_coding_system_internal);
10427   defsubr (&Sterminal_coding_system);
10428   defsubr (&Sset_keyboard_coding_system_internal);
10429   defsubr (&Skeyboard_coding_system);
10430   defsubr (&Sfind_operation_coding_system);
10431   defsubr (&Sset_coding_system_priority);
10432   defsubr (&Sdefine_coding_system_internal);
10433   defsubr (&Sdefine_coding_system_alias);
10434   defsubr (&Scoding_system_put);
10435   defsubr (&Scoding_system_base);
10436   defsubr (&Scoding_system_plist);
10437   defsubr (&Scoding_system_aliases);
10438   defsubr (&Scoding_system_eol_type);
10439   defsubr (&Scoding_system_priority_list);
10440
10441   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10442                doc: /* List of coding systems.
10443
10444 Do not alter the value of this variable manually.  This variable should be
10445 updated by the functions `define-coding-system' and
10446 `define-coding-system-alias'.  */);
10447   Vcoding_system_list = Qnil;
10448
10449   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10450                doc: /* Alist of coding system names.
10451 Each element is a one-element list containing a coding system name.
10452 This variable is given to `completing-read' as COLLECTION argument.
10453
10454 Do not alter the value of this variable manually.  This variable should be
10455 updated by the functions `make-coding-system' and
10456 `define-coding-system-alias'.  */);
10457   Vcoding_system_alist = Qnil;
10458
10459   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10460                doc: /* List of coding-categories (symbols) ordered by priority.
10461
10462 On detecting a coding system, Emacs tries code detection algorithms
10463 associated with each coding-category one by one in this order.  When
10464 one algorithm agrees with a byte sequence of source text, the coding
10465 system bound to the corresponding coding-category is selected.
10466
10467 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10468   {
10469     int i;
10470
10471     Vcoding_category_list = Qnil;
10472     for (i = coding_category_max - 1; i >= 0; i--)
10473       Vcoding_category_list
10474         = Fcons (AREF (Vcoding_category_table, i),
10475                  Vcoding_category_list);
10476   }
10477
10478   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10479                doc: /* Specify the coding system for read operations.
10480 It is useful to bind this variable with `let', but do not set it globally.
10481 If the value is a coding system, it is used for decoding on read operation.
10482 If not, an appropriate element is used from one of the coding system alists.
10483 There are three such tables: `file-coding-system-alist',
10484 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10485   Vcoding_system_for_read = Qnil;
10486
10487   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10488                doc: /* Specify the coding system for write operations.
10489 Programs bind this variable with `let', but you should not set it globally.
10490 If the value is a coding system, it is used for encoding of output,
10491 when writing it to a file and when sending it to a file or subprocess.
10492
10493 If this does not specify a coding system, an appropriate element
10494 is used from one of the coding system alists.
10495 There are three such tables: `file-coding-system-alist',
10496 `process-coding-system-alist', and `network-coding-system-alist'.
10497 For output to files, if the above procedure does not specify a coding system,
10498 the value of `buffer-file-coding-system' is used.  */);
10499   Vcoding_system_for_write = Qnil;
10500
10501   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10502                doc: /*
10503 Coding system used in the latest file or process I/O.  */);
10504   Vlast_coding_system_used = Qnil;
10505
10506   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10507                doc: /*
10508 Error status of the last code conversion.
10509
10510 When an error was detected in the last code conversion, this variable
10511 is set to one of the following symbols.
10512   `insufficient-source'
10513   `inconsistent-eol'
10514   `invalid-source'
10515   `interrupted'
10516   `insufficient-memory'
10517 When no error was detected, the value doesn't change.  So, to check
10518 the error status of a code conversion by this variable, you must
10519 explicitly set this variable to nil before performing code
10520 conversion.  */);
10521   Vlast_code_conversion_error = Qnil;
10522
10523   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10524                doc: /*
10525 *Non-nil means always inhibit code conversion of end-of-line format.
10526 See info node `Coding Systems' and info node `Text and Binary' concerning
10527 such conversion.  */);
10528   inhibit_eol_conversion = 0;
10529
10530   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10531                doc: /*
10532 Non-nil means process buffer inherits coding system of process output.
10533 Bind it to t if the process output is to be treated as if it were a file
10534 read from some filesystem.  */);
10535   inherit_process_coding_system = 0;
10536
10537   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10538                doc: /*
10539 Alist to decide a coding system to use for a file I/O operation.
10540 The format is ((PATTERN . VAL) ...),
10541 where PATTERN is a regular expression matching a file name,
10542 VAL is a coding system, a cons of coding systems, or a function symbol.
10543 If VAL is a coding system, it is used for both decoding and encoding
10544 the file contents.
10545 If VAL is a cons of coding systems, the car part is used for decoding,
10546 and the cdr part is used for encoding.
10547 If VAL is a function symbol, the function must return a coding system
10548 or a cons of coding systems which are used as above.  The function is
10549 called with an argument that is a list of the arguments with which
10550 `find-operation-coding-system' was called.  If the function can't decide
10551 a coding system, it can return `undecided' so that the normal
10552 code-detection is performed.
10553
10554 See also the function `find-operation-coding-system'
10555 and the variable `auto-coding-alist'.  */);
10556   Vfile_coding_system_alist = Qnil;
10557
10558   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10559                doc: /*
10560 Alist to decide a coding system to use for a process I/O operation.
10561 The format is ((PATTERN . VAL) ...),
10562 where PATTERN is a regular expression matching a program name,
10563 VAL is a coding system, a cons of coding systems, or a function symbol.
10564 If VAL is a coding system, it is used for both decoding what received
10565 from the program and encoding what sent to the program.
10566 If VAL is a cons of coding systems, the car part is used for decoding,
10567 and the cdr part is used for encoding.
10568 If VAL is a function symbol, the function must return a coding system
10569 or a cons of coding systems which are used as above.
10570
10571 See also the function `find-operation-coding-system'.  */);
10572   Vprocess_coding_system_alist = Qnil;
10573
10574   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10575                doc: /*
10576 Alist to decide a coding system to use for a network I/O operation.
10577 The format is ((PATTERN . VAL) ...),
10578 where PATTERN is a regular expression matching a network service name
10579 or is a port number to connect to,
10580 VAL is a coding system, a cons of coding systems, or a function symbol.
10581 If VAL is a coding system, it is used for both decoding what received
10582 from the network stream and encoding what sent to the network stream.
10583 If VAL is a cons of coding systems, the car part is used for decoding,
10584 and the cdr part is used for encoding.
10585 If VAL is a function symbol, the function must return a coding system
10586 or a cons of coding systems which are used as above.
10587
10588 See also the function `find-operation-coding-system'.  */);
10589   Vnetwork_coding_system_alist = Qnil;
10590
10591   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10592                doc: /* Coding system to use with system messages.
10593 Also used for decoding keyboard input on X Window system.  */);
10594   Vlocale_coding_system = Qnil;
10595
10596   /* The eol mnemonics are reset in startup.el system-dependently.  */
10597   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10598                doc: /*
10599 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10600   eol_mnemonic_unix = build_pure_c_string (":");
10601
10602   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10603                doc: /*
10604 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10605   eol_mnemonic_dos = build_pure_c_string ("\\");
10606
10607   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10608                doc: /*
10609 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10610   eol_mnemonic_mac = build_pure_c_string ("/");
10611
10612   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10613                doc: /*
10614 *String displayed in mode line when end-of-line format is not yet determined.  */);
10615   eol_mnemonic_undecided = build_pure_c_string (":");
10616
10617   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10618                doc: /*
10619 *Non-nil enables character translation while encoding and decoding.  */);
10620   Venable_character_translation = Qt;
10621
10622   DEFVAR_LISP ("standard-translation-table-for-decode",
10623                Vstandard_translation_table_for_decode,
10624                doc: /* Table for translating characters while decoding.  */);
10625   Vstandard_translation_table_for_decode = Qnil;
10626
10627   DEFVAR_LISP ("standard-translation-table-for-encode",
10628                Vstandard_translation_table_for_encode,
10629                doc: /* Table for translating characters while encoding.  */);
10630   Vstandard_translation_table_for_encode = Qnil;
10631
10632   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10633                doc: /* Alist of charsets vs revision numbers.
10634 While encoding, if a charset (car part of an element) is found,
10635 designate it with the escape sequence identifying revision (cdr part
10636 of the element).  */);
10637   Vcharset_revision_table = Qnil;
10638
10639   DEFVAR_LISP ("default-process-coding-system",
10640                Vdefault_process_coding_system,
10641                doc: /* Cons of coding systems used for process I/O by default.
10642 The car part is used for decoding a process output,
10643 the cdr part is used for encoding a text to be sent to a process.  */);
10644   Vdefault_process_coding_system = Qnil;
10645
10646   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10647                doc: /*
10648 Table of extra Latin codes in the range 128..159 (inclusive).
10649 This is a vector of length 256.
10650 If Nth element is non-nil, the existence of code N in a file
10651 \(or output of subprocess) doesn't prevent it from being detected as
10652 a coding system of ISO 2022 variant which has a flag
10653 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10654 or reading output of a subprocess.
10655 Only 128th through 159th elements have a meaning.  */);
10656   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10657
10658   DEFVAR_LISP ("select-safe-coding-system-function",
10659                Vselect_safe_coding_system_function,
10660                doc: /*
10661 Function to call to select safe coding system for encoding a text.
10662
10663 If set, this function is called to force a user to select a proper
10664 coding system which can encode the text in the case that a default
10665 coding system used in each operation can't encode the text.  The
10666 function should take care that the buffer is not modified while
10667 the coding system is being selected.
10668
10669 The default value is `select-safe-coding-system' (which see).  */);
10670   Vselect_safe_coding_system_function = Qnil;
10671
10672   DEFVAR_BOOL ("coding-system-require-warning",
10673                coding_system_require_warning,
10674                doc: /* Internal use only.
10675 If non-nil, on writing a file, `select-safe-coding-system-function' is
10676 called even if `coding-system-for-write' is non-nil.  The command
10677 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10678   coding_system_require_warning = 0;
10679
10680
10681   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10682                inhibit_iso_escape_detection,
10683                doc: /*
10684 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10685
10686 When Emacs reads text, it tries to detect how the text is encoded.
10687 This code detection is sensitive to escape sequences.  If Emacs sees
10688 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10689 of the ISO2022 encodings, and decodes text by the corresponding coding
10690 system (e.g. `iso-2022-7bit').
10691
10692 However, there may be a case that you want to read escape sequences in
10693 a file as is.  In such a case, you can set this variable to non-nil.
10694 Then the code detection will ignore any escape sequences, and no text is
10695 detected as encoded in some ISO-2022 encoding.  The result is that all
10696 escape sequences become visible in a buffer.
10697
10698 The default value is nil, and it is strongly recommended not to change
10699 it.  That is because many Emacs Lisp source files that contain
10700 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10701 in Emacs's distribution, and they won't be decoded correctly on
10702 reading if you suppress escape sequence detection.
10703
10704 The other way to read escape sequences in a file without decoding is
10705 to explicitly specify some coding system that doesn't use ISO-2022
10706 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10707   inhibit_iso_escape_detection = 0;
10708
10709   DEFVAR_BOOL ("inhibit-null-byte-detection",
10710                inhibit_null_byte_detection,
10711                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10712 By default, Emacs treats it as binary data, and does not attempt to
10713 decode it.  The effect is as if you specified `no-conversion' for
10714 reading that text.
10715
10716 Set this to non-nil when a regular text happens to include null bytes.
10717 Examples are Index nodes of Info files and null-byte delimited output
10718 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10719 decode text as usual.  */);
10720   inhibit_null_byte_detection = 0;
10721
10722   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10723                doc: /* Char table for translating self-inserting characters.
10724 This is applied to the result of input methods, not their input.
10725 See also `keyboard-translate-table'.
10726
10727 Use of this variable for character code unification was rendered
10728 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10729 internal character representation.  */);
10730     Vtranslation_table_for_input = Qnil;
10731
10732   {
10733     Lisp_Object args[coding_arg_max];
10734     Lisp_Object plist[16];
10735     int i;
10736
10737     for (i = 0; i < coding_arg_max; i++)
10738       args[i] = Qnil;
10739
10740     plist[0] = intern_c_string (":name");
10741     plist[1] = args[coding_arg_name] = Qno_conversion;
10742     plist[2] = intern_c_string (":mnemonic");
10743     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10744     plist[4] = intern_c_string (":coding-type");
10745     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10746     plist[6] = intern_c_string (":ascii-compatible-p");
10747     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10748     plist[8] = intern_c_string (":default-char");
10749     plist[9] = args[coding_arg_default_char] = make_number (0);
10750     plist[10] = intern_c_string (":for-unibyte");
10751     plist[11] = args[coding_arg_for_unibyte] = Qt;
10752     plist[12] = intern_c_string (":docstring");
10753     plist[13] = build_pure_c_string ("Do no conversion.\n\
10754 \n\
10755 When you visit a file with this coding, the file is read into a\n\
10756 unibyte buffer as is, thus each byte of a file is treated as a\n\
10757 character.");
10758     plist[14] = intern_c_string (":eol-type");
10759     plist[15] = args[coding_arg_eol_type] = Qunix;
10760     args[coding_arg_plist] = Flist (16, plist);
10761     Fdefine_coding_system_internal (coding_arg_max, args);
10762
10763     plist[1] = args[coding_arg_name] = Qundecided;
10764     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10765     plist[5] = args[coding_arg_coding_type] = Qundecided;
10766     /* This is already set.
10767        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10768     plist[8] = intern_c_string (":charset-list");
10769     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10770     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10771     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10772     plist[15] = args[coding_arg_eol_type] = Qnil;
10773     args[coding_arg_plist] = Flist (16, plist);
10774     Fdefine_coding_system_internal (coding_arg_max, args);
10775   }
10776
10777   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10778
10779   {
10780     int i;
10781
10782     for (i = 0; i < coding_category_max; i++)
10783       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10784   }
10785 #if defined (DOS_NT)
10786   system_eol_type = Qdos;
10787 #else
10788   system_eol_type = Qunix;
10789 #endif
10790   staticpro (&system_eol_type);
10791 }
10792
10793 char *
10794 emacs_strerror (int error_number)
10795 {
        /* Return the message text that strerror produces for ERROR_NUMBER.
           When `locale-coding-system' is non-nil, the text is first passed
           through code_convert_string_norecord with that coding system and
           the converted text is returned instead.

           NOTE(review): the result is either the pointer strerror handed
           back or the pointer SSDATA yields for a Lisp string created
           here.  In neither case does this function allocate storage for
           the caller, and nothing visible here keeps that Lisp string
           referenced after returning, so presumably the caller must use
           or copy the text promptly -- confirm against callers.  */
10796   char *str;
10797
        /* The locale synchronization is done before asking for the message;
           presumably so that strerror answers in the locale chosen for
           system messages -- confirm against
           synchronize_system_messages_locale.  */
10798   synchronize_system_messages_locale ();
10799   str = strerror (error_number);
10800
        /* With no locale coding system set, strerror's text is returned
           unconverted.  */
10801   if (! NILP (Vlocale_coding_system))
10802     {
            /* build_string makes a Lisp string from STR for the converter.
               The final argument 0 presumably selects decoding rather than
               encoding (the result is named `dec') -- TODO confirm against
               code_convert_string_norecord.  */
10803       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10804                                                       Vlocale_coding_system,
10805                                                       0);
            /* From here on STR no longer refers to what strerror returned
               but to what SSDATA yields for DEC.  */
10806       str = SSDATA (dec);
10807     }
10808
10809   return str;
10810 }
10811
10812 #endif /* emacs */