src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2012 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   the symbol in Vcoding_system_hash_table.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;        /* Category bits strongly suggested so far.  */
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX; reject the category.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that the source area and the destination area
 255   overlap, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;
 301
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 static Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qutf_8;
 310 static Lisp_Object Qiso_2022;
 311 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 312 static Lisp_Object Qbig, Qlittle;
 313 static Lisp_Object Qcoding_system_history;
 314 static Lisp_Object Qvalid_codes;
 315 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 316 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 317 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 318 static Lisp_Object QCascii_compatible_p;
 319
 320 Lisp_Object Qcall_process, Qcall_process_region;
 321 Lisp_Object Qstart_process, Qopen_network_stream;
 322 static Lisp_Object Qtarget_idx;
 323
 324 static Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 325 static Lisp_Object Qinterrupted, Qinsufficient_memory;
 326
 327 /* If a symbol has this property, evaluate the value to define the
 328    symbol as a coding system.  */
 329 static Lisp_Object Qcoding_system_define_form;
 330
 331 /* Format of end-of-line decided by system.  This is Qunix on
 332    Unix and Mac, Qdos on DOS/Windows.
 333    This has an effect only for external encoding (i.e. for output to
 334    file and process), not for in-buffer or Lisp string encoding.  */
 335 static Lisp_Object system_eol_type;
 336
 337 #ifdef emacs
 338
 339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 340
 341 /* Coding system emacs-mule and raw-text are for converting only
 342    end-of-line format.  */
 343 Lisp_Object Qemacs_mule, Qraw_text;
 344 Lisp_Object Qutf_8_emacs;
 345
 346 /* Coding-systems are handed between Emacs Lisp programs and C internal
 347    routines by the following three variables.  */
 348 /* Coding system to be used to encode text for terminal display when
 349    terminal coding system is nil.  */
 350 struct coding_system safe_terminal_coding;
 351
 352 #endif /* emacs */
 353
 354 Lisp_Object Qtranslation_table;
 355 Lisp_Object Qtranslation_table_id;
 356 static Lisp_Object Qtranslation_table_for_decode;
 357 static Lisp_Object Qtranslation_table_for_encode;
 358
 359 /* Two special coding systems.  */
 360 static Lisp_Object Vsjis_coding_system;
 361 static Lisp_Object Vbig5_coding_system;
 362
 363 /* ISO2022 section */
 364
 365 #define CODING_ISO_INITIAL(coding, reg)                 \
 366   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 367                      coding_attr_iso_initial),          \
 368                reg)))
 369 /* Value stored in CODING->safe_charsets for CHARSET_ID, or -1 if
 370    CHARSET_ID exceeds CODING->max_charset_id or the entry is 255.  */
 371 #define CODING_ISO_REQUEST(coding, charset_id)          \
 372   (((charset_id) <= (coding)->max_charset_id            \
 373     ? ((coding)->safe_charsets[charset_id] != 255       \
 374        ? (coding)->safe_charsets[charset_id]            \
 375        : -1)                                            \
 376     : -1))
 377
 378
 379 #define CODING_ISO_FLAGS(coding)        \
 380   ((coding)->spec.iso_2022.flags)
 381 #define CODING_ISO_DESIGNATION(coding, reg)     \
 382   ((coding)->spec.iso_2022.current_designation[reg])
 383 #define CODING_ISO_INVOCATION(coding, plane)    \
 384   ((coding)->spec.iso_2022.current_invocation[plane])
 385 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 386   ((coding)->spec.iso_2022.single_shifting)
 387 #define CODING_ISO_BOL(coding)  \
 388   ((coding)->spec.iso_2022.bol)
 389 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 390   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 391 #define CODING_ISO_CMP_STATUS(coding)   \
 392   (&(coding)->spec.iso_2022.cmp_status)
 393 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 394   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 395 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 396   ((coding)->spec.iso_2022.embedded_utf_8)
 397
 398 /* Control characters of ISO2022.  */
 399                         /* code */      /* function */
 400 #define ISO_CODE_SO     0x0E            /* shift-out */
 401 #define ISO_CODE_SI     0x0F            /* shift-in */
 402 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 403 #define ISO_CODE_ESC    0x1B            /* escape */
 404 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 405 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 406 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 407
 408 /* Every 1-byte code of ISO2022 is classified into one of the
 409    following classes.  */
 410 enum iso_code_class_type
 411   {
 412     ISO_control_0,              /* Control codes in the range
 413                                    0x00..0x1F and 0x7F, except for the
 414                                    following 4 codes.  */
 415     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 416     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 417     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 418     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 419     ISO_control_1,              /* Control codes in the range
 420                                    0x80..0x9F, except for the
 421                                    following 3 codes.  */
 422     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 423     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 424     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 425     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 426     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 427     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 428     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 429   };
 430
 431 /** The macros CODING_ISO_FLAG_XXX define flag bits of the
 432     `iso-flags' attribute of an iso2022 coding system.  */
 433
 434 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 435    instead of the correct short-form sequence (e.g. ESC $ A).  */
 436 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 437
 438 /* If set, reset graphic planes and registers at end-of-line to the
 439    initial state.  */
 440 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 441
 442 /* If set, reset graphic planes and registers before any control
 443    characters to the initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 445
 446 /* If set, encode by 7-bit environment.  */
 447 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 448
 449 /* If set, use locking-shift function.  */
 450 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 451
 452 /* If set, use single-shift function.  Overwrite
 453    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 454 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 455
 456 /* If set, use designation escape sequence.  */
 457 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 458
 459 /* If set, produce revision number sequence.  */
 460 #define CODING_ISO_FLAG_REVISION        0x0080
 461
 462 /* If set, produce ISO6429's direction specifying sequence.  */
 463 #define CODING_ISO_FLAG_DIRECTION       0x0100
 464
 465 /* If set, assume designation states are reset at beginning of line on
 466    output.  */
 467 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 468
 469 /* If set, designation sequence should be placed at beginning of line
 470    on output.  */
 471 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 472
 473 /* If set, do not encode unsafe characters on output.  */
 474 #define CODING_ISO_FLAG_SAFE            0x0800
 475
 476 /* If set, extra latin codes (128..159) are accepted as a valid code
 477    on input.  */
 478 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 479
 480 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 481
 482 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 483
 484 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 485
 486 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 487
 488 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 489
 490 /* A character to be produced on output if encoding of the original
 491    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 492 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 493
 494 /* UTF-8 section */
 495 #define CODING_UTF_8_BOM(coding)        \
 496   ((coding)->spec.utf_8_bom)
 497
 498 /* UTF-16 section */
 499 #define CODING_UTF_16_BOM(coding)       \
 500   ((coding)->spec.utf_16.bom)
 501
 502 #define CODING_UTF_16_ENDIAN(coding)    \
 503   ((coding)->spec.utf_16.endian)
 504
 505 #define CODING_UTF_16_SURROGATE(coding) \
 506   ((coding)->spec.utf_16.surrogate)
 507
 508
 509 /* CCL section */
 510 #define CODING_CCL_DECODER(coding)      \
 511   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 512 #define CODING_CCL_ENCODER(coding)      \
 513   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 514 #define CODING_CCL_VALIDS(coding)                                          \
 515   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 516
 517 /* Index for each coding category in `coding_categories' */
 518
 519 enum coding_category
 520   {
 521     coding_category_iso_7,
 522     coding_category_iso_7_tight,
 523     coding_category_iso_8_1,
 524     coding_category_iso_8_2,
 525     coding_category_iso_7_else,
 526     coding_category_iso_8_else,
 527     coding_category_utf_8_auto,
 528     coding_category_utf_8_nosig,
 529     coding_category_utf_8_sig,
 530     coding_category_utf_16_auto,
 531     coding_category_utf_16_be,
 532     coding_category_utf_16_le,
 533     coding_category_utf_16_be_nosig,
 534     coding_category_utf_16_le_nosig,
 535     coding_category_charset,
 536     coding_category_sjis,
 537     coding_category_big5,
 538     coding_category_ccl,
 539     coding_category_emacs_mule,
 540     /* All above are targets of code detection.  */
 541     coding_category_raw_text,
 542     coding_category_undecided,
 543     coding_category_max
 544   };
 545
 546 /* Definitions of flag bits used in detect_coding_XXXX.  */
 547 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 548 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 549 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 550 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 551 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 552 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 553 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 554 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 555 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 556 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 557 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 558 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 559 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 560 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 561 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 562 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 563 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 564 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 565 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 566 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 567
 568 /* This value is returned if detect_coding_mask () finds nothing other
 569    than ASCII characters.  */
 570 #define CATEGORY_MASK_ANY               \
 571   (CATEGORY_MASK_ISO_7                  \
 572    | CATEGORY_MASK_ISO_7_TIGHT          \
 573    | CATEGORY_MASK_ISO_8_1              \
 574    | CATEGORY_MASK_ISO_8_2              \
 575    | CATEGORY_MASK_ISO_7_ELSE           \
 576    | CATEGORY_MASK_ISO_8_ELSE           \
 577    | CATEGORY_MASK_UTF_8_AUTO           \
 578    | CATEGORY_MASK_UTF_8_NOSIG          \
 579    | CATEGORY_MASK_UTF_8_SIG            \
 580    | CATEGORY_MASK_UTF_16_AUTO          \
 581    | CATEGORY_MASK_UTF_16_BE            \
 582    | CATEGORY_MASK_UTF_16_LE            \
 583    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 584    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 585    | CATEGORY_MASK_CHARSET              \
 586    | CATEGORY_MASK_SJIS                 \
 587    | CATEGORY_MASK_BIG5                 \
 588    | CATEGORY_MASK_CCL                  \
 589    | CATEGORY_MASK_EMACS_MULE)
 590
 591
 592 #define CATEGORY_MASK_ISO_7BIT \
 593   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 594
 595 #define CATEGORY_MASK_ISO_8BIT \
 596   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 597
 598 #define CATEGORY_MASK_ISO_ELSE \
 599   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 600
 601 #define CATEGORY_MASK_ISO_ESCAPE        \
 602   (CATEGORY_MASK_ISO_7                  \
 603    | CATEGORY_MASK_ISO_7_TIGHT          \
 604    | CATEGORY_MASK_ISO_7_ELSE           \
 605    | CATEGORY_MASK_ISO_8_ELSE)
 606
 607 #define CATEGORY_MASK_ISO       \
 608   (  CATEGORY_MASK_ISO_7BIT     \
 609      | CATEGORY_MASK_ISO_8BIT   \
 610      | CATEGORY_MASK_ISO_ELSE)
 611
 612 #define CATEGORY_MASK_UTF_16            \
 613   (CATEGORY_MASK_UTF_16_AUTO            \
 614    | CATEGORY_MASK_UTF_16_BE            \
 615    | CATEGORY_MASK_UTF_16_LE            \
 616    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 617    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 618
 619 #define CATEGORY_MASK_UTF_8     \
 620   (CATEGORY_MASK_UTF_8_AUTO     \
 621    | CATEGORY_MASK_UTF_8_NOSIG  \
 622    | CATEGORY_MASK_UTF_8_SIG)
 623
 624 /* Table of coding categories (Lisp symbols).  This variable is for
 625    internal use only.  */
 626 static Lisp_Object Vcoding_category_table;
 627
 628 /* Table of coding-categories ordered by priority.  */
 629 static enum coding_category coding_priorities[coding_category_max];
 630
 631 /* Nth element is a coding context for the coding system bound to the
 632    Nth coding category.  */
 633 static struct coding_system coding_categories[coding_category_max];
 634
 635 /*** Commonly used macros and functions ***/
 636
 637 #ifndef min
 638 #define min(a, b) ((a) < (b) ? (a) : (b))
 639 #endif
 640 #ifndef max
 641 #define max(a, b) ((a) > (b) ? (a) : (b))
 642 #endif
 643
 644 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 645   do {                                                  \
 646     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 647     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 648   } while (0)
 649
 650
 651 /* Safely get one byte from the source text pointed by SRC which ends
 652    at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
 653    the source is exhausted, jump to 'no_more_source', first recording
 654    CODING_RESULT_INSUFFICIENT_SRC if SRC_BASE < SRC.  If MULTIBYTEP and
 655    a multibyte character is found at SRC, set C to the negated character
 656    code and record CODING_RESULT_INVALID_SRC.  The caller should declare
 657    and set: src, src_end, src_base, multibytep, consumed_chars, coding */
 658
 659 #define ONE_MORE_BYTE(c)                                \
 660   do {                                                  \
 661     if (src == src_end)                                 \
 662       {                                                 \
 663         if (src_base < src)                             \
 664           record_conversion_result                      \
 665             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 666         goto no_more_source;                            \
 667       }                                                 \
 668     c = *src++;                                         \
 669     if (multibytep && (c & 0x80))                       \
 670       {                                                 \
 671         if ((c & 0xFE) == 0xC0)                         \
 672           c = ((c & 1) << 6) | *src++;                  \
 673         else                                            \
 674           {                                             \
 675             src--;                                      \
 676             c = - string_char (src, &src, NULL);        \
 677             record_conversion_result                    \
 678               (coding, CODING_RESULT_INVALID_SRC);      \
 679           }                                             \
 680       }                                                 \
 681     consumed_chars++;                                   \
 682   } while (0)
 683
 684 /* Safely get two bytes from the source text pointed by SRC which ends
 685    at SRC_END, and set C1 and C2 to those bytes while skipping the
 686    heading multibyte characters.  If there are not enough bytes in the
 687    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 688    a multibyte character is found for C2, set C2 to -1 (the character
 689    code itself is not computed).  The caller should declare and set
 690    these variables appropriately in advance:
 691         src, src_end, multibytep
 692    It is intended that this macro is used in detect_coding_utf_16.  */
 693
 694 #define TWO_MORE_BYTES(c1, c2)                          \
 695   do {                                                  \
 696     do {                                                \
 697       if (src == src_end)                               \
 698         goto no_more_source;                            \
 699       c1 = *src++;                                      \
 700       if (multibytep && (c1 & 0x80))                    \
 701         {                                               \
 702           if ((c1 & 0xFE) == 0xC0)                      \
 703             c1 = ((c1 & 1) << 6) | *src++;              \
 704           else                                          \
 705             {                                           \
 706               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 707               c1 = -1;                                  \
 708             }                                           \
 709         }                                               \
 710     } while (c1 < 0);                                   \
 711     if (src == src_end)                                 \
 712       goto no_more_source;                              \
 713     c2 = *src++;                                        \
 714     if (multibytep && (c2 & 0x80))                      \
 715       {                                                 \
 716         if ((c2 & 0xFE) == 0xC0)                        \
 717           c2 = ((c2 & 1) << 6) | *src++;                \
 718         else                                            \
 719           c2 = -1;                                      \
 720       }                                                 \
 721   } while (0)
 722
 723
 724 /* Store a byte C in the place pointed by DST and increment DST to the
 725    next free point, and increment PRODUCED_CHARS.  The caller should
 726    assure that C is 0..127, and declare and set the variables `dst'
 727    and `produced_chars' appropriately in advance.
 728 */
 729
 730
 731 #define EMIT_ONE_ASCII_BYTE(c)  \
 732   do {                          \
 733     produced_chars++;           \
 734     *dst++ = (c);               \
 735   } while (0)
 736
 737
 738 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 739
 740 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 741   do {                                  \
 742     produced_chars += 2;                \
 743     *dst++ = (c1), *dst++ = (c2);       \
 744   } while (0)
 745
 746
 747 /* Store a byte C in the place pointed by DST and increment DST to the
 748    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 749    store in an appropriate multibyte form.  The caller should
 750    declare and set the variables `dst', `multibytep', and
 751    `produced_chars' appropriately in advance.  */
 752
 753 #define EMIT_ONE_BYTE(c)                \
 754   do {                                  \
 755     produced_chars++;                   \
 756     if (multibytep)                     \
 757       {                                 \
 758         unsigned ch = (c);              \
 759         if (ch >= 0x80)                 \
 760           ch = BYTE8_TO_CHAR (ch);      \
 761         CHAR_STRING_ADVANCE (ch, dst);  \
 762       }                                 \
 763     else                                \
 764       *dst++ = (c);                     \
 765   } while (0)
 766
 767
 768 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 769
 770 #define EMIT_TWO_BYTES(c1, c2)          \
 771   do {                                  \
 772     produced_chars += 2;                \
 773     if (multibytep)                     \
 774       {                                 \
 775         unsigned ch;                    \
 776                                         \
 777         ch = (c1);                      \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781         ch = (c2);                      \
 782         if (ch >= 0x80)                 \
 783           ch = BYTE8_TO_CHAR (ch);      \
 784         CHAR_STRING_ADVANCE (ch, dst);  \
 785       }                                 \
 786     else                                \
 787       {                                 \
 788         *dst++ = (c1);                  \
 789         *dst++ = (c2);                  \
 790       }                                 \
 791   } while (0)
 792
 793
 794 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 795   do {                                  \
 796     EMIT_ONE_BYTE (c1);                 \
 797     EMIT_TWO_BYTES (c2, c3);            \
 798   } while (0)
 799
 800
 801 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 802   do {                                          \
 803     EMIT_TWO_BYTES (c1, c2);                    \
 804     EMIT_TWO_BYTES (c3, c4);                    \
 805   } while (0)
 806
 807
 808 static void
 809 record_conversion_result (struct coding_system *coding,
 810                           enum coding_result_code result)
 811 {
 812   coding->result = result;
 813   switch (result)
 814     {
 815     case CODING_RESULT_INSUFFICIENT_SRC:
 816       Vlast_code_conversion_error = Qinsufficient_source;
 817       break;
 818     case CODING_RESULT_INCONSISTENT_EOL:
 819       Vlast_code_conversion_error = Qinconsistent_eol;
 820       break;
 821     case CODING_RESULT_INVALID_SRC:
 822       Vlast_code_conversion_error = Qinvalid_source;
 823       break;
 824     case CODING_RESULT_INTERRUPT:
 825       Vlast_code_conversion_error = Qinterrupted;
 826       break;
 827     case CODING_RESULT_INSUFFICIENT_MEM:
 828       Vlast_code_conversion_error = Qinsufficient_memory;
 829       break;
 830     case CODING_RESULT_INSUFFICIENT_DST:
 831       /* Don't record this error in Vlast_code_conversion_error
 832          because it happens just temporarily and is resolved when the
 833          whole conversion is finished.  */
 834       break;
 835     case CODING_RESULT_SUCCESS:
 836       break;
 837     default:
 838       Vlast_code_conversion_error = intern ("Unknown error");
 839     }
 840 }
 841
 842 /* These wrapper macros are used to preserve validity of pointers into
 843    buffer text across calls to decode_char, encode_char, etc, which
 844    could cause relocation of buffers if it loads a charset map,
 845    because loading a charset map allocates large structures.  */
 846
 847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 848   do {                                                                       \
 849     ptrdiff_t offset;                                                        \
 850                                                                              \
 851     charset_map_loaded = 0;                                                  \
 852     c = DECODE_CHAR (charset, code);                                         \
 853     if (charset_map_loaded                                                   \
 854         && (offset = coding_change_source (coding)))                         \
 855       {                                                                      \
 856         src += offset;                                                       \
 857         src_base += offset;                                                  \
 858         src_end += offset;                                                   \
 859       }                                                                      \
 860   } while (0)
 861
 862 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 863   do {                                                                  \
 864     ptrdiff_t offset;                                                   \
 865                                                                         \
 866     charset_map_loaded = 0;                                             \
 867     code = ENCODE_CHAR (charset, c);                                    \
 868     if (charset_map_loaded                                              \
 869         && (offset = coding_change_destination (coding)))               \
 870       {                                                                 \
 871         dst += offset;                                                  \
 872         dst_end += offset;                                              \
 873       }                                                                 \
 874   } while (0)
 875
 876 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 877   do {                                                                  \
 878     ptrdiff_t offset;                                                   \
 879                                                                         \
 880     charset_map_loaded = 0;                                             \
 881     charset = char_charset (c, charset_list, code_return);              \
 882     if (charset_map_loaded                                              \
 883         && (offset = coding_change_destination (coding)))               \
 884       {                                                                 \
 885         dst += offset;                                                  \
 886         dst_end += offset;                                              \
 887       }                                                                 \
 888   } while (0)
 889
 890 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 891   do {                                                                  \
 892     ptrdiff_t offset;                                                   \
 893                                                                         \
 894     charset_map_loaded = 0;                                             \
 895     result = CHAR_CHARSET_P (c, charset);                               \
 896     if (charset_map_loaded                                              \
 897         && (offset = coding_change_destination (coding)))               \
 898       {                                                                 \
 899         dst += offset;                                                  \
 900         dst_end += offset;                                              \
 901       }                                                                 \
 902   } while (0)
 903
 904
 905 /* If there are at least BYTES length of room at dst, allocate memory
 906    for coding->destination and update dst and dst_end.  We don't have
 907    to take care of coding->source which will be relocated.  It is
 908    handled by calling coding_set_source in encode_coding.  */
 909
 910 #define ASSURE_DESTINATION(bytes)                               \
 911   do {                                                          \
 912     if (dst + (bytes) >= dst_end)                               \
 913       {                                                         \
 914         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 915                                                                 \
 916         dst = alloc_destination (coding, more_bytes, dst);      \
 917         dst_end = coding->destination + coding->dst_bytes;      \
 918       }                                                         \
 919   } while (0)
 920
 921
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
   without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
   MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE, so this is now a plain
   alias for CHAR_STRING_ADVANCE.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)

/* Return the character code of character whose multibyte form is at
   P, and advance P to the end of the multibyte form.  This used to be
   like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
   nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR, so this
   is now a plain alias for STRING_CHAR_ADVANCE.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 935
 936 /* Set coding->source from coding->src_object.  */
 937
 938 static void
 939 coding_set_source (struct coding_system *coding)
 940 {
 941   if (BUFFERP (coding->src_object))
 942     {
 943       struct buffer *buf = XBUFFER (coding->src_object);
 944
 945       if (coding->src_pos < 0)
 946         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 947       else
 948         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 949     }
 950   else if (STRINGP (coding->src_object))
 951     {
 952       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 953     }
 954   else
 955     {
 956       /* Otherwise, the source is C string and is never relocated
 957          automatically.  Thus we don't have to update anything.  */
 958     }
 959 }
 960
 961
 962 /* Set coding->source from coding->src_object, and return how many
 963    bytes coding->source was changed.  */
 964
 965 static ptrdiff_t
 966 coding_change_source (struct coding_system *coding)
 967 {
 968   const unsigned char *orig = coding->source;
 969   coding_set_source (coding);
 970   return coding->source - orig;
 971 }
 972
 973
 974 /* Set coding->destination from coding->dst_object.  */
 975
 976 static void
 977 coding_set_destination (struct coding_system *coding)
 978 {
 979   if (BUFFERP (coding->dst_object))
 980     {
 981       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 982         {
 983           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 984           coding->dst_bytes = (GAP_END_ADDR
 985                                - (coding->src_bytes - coding->consumed)
 986                                - coding->destination);
 987         }
 988       else
 989         {
 990           /* We are sure that coding->dst_pos_byte is before the gap
 991              of the buffer. */
 992           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 993                                  + coding->dst_pos_byte - BEG_BYTE);
 994           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 995                                - coding->destination);
 996         }
 997     }
 998   else
 999     {
1000       /* Otherwise, the destination is C string and is never relocated
1001          automatically.  Thus we don't have to update anything.  */
1002     }
1003 }
1004
1005
1006 /* Set coding->destination from coding->dst_object, and return how
1007    many bytes coding->destination was changed.  */
1008
1009 static ptrdiff_t
1010 coding_change_destination (struct coding_system *coding)
1011 {
1012   const unsigned char *orig = coding->destination;
1013   coding_set_destination (coding);
1014   return coding->destination - orig;
1015 }
1016
1017
1018 static void
1019 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1020 {
1021   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1022     string_overflow ();
1023   coding->destination = xrealloc (coding->destination,
1024                                   coding->dst_bytes + bytes);
1025   coding->dst_bytes += bytes;
1026 }
1027
1028 static void
1029 coding_alloc_by_making_gap (struct coding_system *coding,
1030                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1031 {
1032   if (EQ (coding->src_object, coding->dst_object))
1033     {
1034       /* The gap may contain the produced data at the head and not-yet
1035          consumed data at the tail.  To preserve those data, we at
1036          first make the gap size to zero, then increase the gap
1037          size.  */
1038       ptrdiff_t add = GAP_SIZE;
1039
1040       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1041       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1042       make_gap (bytes);
1043       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1044       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1045     }
1046   else
1047     {
1048       Lisp_Object this_buffer;
1049
1050       this_buffer = Fcurrent_buffer ();
1051       set_buffer_internal (XBUFFER (coding->dst_object));
1052       make_gap (bytes);
1053       set_buffer_internal (XBUFFER (this_buffer));
1054     }
1055 }
1056
1057
1058 static unsigned char *
1059 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1060                    unsigned char *dst)
1061 {
1062   ptrdiff_t offset = dst - coding->destination;
1063
1064   if (BUFFERP (coding->dst_object))
1065     {
1066       struct buffer *buf = XBUFFER (coding->dst_object);
1067
1068       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1069     }
1070   else
1071     coding_alloc_by_realloc (coding, nbytes);
1072   coding_set_destination (coding);
1073   dst = coding->destination + offset;
1074   return dst;
1075 }
1076
1077 /** Macros for annotations.  */
1078
1079 /* An annotation data is stored in the array coding->charbuf in this
1080    format:
1081      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1082    LENGTH is the number of elements in the annotation.
1083    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1084    NCHARS is the number of characters in the text annotated.
1085
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1090      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1091
1092    NBYTES is the number of bytes specified in the header part of
1093    old-style emacs-mule encoding, or 0 for the other kind of
1094    composition.
1095
1096    METHOD is one of enum composition_method.
1097
1098    Optional COMPOSITION-COMPONENTS are characters and composition
1099    rules.
1100
1101    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1102    follows.
1103
1104    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1105    recover from an invalid annotation, and should be skipped by
1106    produce_annotation.  */
1107
/* Maximum length of the header of annotation data, counted in
   elements of coding->charbuf.  The value matches the 5 elements
   that ADD_COMPOSITION_DATA writes (-LENGTH, ANNOTATION_MASK, NCHARS,
   NBYTES and METHOD).  */
#define MAX_ANNOTATION_LENGTH 5
1110
/* Write the three common leading elements of an annotation at BUF
   and advance BUF past them: -LEN, MASK and NCHARS, in that order.
   Also set coding->annotated to 1; the caller must have a variable
   `coding' in scope.

   The definition deliberately does not end with a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an unbraced if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1118
/* Write a composition annotation header at BUF and advance BUF past
   it: the three common elements (with length 5 and
   CODING_ANNOTATE_COMPOSITION_MASK) followed by NBYTES and METHOD.
   All parameters are parenthesized in the expansion, as in
   ADD_ANNOTATION_DATA, so that any expression may be passed.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1125
1126
/* Write a charset annotation at BUF and advance BUF past it: the
   three common elements (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK) followed by the charset ID.  All
   parameters are parenthesized in the expansion, as in
   ADD_ANNOTATION_DATA, so that any expression may be passed.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1132
1133 \f
1134 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1135
1136
1137
1138 \f
1139 /*** 3. UTF-8 ***/
1140
1141 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1142    Return true if a text is encoded in UTF-8.  */
1143
/* Predicates classifying a single octet C of a UTF-8 sequence by its
   high bits.  */

/* C is below 0x80, i.e. a one-octet (ASCII) character.  Note that
   this is also true for any negative C.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx: a trailing (continuation) octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets of the UTF-8 signature (BOM), in order.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1154
/* Check whether the source of CODING looks like UTF-8 and record the
   verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  The
   source is scanned one sequence at a time, starting after the first
   CODING->head_ascii bytes.  If a sequence is malformed the whole
   UTF-8 category is added to DETECT_INFO->rejected and 0 is returned.
   If the end of the source is reached, 1 is returned with
   DETECT_INFO->found / ->rejected updated according to whether a
   signature (BOM) and/or any multi-octet sequence was seen -- unless
   the source ends in the middle of a sequence in the last block, in
   which case the category is rejected and 0 is returned.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  bool bom_found = 0;
  bool found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  /* Each iteration examines one sequence.  `continue' accepts the
     sequence, `break' rejects the text.  The loop is otherwise only
     left through the `no_more_source' label, which ONE_MORE_BYTE
     jumps to at the end of the source.  */
  while (1)
    {
      int c, c1, c2, c3, c4;

      /* Remember where this sequence starts, so that a truncated
         sequence can be recognized at `no_more_source'.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      /* A negative value (presumably a non-byte element of a
         multibyte source -- TODO confirm against ONE_MORE_BYTE) and a
         one-octet character are both simply skipped.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a signature at the very beginning of the source
             counts as a BOM.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* The leading octet matches none of the 2..5-octet forms.  */
      break;
    }
  /* Reached only via `break': a malformed sequence was seen.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ended in the middle of a sequence.  That is an error
     only if no further block will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading U+FEFF (octets EF BB BF) doesn't necessarily mean a
         BOM, so both the with-signature and the without-signature
         categories are reported as found.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1237
1238
/* Decode UTF-8 bytes from the source of CODING, starting at offset
   CODING->consumed, into CODING->charbuf, starting at index
   CODING->charbuf_used.  Decoding stops when the source is exhausted
   or charbuf is full.  On return CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used describe how far
   decoding got; a sequence that is cut off by the end of the source
   is not counted as consumed.  Each invalid byte is stored as a
   single element (via BYTE8_TO_CHAR when it is not ASCII) and counted
   in CODING->errors.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* When EOL_DOS and a CR has just been read, this holds the byte
     that follows it (already fetched from the source); -1 means no
     such look-ahead byte is pending.  */
  int byte_after_cr = -1;

  /* Unless the coding system is known to have no signature, look at
     the first three bytes: skip them if they are exactly the UTF-8
     BOM, otherwise rewind SRC so they are decoded normally.  */
  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever was found above, no BOM check is made again for this
     coding system (presumably so that later blocks of the same text
     are not examined for a signature -- TODO confirm).  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* SRC_BASE and CONSUMED_CHARS_BASE mark the start of the
         sequence being decoded; they are what gets recorded as
         consumed when the loop is left.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The look-ahead byte after a CR has been fetched but not
             yet decoded; give it back.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is stored negated, as is.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* For DOS end-of-line, fetch the byte following a CR right
             away; it is decoded in the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, also reject
                             values above MAX_CHAR.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the bad sequence and consume just its
         first byte, storing it as a raw byte; the bytes after it are
         decoded afresh in the next iteration.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Record progress up to the start of the last, unfinished
     sequence.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1388
1389
1390 static bool
1391 encode_coding_utf_8 (struct coding_system *coding)
1392 {
1393   bool multibytep = coding->dst_multibyte;
1394   int *charbuf = coding->charbuf;
1395   int *charbuf_end = charbuf + coding->charbuf_used;
1396   unsigned char *dst = coding->destination + coding->produced;
1397   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1398   ptrdiff_t produced_chars = 0;
1399   int c;
1400
1401   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1402     {
1403       ASSURE_DESTINATION (3);
1404       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1405       CODING_UTF_8_BOM (coding) = utf_without_bom;
1406     }
1407
1408   if (multibytep)
1409     {
1410       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1411
1412       while (charbuf < charbuf_end)
1413         {
1414           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1415
1416           ASSURE_DESTINATION (safe_room);
1417           c = *charbuf++;
1418           if (CHAR_BYTE8_P (c))
1419             {
1420               c = CHAR_TO_BYTE8 (c);
1421               EMIT_ONE_BYTE (c);
1422             }
1423           else
1424             {
1425               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1426               for (p = str; p < pend; p++)
1427                 EMIT_ONE_BYTE (*p);
1428             }
1429         }
1430     }
1431   else
1432     {
1433       int safe_room = MAX_MULTIBYTE_LENGTH;
1434
1435       while (charbuf < charbuf_end)
1436         {
1437           ASSURE_DESTINATION (safe_room);
1438           c = *charbuf++;
1439           if (CHAR_BYTE8_P (c))
1440             *dst++ = CHAR_TO_BYTE8 (c);
1441           else
1442             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1443           produced_chars++;
1444         }
1445     }
1446   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1447   coding->produced_char += produced_chars;
1448   coding->produced = dst - coding->destination;
1449   return 0;
1450 }
1451
1452
1453 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1454    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1455
/* True if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. its top six bits equal those of 0xD800 (range
   0xD800..0xDBFF).  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. its top six bits equal those of 0xDC00 (range
   0xDC00..0xDFFF).  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1461
1462
1463 static bool
1464 detect_coding_utf_16 (struct coding_system *coding,
1465                       struct coding_detection_info *detect_info)
1466 {
1467   const unsigned char *src = coding->source;
1468   const unsigned char *src_end = coding->source + coding->src_bytes;
1469   bool multibytep = coding->src_multibyte;
1470   int c1, c2;
1471
1472   detect_info->checked |= CATEGORY_MASK_UTF_16;
1473   if (coding->mode & CODING_MODE_LAST_BLOCK
1474       && (coding->src_chars & 1))
1475     {
1476       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1477       return 0;
1478     }
1479
1480   TWO_MORE_BYTES (c1, c2);
1481   if ((c1 == 0xFF) && (c2 == 0xFE))
1482     {
1483       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1484                              | CATEGORY_MASK_UTF_16_AUTO);
1485       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1486                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1487                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1488     }
1489   else if ((c1 == 0xFE) && (c2 == 0xFF))
1490     {
1491       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1492                              | CATEGORY_MASK_UTF_16_AUTO);
1493       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1494                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1495                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1496     }
1497   else if (c2 < 0)
1498     {
1499       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1500       return 0;
1501     }
1502   else
1503     {
1504       /* We check the dispersion of Eth and Oth bytes where E is even and
1505          O is odd.  If both are high, we assume binary data.*/
1506       unsigned char e[256], o[256];
1507       unsigned e_num = 1, o_num = 1;
1508
1509       memset (e, 0, 256);
1510       memset (o, 0, 256);
1511       e[c1] = 1;
1512       o[c2] = 1;
1513
1514       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1515                                 |CATEGORY_MASK_UTF_16_BE
1516                                 | CATEGORY_MASK_UTF_16_LE);
1517
1518       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1519              != CATEGORY_MASK_UTF_16)
1520         {
1521           TWO_MORE_BYTES (c1, c2);
1522           if (c2 < 0)
1523             break;
1524           if (! e[c1])
1525             {
1526               e[c1] = 1;
1527               e_num++;
1528               if (e_num >= 128)
1529                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1530             }
1531           if (! o[c2])
1532             {
1533               o[c2] = 1;
1534               o_num++;
1535               if (o_num >= 128)
1536                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1537             }
1538         }
1539       return 0;
1540     }
1541
1542  no_more_source:
1543   return 1;
1544 }
1545
     /* Decode the UTF-16 bytes of CODING->source, starting at offset
        CODING->consumed, and append the resulting character codes to
        CODING->charbuf.

        The byte order is CODING_UTF_16_ENDIAN (coding).  A high surrogate
        still waiting for its low half is kept in CODING_UTF_16_SURROGATE
        (coding), so that state is carried from one call to the next.
        When the EOL type is `dos' (and EOL conversion is not inhibited),
        the two bytes following a CR are read before the CR is stored.

        On return, CODING->consumed_char, CODING->consumed and
        CODING->charbuf_used are updated; the source is consumed only up
        to SRC_BASE, the start of the last loop iteration that was
        entered.  */
1546 static void
1547 decode_coding_utf_16 (struct coding_system *coding)
1548 {
1549   const unsigned char *src = coding->source + coding->consumed;
1550   const unsigned char *src_end = coding->source + coding->src_bytes;
1551   const unsigned char *src_base;
1552   int *charbuf = coding->charbuf + coding->charbuf_used;
1553   /* We may produce at most 3 chars in one loop.  */
1554   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1555   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1556   bool multibytep = coding->src_multibyte;
1557   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1558   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1559   int surrogate = CODING_UTF_16_SURROGATE (coding);
1560   bool eol_dos
1561     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1562   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1563
1564   if (bom == utf_with_bom)
1565     {
1566       int c, c1, c2;
1567
1568       src_base = src;
1569       ONE_MORE_BYTE (c1);
1570       ONE_MORE_BYTE (c2);
1571       c = (c1 << 8) | c2;
1572
1573       if (endian == utf_16_big_endian
1574           ? c != 0xFEFF : c != 0xFFFE)
1575         {
1576           /* The first two bytes are not BOM.  Treat them as bytes
1577              for a normal character.  */
1578           src = src_base;
1579           coding->errors++;
1580         }
           /* Whatever was found, do not look for a BOM again.  */
1581       CODING_UTF_16_BOM (coding) = utf_without_bom;
1582     }
1583   else if (bom == utf_detect_bom)
1584     {
1585       /* We have already tried to detect BOM and failed in
1586          detect_coding.  */
1587       CODING_UTF_16_BOM (coding) = utf_without_bom;
1588     }
1589
1590   while (1)
1591     {
1592       int c, c1, c2;
1593
1594       src_base = src;
1595       consumed_chars_base = consumed_chars;
1596
1597       if (charbuf >= charbuf_end)
1598         {
               /* The two bytes read ahead after a CR have not been
                  decoded yet; leave them unconsumed.  */
1599           if (byte_after_cr1 >= 0)
1600             src_base -= 2;
1601           break;
1602         }
1603
1604       if (byte_after_cr1 >= 0)
1605         c1 = byte_after_cr1, byte_after_cr1 = -1;
1606       else
1607         ONE_MORE_BYTE (c1);
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; a negative
              value presumably stands for a multibyte character met in a
              multibyte source, its code being the negation -- confirm.  */
1608       if (c1 < 0)
1609         {
1610           *charbuf++ = -c1;
1611           continue;
1612         }
1613       if (byte_after_cr2 >= 0)
1614         c2 = byte_after_cr2, byte_after_cr2 = -1;
1615       else
1616         ONE_MORE_BYTE (c2);
1617       if (c2 < 0)
1618         {
1619           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1620           *charbuf++ = -c2;
1621           continue;
1622         }
1623       c = (endian == utf_16_big_endian
1624            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1625
1626       if (surrogate)
1627         {
1628           if (! UTF_16_LOW_SURROGATE_P (c))
1629             {
                   /* The pending high surrogate is not followed by a low
                      one.  Flush its two bytes, in source order, as
                      separate elements and count an error.  */
1630               if (endian == utf_16_big_endian)
1631                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1632               else
1633                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1634               *charbuf++ = c1;
1635               *charbuf++ = c2;
1636               coding->errors++;
1637               if (UTF_16_HIGH_SURROGATE_P (c))
1638                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1639               else
                     {
                       /* C is an ordinary character and the pending
                          surrogate has just been flushed, so forget it.
                          Otherwise it would be flushed again, and
                          counted as an error again, in front of every
                          following character until a low surrogate
                          happened to arrive.  */
                       CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1640                   *charbuf++ = c;
                     }
1641             }
1642           else
1643             {
                   /* Combine the pair into a code of 0x10000 or above.  */
1644               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1645               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1646               *charbuf++ = 0x10000 + c;
1647             }
1648         }
1649       else
1650         {
1651           if (UTF_16_HIGH_SURROGATE_P (c))
1652             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1653           else
1654             {
                   /* Read the two bytes after a CR before storing the CR
                      itself.  */
1655               if (eol_dos && c == '\r')
1656                 {
1657                   ONE_MORE_BYTE (byte_after_cr1);
1658                   ONE_MORE_BYTE (byte_after_cr2);
1659                 }
1660               *charbuf++ = c;
1661             }
1662         }
1663     }
1664
1665  no_more_source:
1666   coding->consumed_char += consumed_chars_base;
1667   coding->consumed = src_base - coding->source;
1668   coding->charbuf_used = charbuf - coding->charbuf;
1669 }
1670
     /* Encode the character codes in CODING->charbuf as UTF-16 and
        append the bytes to CODING->destination at offset
        CODING->produced.  Unless the BOM type is utf_without_bom, a BOM
        is written first and the BOM type is then set to utf_without_bom,
        so that later calls on CODING do not write it again.  Characters
        above MAX_UNICODE_CHAR are replaced by CODING->default_char;
        characters from 0x10000 up are written as a surrogate pair.
        Record CODING_RESULT_SUCCESS, update CODING->produced and
        CODING->produced_char, and always return 0.  */
1671 static bool
1672 encode_coding_utf_16 (struct coding_system *coding)
1673 {
1674   bool multibytep = coding->dst_multibyte;
1675   int *charbuf = coding->charbuf;
1676   int *charbuf_end = charbuf + coding->charbuf_used;
1677   unsigned char *dst = coding->destination + coding->produced;
1678   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each write.  */
1679   int safe_room = 8;
1680   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1681   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1682   ptrdiff_t produced_chars = 0;
1683   int c;
1684
1685   if (bom != utf_without_bom)
1686     {
1687       ASSURE_DESTINATION (safe_room);
1688       if (big_endian)
1689         EMIT_TWO_BYTES (0xFE, 0xFF);
1690       else
1691         EMIT_TWO_BYTES (0xFF, 0xFE);
1692       CODING_UTF_16_BOM (coding) = utf_without_bom;
1693     }
1694
1695   while (charbuf < charbuf_end)
1696     {
1697       ASSURE_DESTINATION (safe_room);
1698       c = *charbuf++;
1699       if (c > MAX_UNICODE_CHAR)
1700         c = coding->default_char;
1701
1702       if (c < 0x10000)
1703         {
               /* One 16-bit unit, in the requested byte order.  */
1704           if (big_endian)
1705             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1706           else
1707             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1708         }
1709       else
1710         {
1711           int c1, c2;
1712
               /* Split into a high (0xD800 based) and a low (0xDC00
                  based) surrogate of 10 bits each.  */
1713           c -= 0x10000;
1714           c1 = (c >> 10) + 0xD800;
1715           c2 = (c & 0x3FF) + 0xDC00;
1716           if (big_endian)
1717             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1718           else
1719             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1720         }
1721     }
1722   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1723   coding->produced = dst - coding->destination;
1724   coding->produced_char += produced_chars;
1725   return 0;
1726 }
1727
1728 \f
1729 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1730
1731 /* Emacs' internal format for representation of multiple character
1732    sets is a kind of multi-byte encoding, i.e. characters are
1733    represented by variable-length sequences of one-byte codes.
1734
1735    ASCII characters and control characters (e.g. `tab', `newline') are
1736    represented by one-byte sequences which are their ASCII codes, in
1737    the range 0x00 through 0x7F.
1738
1739    8-bit characters of the range 0x80..0x9F are represented by
1740    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1741    code + 0x20).
1742
1743    8-bit characters of the range 0xA0..0xFF are represented by
1744    one-byte sequences which are their 8-bit code.
1745
1746    The other characters are represented by a sequence of `base
1747    leading-code', optional `extended leading-code', and one or two
1748    `position-code's.  The length of the sequence is determined by the
1749    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1750    whereas extended leading-code and position-code take the range 0xA0
1751    through 0xFF.  See `charset.h' for more details about leading-code
1752    and position-code.
1753
1754    --- CODE RANGE of Emacs' internal format ---
1755    character set        range
1756    -------------        -----
1757    ascii                0x00..0x7F
1758    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1759    eight-bit-graphic    0xA0..0xFF
1760    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1761    ---------------------------------------------
1762
1763    As this is the internal character representation, the format is
1764    usually not used externally (i.e. in a file or in a data sent to a
1765    process).  But, it is possible to have a text externally in this
1766    format (i.e. by encoding by the coding system `emacs-mule').
1767
1768    In that case, a sequence of one-byte codes has a slightly different
1769    form.
1770
1771    At first, all characters in eight-bit-control are represented by
1772    one-byte sequences which are their 8-bit code.
1773
1774    Next, character composition data are represented by the byte
1775    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1776    where,
1777         METHOD is 0xF2 plus one of the composition methods (enum
1778         composition_method),
1779
1780         BYTES is 0xA0 plus a byte length of this composition data,
1781
1782         CHARS is 0xA0 plus a number of characters composed by this
1783         data,
1784
1785         COMPONENTs are characters of multibyte form or composition
1786         rules encoded by two bytes of ASCII codes.
1787
1788    In addition, for backward compatibility, the following formats are
1789    also recognized as composition data on decoding.
1790
1791    0x80 MSEQ ...
1792    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1793
1794    Here,
1795         MSEQ is a multibyte form, but in one of these special formats:
1796           ASCII: 0xA0 ASCII_CODE+0x80,
1797           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1798         RULE is a one byte code of the range 0xA0..0xF0 that
1799         represents a composition rule.
1800   */
1801
     /* Table indexed by a byte value.  Judging from its uses in
        detect_coding_emacs_mule and emacs_mule_char, an entry is the
        total length in bytes (1 to 4) of the emacs-mule sequence that
        the byte introduces.  No initializer is given here.  */
1802 char emacs_mule_bytes[256];
1803
1804
1805 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1806    Return true if a text is encoded in 'emacs-mule'.

        More precisely: mark CATEGORY_MASK_EMACS_MULE as checked in
        DETECT_INFO, then scan CODING->source after its leading ASCII
        part.  As soon as a byte sequence is met that cannot be
        emacs-mule, add the category to DETECT_INFO->rejected and return
        0.  If the source is exhausted without that, return 1, and add
        the category to DETECT_INFO->found provided at least one
        composition or multibyte sequence was seen.  */
1807
1808 static bool
1809 detect_coding_emacs_mule (struct coding_system *coding,
1810                           struct coding_detection_info *detect_info)
1811 {
1812   const unsigned char *src = coding->source, *src_base;
1813   const unsigned char *src_end = coding->source + coding->src_bytes;
1814   bool multibytep = coding->src_multibyte;
1815   ptrdiff_t consumed_chars = 0;
1816   int c;
1817   int found = 0;
1818
1819   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1820   /* A coding system of this category is always ASCII compatible.  */
1821   src += coding->head_ascii;
1822
1823   while (1)
1824     {
1825       src_base = src;
1826       ONE_MORE_BYTE (c);
           /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; a negative
              C presumably marks a multibyte character in a multibyte
              source.  Such input is simply skipped.  */
1827       if (c < 0)
1828         continue;
1829       if (c == 0x80)
1830         {
1831           /* Perhaps the start of composite character.  We simply skip
1832              it because analyzing it is too heavy for detecting.  But,
1833              at least, we check that the composite character
1834              consists of more than 4 bytes.  */
1835           const unsigned char *src_start;
1836
1837         repeat:
1838           src_start = src;
1839           do
1840             {
1841               ONE_MORE_BYTE (c);
1842             }
1843           while (c >= 0xA0);
1844
               /* The count includes the byte that ended the run.  */
1845           if (src - src_start <= 4)
1846             break;
1847           found = CATEGORY_MASK_EMACS_MULE;
               /* Another composition follows immediately.  */
1848           if (c == 0x80)
1849             goto repeat;
1850         }
1851
1852       if (c < 0x80)
1853         {
               /* The control codes ESC, SI and SO are taken as evidence
                  against emacs-mule.  */
1854           if (c < 0x20
1855               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1856             break;
1857         }
1858       else
1859         {
               /* C starts a sequence of emacs_mule_bytes[C] bytes; every
                  byte after the first one must be 0xA0 or above.  */
1860           int more_bytes = emacs_mule_bytes[c] - 1;
1861
1862           while (more_bytes > 0)
1863             {
1864               ONE_MORE_BYTE (c);
1865               if (c < 0xA0)
1866                 {
1867                   src--;        /* Unread the last byte.  */
1868                   break;
1869                 }
1870               more_bytes--;
1871             }
1872           if (more_bytes != 0)
1873             break;
1874           found = CATEGORY_MASK_EMACS_MULE;
1875         }
1876     }
       /* The loop is left only by `break', i.e. on invalid input.  */
1877   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1878   return 0;
1879
1880  no_more_source:
       /* Bytes were read since the start of the current iteration, so
          the source ended inside a sequence; reject if this is the last
          block.  */
1881   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1882     {
1883       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1884       return 0;
1885     }
1886   detect_info->found |= found;
1887   return 1;
1888 }
1889
1890
1891 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1892    character.  If CMP_STATUS indicates that we must expect MSEQ or
1893    RULE described above, decode it and return the negative value of
1894    the decoded character or rule.  If an invalid byte is found, return
1895    -1.  If SRC is too short, return -2.

        On success, store in *NBYTES the number of bytes read from SRC
        and in *NCHARS the value of the local counter consumed_chars
        (this function never changes it directly, so it is presumably
        advanced by ONE_MORE_BYTE), and, if ID is non-null, store the
        charset id in *ID.  A byte handed back as a RULE is returned
        negated without further decoding, and *ID is then left
        untouched.

        NOTE(review): if an MSEQ can decode to character code 1 or 2,
        the negated result is indistinguishable from the error values
        -1 and -2; whether such input can occur was not checked.  */
1896
1897 static int
1898 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1899                  int *nbytes, int *nchars, int *id,
1900                  struct composition_status *cmp_status)
1901 {
1902   const unsigned char *src_end = coding->source + coding->src_bytes;
1903   const unsigned char *src_base = src;
1904   bool multibytep = coding->src_multibyte;
1905   int charset_ID;
1906   unsigned code;
1907   int c;
1908   int consumed_chars = 0;
1909   bool mseq_found = 0;
1910
1911   ONE_MORE_BYTE (c);
1912   if (c < 0)
1913     {
1914       c = -c;
1915       charset_ID = emacs_mule_charset[0];
1916     }
1917   else
1918     {
1919       if (c >= 0xA0)
1920         {
               /* A first byte of 0xA0 or above is acceptable only inside
                  an old form composition.  */
1921           if (cmp_status->state != COMPOSING_NO
1922               && cmp_status->old_form)
1923             {
1924               if (cmp_status->state == COMPOSING_CHAR)
1925                 {
                       /* MSEQ: either 0xA0 followed by ASCII_CODE+0x80,
                          or a leading code with 0x20 added.  */
1926                   if (c == 0xA0)
1927                     {
1928                       ONE_MORE_BYTE (c);
1929                       c -= 0x80;
1930                       if (c < 0)
1931                         goto invalid_code;
1932                     }
1933                   else
1934                     c -= 0x20;
1935                   mseq_found = 1;
1936                 }
1937               else
1938                 {
                       /* A rule byte: return it negated, as is.  */
1939                   *nbytes = src - src_base;
1940                   *nchars = consumed_chars;
1941                   return -c;
1942                 }
1943             }
1944           else
1945             goto invalid_code;
1946         }
1947
           /* Dispatch on the total length of the sequence led by C.  */
1948       switch (emacs_mule_bytes[c])
1949         {
1950         case 2:
1951           if ((charset_ID = emacs_mule_charset[c]) < 0)
1952             goto invalid_code;
1953           ONE_MORE_BYTE (c);
1954           if (c < 0xA0)
1955             goto invalid_code;
1956           code = c & 0x7F;
1957           break;
1958
1959         case 3:
               /* Either a private leading code, then the byte selecting
                  the charset, then one position code; or a leading code
                  followed by two position codes.  */
1960           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1961               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1962             {
1963               ONE_MORE_BYTE (c);
1964               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1965                 goto invalid_code;
1966               ONE_MORE_BYTE (c);
1967               if (c < 0xA0)
1968                 goto invalid_code;
1969               code = c & 0x7F;
1970             }
1971           else
1972             {
1973               if ((charset_ID = emacs_mule_charset[c]) < 0)
1974                 goto invalid_code;
1975               ONE_MORE_BYTE (c);
1976               if (c < 0xA0)
1977                 goto invalid_code;
1978               code = (c & 0x7F) << 8;
1979               ONE_MORE_BYTE (c);
1980               if (c < 0xA0)
1981                 goto invalid_code;
1982               code |= c & 0x7F;
1983             }
1984           break;
1985
1986         case 4:
               /* The byte after the leading code selects the charset;
                  two position codes follow.  */
1987           ONE_MORE_BYTE (c);
1988           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1989             goto invalid_code;
1990           ONE_MORE_BYTE (c);
1991           if (c < 0xA0)
1992             goto invalid_code;
1993           code = (c & 0x7F) << 8;
1994           ONE_MORE_BYTE (c);
1995           if (c < 0xA0)
1996             goto invalid_code;
1997           code |= c & 0x7F;
1998           break;
1999
2000         case 1:
2001           code = c;
2002           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2003           break;
2004
2005         default:
2006           emacs_abort ();
2007         }
2008       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2009                           CHARSET_FROM_ID (charset_ID), code, c);
2010       if (c < 0)
2011         goto invalid_code;
2012     }
2013   *nbytes = src - src_base;
2014   *nchars = consumed_chars;
2015   if (id)
2016     *id = charset_ID;
2017   return (mseq_found ? -c : c);
2018
2019  no_more_source:
2020   return -2;
2021
2022  invalid_code:
2023   return -1;
2024 }
2025
2026
2027 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2028
2029 /* Handle these composition sequences ('|': the end of header elements,
2030    BYTES and CHARS >= 0xA0):
2031
2032    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2033    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2034    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2035
2036    and these old form:
2037
2038    (4) relative composition: 0x80 | MSEQ ... MSEQ
2039    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2040
2041    When the starter 0x80 and the following header elements are found,
2042    this annotation header is produced.
2043
2044         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2045
2046    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2047    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2048
2049    Then, upon reading the following elements, these codes are produced
2050    until the composition end is found:
2051
2052    (1) CHAR ... CHAR
2053    (2) ALT ... ALT CHAR ... CHAR
2054    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2055    (4) CHAR ... CHAR
2056    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2057
2058    When the composition end is found, LENGTH and NCHARS in the
2059    annotation header are updated as below:
2060
2061    (1) LENGTH: unchanged, NCHARS: unchanged
2062    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2063    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2064    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2065    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2066
2067    If an error is found while composing, the annotation header is
2068    changed to the original composition header (plus filler -1s) as
2069    below:
2070
2071    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2072    (5)          [ 0x80 0xFF -1 -1 -1 ]
2073
2074    and the sequence [ -2 DECODED-RULE ] is changed to the original
2075    byte sequence as below:
2076         o the original byte sequence is B: [ B -1 ]
2077         o the original byte sequence is B1 B2: [ B1 B2 ]
2078
2079    Most of the routines are implemented by macros because many
2080    variables and labels in the caller decode_coding_emacs_mule must be
2081    accessible, and they are usually called just once (thus doesn't
2082    increase the size of compiled object).  */
2083
2084 /* Decode a composition rule represented by C as a component of
2085    composition sequence of Emacs 20 style.  Set RULE to the decoded
2086    rule.

        C is modified: 0xA0 is subtracted from it.  The result must be
        in the range 0..80, otherwise control jumps to the label
        `invalid_code' of the enclosing function.  It is split into
        GREF = C / 9 and NREF = C % 9, a value of 4 in either being
        replaced by 10, and the two are combined with
        COMPOSITION_ENCODE_RULE.  */
2087
2088 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2089   do {                                                  \
2090     int gref, nref;                                     \
2091                                                         \
2092     c -= 0xA0;                                          \
2093     if (c < 0 || c >= 81)                               \
2094       goto invalid_code;                                \
2095     gref = c / 9, nref = c % 9;                         \
2096     if (gref == 4) gref = 10;                           \
2097     if (nref == 4) nref = 10;                           \
2098     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2099   } while (0)
2100
2101
2102 /* Decode a composition rule represented by C and the following byte
2103    at SRC as a component of composition sequence of Emacs 21 style.
2104    Set RULE to the decoded rule.

        GREF is C - 0x20 and NREF is the next byte - 0x20; that byte is
        fetched with ONE_MORE_BYTE, which overwrites C.  Each must be in
        the range 0..80, otherwise control jumps to the label
        `invalid_code' of the enclosing function.  */
2105
2106 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2107   do {                                                  \
2108     int gref, nref;                                     \
2109                                                         \
2110     gref = c - 0x20;                                    \
2111     if (gref < 0 || gref >= 81)                         \
2112       goto invalid_code;                                \
2113     ONE_MORE_BYTE (c);                                  \
2114     nref = c - 0x20;                                    \
2115     if (nref < 0 || nref >= 81)                         \
2116       goto invalid_code;                                \
2117     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2118   } while (0)
2119
2120
2121 /* Start of Emacs 21 style format.  On entry C holds the method byte
2122    (METHOD + 0xF2); the next two bytes at SRC are (BYTES + 0xA0) and
2123    (CHARS + 0xA0), where BYTES is the byte length of this composition
2124    information and CHARS is the number of characters composed by this
        composition.

        Jump to the label `invalid_code' of the enclosing function
        unless BYTES >= 3 (and exactly 4 for a relative composition) and
        0 < CHARS < MAX_COMPOSITION_COMPONENTS.  Otherwise initialize
        CMP_STATUS for a new form composition (expecting characters for
        a relative composition, component characters for the other
        methods, with BYTES - 4 as the component count) and call
        ADD_COMPOSITION_DATA with CHARS, BYTES and METHOD.  */
2125
2126 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2127   do {                                                                  \
2128     enum composition_method method = c - 0xF2;                          \
2129     int nbytes, nchars;                                                 \
2130                                                                         \
2131     ONE_MORE_BYTE (c);                                                  \
2132     if (c < 0)                                                          \
2133       goto invalid_code;                                                \
2134     nbytes = c - 0xA0;                                                  \
2135     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2136       goto invalid_code;                                                \
2137     ONE_MORE_BYTE (c);                                                  \
2138     nchars = c - 0xA0;                                                  \
2139     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2140       goto invalid_code;                                                \
2141     cmp_status->old_form = 0;                                           \
2142     cmp_status->method = method;                                        \
2143     if (method == COMPOSITION_RELATIVE)                                 \
2144       cmp_status->state = COMPOSING_CHAR;                               \
2145     else                                                                \
2146       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2147     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2148     cmp_status->nchars = nchars;                                        \
2149     cmp_status->ncomps = nbytes - 4;                                    \
2150     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2151   } while (0)
2152
2153
2154 /* Start of Emacs 20 style format for relative composition.
        Initialize CMP_STATUS for an old form relative composition with
        no character collected yet, and call ADD_COMPOSITION_DATA with
        zero for both the character count and the byte count.  */
2155
2156 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2157   do {                                                          \
2158     cmp_status->old_form = 1;                                   \
2159     cmp_status->method = COMPOSITION_RELATIVE;                  \
2160     cmp_status->state = COMPOSING_CHAR;                         \
2161     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2162     cmp_status->nchars = cmp_status->ncomps = 0;                \
2163     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2164   } while (0)
2165
2166
2167 /* Start of Emacs 20 style format for rule-base composition.
        Same as DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION except that
        the method is COMPOSITION_WITH_RULE.  */
2168
2169 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2170   do {                                                          \
2171     cmp_status->old_form = 1;                                   \
2172     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2173     cmp_status->state = COMPOSING_CHAR;                         \
2174     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2175     cmp_status->nchars = cmp_status->ncomps = 0;                \
2176     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2177   } while (0)
2178
2179
     /* Start a composition; used by decode_coding_emacs_mule right after
        it has read the starter byte 0x80.  Read the next byte into C and
        dispatch on it:

          0xF2 + a method from COMPOSITION_RELATIVE to
          COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header;
          0xA0..0xBF otherwise: Emacs 20 style relative composition; the
          byte is the first component, so SRC is moved back to read it
          again;
          0xFF: Emacs 20 style rule-base composition.

        For a negative C or any other value, jump to the label
        `invalid_code' of the enclosing function.  */
2180 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2181   do {                                                  \
2182     const unsigned char *current_src = src;             \
2183                                                         \
2184     ONE_MORE_BYTE (c);                                  \
2185     if (c < 0)                                          \
2186       goto invalid_code;                                \
2187     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2188         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2189       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2190     else if (c < 0xA0)                                  \
2191       goto invalid_code;                                \
2192     else if (c < 0xC0)                                  \
2193       {                                                 \
2194         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2195         /* Re-read C as a composition component.  */    \
2196         src = current_src;                              \
2197       }                                                 \
2198     else if (c == 0xFF)                                 \
2199       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2200     else                                                \
2201       goto invalid_code;                                \
2202   } while (0)
2203
     /* Finish a composition that has been read completely.  The
        annotation header starts CMP_STATUS->length elements before
        CHARBUF.  For an old form composition, store the number of
        characters collected in the NCHARS slot (header element 2).  For
        a new form composition other than a relative one, set header
        element 0 to NCHARS minus the whole length.  Then set the state
        to COMPOSING_NO.  */
2204 #define EMACS_MULE_COMPOSITION_END()                            \
2205   do {                                                          \
2206     int idx = - cmp_status->length;                             \
2207                                                                 \
2208     if (cmp_status->old_form)                                   \
2209       charbuf[idx + 2] = cmp_status->nchars;                    \
2210     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2211       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2212     cmp_status->state = COMPOSING_NO;                           \
2213   } while (0)
2214
2215
     /* Close the composition described by CMP_STATUS before it has been
        completed normally.  CHARBUF points just past the last element
        stored for the composition, so its annotation header starts at
        CHARBUF[-CMP_STATUS->length].

        An old form composition that has already collected at least one
        character is kept: the NCHARS slot of the header is set to that
        count.  If it is rule-based and a character was expected next,
        the trailing [ -2 RULE ] pair is replaced by
        [ BYTE8_TO_CHAR (RULE + 0xA0) -1 ].

        Otherwise the composition is abandoned and the header is
        overwritten with the bytes that introduced it, as described in
        the comment on composition sequences above.

        In both cases the state becomes COMPOSING_NO.  The return value
        is what EMACS_MULE_MAYBE_FINISH_COMPOSITION adds to the caller's
        character offset.  */
2216 static int
2217 emacs_mule_finish_composition (int *charbuf,
2218                                struct composition_status *cmp_status)
2219 {
2220   int idx = - cmp_status->length;
2221   int new_chars;
2222
2223   if (cmp_status->old_form && cmp_status->nchars > 0)
2224     {
2225       charbuf[idx + 2] = cmp_status->nchars;
2226       new_chars = 0;
2227       if (cmp_status->method == COMPOSITION_WITH_RULE
2228           && cmp_status->state == COMPOSING_CHAR)
2229         {
2230           /* The last rule was invalid.  */
2231           int rule = charbuf[-1] + 0xA0;
2232
2233           charbuf[-2] = BYTE8_TO_CHAR (rule);
2234           charbuf[-1] = -1;
2235           new_chars = 1;
2236         }
2237     }
2238   else
2239     {
2240       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2241
2242       if (cmp_status->method == COMPOSITION_WITH_RULE)
2243         {
2244           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2245           charbuf[idx++] = -3;
2246           charbuf[idx++] = 0;
2247           new_chars = 1;
2248         }
2249       else
2250         {
               /* IDX now designates header element 1, so these read the
                  NCHARS and NBYTES slots (elements 2 and 3).  They must
                  be read before the stores below overwrite them.  */
2251           int nchars = charbuf[idx + 1] + 0xA0;
2252           int nbytes = charbuf[idx + 2] + 0xA0;
2253
2254           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2255           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2256           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2257           charbuf[idx++] = -1;
2258           new_chars = 4;
2259         }
2260     }
2261   cmp_status->state = COMPOSING_NO;
2262   return new_chars;
2263 }
2264
     /* If a composition is in progress, close it with
        emacs_mule_finish_composition and add the value it returns to
        CHAR_OFFSET.  Uses the variables CMP_STATUS, CHARBUF and
        CHAR_OFFSET of the enclosing function.  */
2265 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2266   do {                                                                    \
2267     if (cmp_status->state != COMPOSING_NO)                                \
2268       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2269   } while (0)
2270
2271
2272 static void
2273 decode_coding_emacs_mule (struct coding_system *coding)
2274 {
2275   const unsigned char *src = coding->source + coding->consumed;
2276   const unsigned char *src_end = coding->source + coding->src_bytes;
2277   const unsigned char *src_base;
2278   int *charbuf = coding->charbuf + coding->charbuf_used;
2279   /* We may produce two annotations (charset and composition) in one
2280      loop and one more charset annotation at the end.  */
2281   int *charbuf_end
2282     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2283       /* We can produce up to 2 characters in a loop.  */
2284       - 1;
2285   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2286   bool multibytep = coding->src_multibyte;
2287   ptrdiff_t char_offset = coding->produced_char;
2288   ptrdiff_t last_offset = char_offset;
2289   int last_id = charset_ascii;
2290   bool eol_dos
2291     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2292   int byte_after_cr = -1;
2293   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2294
2295   if (cmp_status->state != COMPOSING_NO)
2296     {
2297       int i;
2298
2299       if (charbuf_end - charbuf < cmp_status->length)
2300         emacs_abort ();
2301       for (i = 0; i < cmp_status->length; i++)
2302         *charbuf++ = cmp_status->carryover[i];
2303       coding->annotated = 1;
2304     }
2305
2306   while (1)
2307     {
2308       int c, id IF_LINT (= 0);
2309
2310       src_base = src;
2311       consumed_chars_base = consumed_chars;
2312
2313       if (charbuf >= charbuf_end)
2314         {
2315           if (byte_after_cr >= 0)
2316             src_base--;
2317           break;
2318         }
2319
2320       if (byte_after_cr >= 0)
2321         c = byte_after_cr, byte_after_cr = -1;
2322       else
2323         ONE_MORE_BYTE (c);
2324
2325       if (c < 0 || c == 0x80)
2326         {
2327           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2328           if (c < 0)
2329             {
2330               *charbuf++ = -c;
2331               char_offset++;
2332             }
2333           else
2334             DECODE_EMACS_MULE_COMPOSITION_START ();
2335           continue;
2336         }
2337
2338       if (c < 0x80)
2339         {
2340           if (eol_dos && c == '\r')
2341             ONE_MORE_BYTE (byte_after_cr);
2342           id = charset_ascii;
2343           if (cmp_status->state != COMPOSING_NO)
2344             {
2345               if (cmp_status->old_form)
2346                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2347               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2348                 cmp_status->ncomps--;
2349             }
2350         }
2351       else
2352         {
2353           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2354           /* emacs_mule_char can load a charset map from a file, which
2355              allocates a large structure and might cause buffer text
2356              to be relocated as result.  Thus, we need to remember the
2357              original pointer to buffer text, and fix up all related
2358              pointers after the call.  */
2359           const unsigned char *orig = coding->source;
2360           ptrdiff_t offset;
2361
2362           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2363                                cmp_status);
2364           offset = coding->source - orig;
2365           if (offset)
2366             {
2367               src += offset;
2368               src_base += offset;
2369               src_end += offset;
2370             }
2371           if (c < 0)
2372             {
2373               if (c == -1)
2374                 goto invalid_code;
2375               if (c == -2)
2376                 break;
2377             }
2378           src = src_base + nbytes;
2379           consumed_chars = consumed_chars_base + nchars;
2380           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2381             cmp_status->ncomps -= nchars;
2382         }
2383
2384       /* Now if C >= 0, we found a normally encoded character, if C <
2385          0, we found an old-style composition component character or
2386          rule.  */
2387
2388       if (cmp_status->state == COMPOSING_NO)
2389         {
2390           if (last_id != id)
2391             {
2392               if (last_id != charset_ascii)
2393                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2394                                   last_id);
2395               last_id = id;
2396               last_offset = char_offset;
2397             }
2398           *charbuf++ = c;
2399           char_offset++;
2400         }
2401       else if (cmp_status->state == COMPOSING_CHAR)
2402         {
2403           if (cmp_status->old_form)
2404             {
2405               if (c >= 0)
2406                 {
2407                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2408                   *charbuf++ = c;
2409                   char_offset++;
2410                 }
2411               else
2412                 {
2413                   *charbuf++ = -c;
2414                   cmp_status->nchars++;
2415                   cmp_status->length++;
2416                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2417                     EMACS_MULE_COMPOSITION_END ();
2418                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2419                     cmp_status->state = COMPOSING_RULE;
2420                 }
2421             }
2422           else
2423             {
2424               *charbuf++ = c;
2425               cmp_status->length++;
2426               cmp_status->nchars--;
2427               if (cmp_status->nchars == 0)
2428                 EMACS_MULE_COMPOSITION_END ();
2429             }
2430         }
2431       else if (cmp_status->state == COMPOSING_RULE)
2432         {
2433           int rule;
2434
2435           if (c >= 0)
2436             {
2437               EMACS_MULE_COMPOSITION_END ();
2438               *charbuf++ = c;
2439               char_offset++;
2440             }
2441           else
2442             {
2443               c = -c;
2444               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2445               if (rule < 0)
2446                 goto invalid_code;
2447               *charbuf++ = -2;
2448               *charbuf++ = rule;
2449               cmp_status->length += 2;
2450               cmp_status->state = COMPOSING_CHAR;
2451             }
2452         }
2453       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2454         {
2455           *charbuf++ = c;
2456           cmp_status->length++;
2457           if (cmp_status->ncomps == 0)
2458             cmp_status->state = COMPOSING_CHAR;
2459           else if (cmp_status->ncomps > 0)
2460             {
2461               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2462                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2463             }
2464           else
2465             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2466         }
2467       else                      /* COMPOSING_COMPONENT_RULE */
2468         {
2469           int rule;
2470
2471           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2472           if (rule < 0)
2473             goto invalid_code;
2474           *charbuf++ = -2;
2475           *charbuf++ = rule;
2476           cmp_status->length += 2;
2477           cmp_status->ncomps--;
2478           if (cmp_status->ncomps > 0)
2479             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2480           else
2481             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2482         }
2483       continue;
2484
2485     invalid_code:
2486       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2487       src = src_base;
2488       consumed_chars = consumed_chars_base;
2489       ONE_MORE_BYTE (c);
2490       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2491       char_offset++;
2492       coding->errors++;
2493     }
2494
2495  no_more_source:
2496   if (cmp_status->state != COMPOSING_NO)
2497     {
2498       if (coding->mode & CODING_MODE_LAST_BLOCK)
2499         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2500       else
2501         {
2502           int i;
2503
2504           charbuf -= cmp_status->length;
2505           for (i = 0; i < cmp_status->length; i++)
2506             cmp_status->carryover[i] = charbuf[i];
2507         }
2508     }
2509   if (last_id != charset_ascii)
2510     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2511   coding->consumed_char += consumed_chars_base;
2512   coding->consumed = src_base - coding->source;
2513   coding->charbuf_used = charbuf - coding->charbuf;
2514 }
2515
2516
/* Store in CODES[0] and CODES[1] the leading code(s) written before
   the position codes of a character whose charset has the emacs-mule
   id ID.

   An ID below 0xA0 is itself the only leading code, and CODES[1] is
   set to 0 to mean "no second leading code".  A larger ID is preceded
   by one extra byte chosen by range:

     0xA0..0xDF -> 0x9A ID
     0xE0..0xEF -> 0x9B ID
     0xF0..0xF4 -> 0x9C ID
     0xF5..     -> 0x9D ID

   ID is evaluated more than once, so it must not have side effects.
   The expansion is a single statement without a trailing semicolon;
   the caller supplies the semicolon, which keeps the macro usable as
   the body of an `if' that has an `else' branch.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2530
2531
/* Encode the characters stored in CODING->charbuf into the emacs-mule
   byte format.  Output is written to CODING->destination, starting
   after the CODING->produced bytes that are already there.

   A negative element of the charbuf starts an annotation.  Only the
   charset annotation has an effect here: it selects the charset that
   is tried first for the characters that follow.  The composition
   annotation is skipped.

   ASCII characters and raw 8-bit bytes are emitted as a single byte.
   Any other character is emitted as the leading code(s) of its
   charset followed by one or two position codes with the high bit
   set.

   When the loop finishes, CODING_RESULT_SUCCESS is recorded,
   CODING->produced_char and CODING->produced are updated, and 0 is
   returned.  */
2532 static bool
2533 encode_coding_emacs_mule (struct coding_system *coding)
2534 {
       /* NOTE(review): several of these locals are never mentioned by
          name below except through the EMIT_* and ASSURE_DESTINATION
          macros, which are defined elsewhere and presumably refer to
          them by name -- do not rename them without checking.  */
2535   bool multibytep = coding->dst_multibyte;
2536   int *charbuf = coding->charbuf;
2537   int *charbuf_end = charbuf + coding->charbuf_used;
2538   unsigned char *dst = coding->destination + coding->produced;
2539   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2540   int safe_room = 8;
2541   ptrdiff_t produced_chars = 0;
2542   Lisp_Object attrs, charset_list;
2543   int c;
       /* Charset id requested by the latest charset annotation, or -1
          when there is no usable request.  */
2544   int preferred_charset_id = -1;
2545
2546   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against the global emacs-mule charset list, and
          record that list in the coding system's attributes.  */
2547   if (! EQ (charset_list, Vemacs_mule_charset_list))
2548     {
2549       charset_list = Vemacs_mule_charset_list;
2550       ASET (attrs, coding_attr_charset_list, charset_list);
2551     }
2552
2553   while (charbuf < charbuf_end)
2554     {
           /* NOTE(review): presumably makes sure SAFE_ROOM more bytes
              fit in the destination before a character is emitted;
              the macro is defined elsewhere -- confirm.  */
2555       ASSURE_DESTINATION (safe_room);
2556       c = *charbuf++;
2557
2558       if (c < 0)
2559         {
2560           /* Handle an annotation.  */
2561           switch (*charbuf)
2562             {
2563             case CODING_ANNOTATE_COMPOSITION_MASK:
2564               /* Not yet implemented.  */
2565               break;
2566             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): CHARBUF points at the mask word
                      here, and the skip below advances by only -C - 1
                      words, so with a 4-word charset annotation index
                      3 would lie past its end.  Confirm the index
                      against the layout written by ADD_CHARSET_DATA.  */
2567               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not in the
                      emacs-mule charset list.  */
2568               if (preferred_charset_id >= 0
2569                   && NILP (Fmemq (make_number (preferred_charset_id),
2570                                   charset_list)))
2571                 preferred_charset_id = -1;
2572               break;
2573             default:
2574               emacs_abort ();
2575             }
               /* Skip the remaining words of the annotation: -C words in
                  total, one of which was consumed when C was read.  */
2576           charbuf += -c - 1;
2577           continue;
2578         }
2579
2580       if (ASCII_CHAR_P (c))
2581         EMIT_ONE_ASCII_BYTE (c);
2582       else if (CHAR_BYTE8_P (c))
2583         {
2584           c = CHAR_TO_BYTE8 (c);
2585           EMIT_ONE_BYTE (c);
2586         }
2587       else
2588         {
2589           struct charset *charset;
2590           unsigned code;
2591           int dimension;
2592           int emacs_mule_id;
2593           unsigned char leading_codes[2];
2594
               /* Try the preferred charset first; fall back to a search
                  of the whole charset list.  */
2595           if (preferred_charset_id >= 0)
2596             {
2597               bool result;
2598
2599               charset = CHARSET_FROM_ID (preferred_charset_id);
2600               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2601               if (result)
2602                 code = ENCODE_CHAR (charset, c);
2603               else
2604                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2605                                      &code, charset);
2606             }
2607           else
2608             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2609                                  &code, charset);
               /* No charset was found for C: encode the coding system's
                  default character in its place.  */
2610           if (! charset)
2611             {
2612               c = coding->default_char;
2613               if (ASCII_CHAR_P (c))
2614                 {
2615                   EMIT_ONE_ASCII_BYTE (c);
2616                   continue;
2617                 }
                   /* NOTE(review): CHARSET is dereferenced below without
                      being re-checked after this second lookup --
                      confirm that it cannot fail for the default
                      character.  */
2618               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2619                                    &code, charset);
2620             }
2621           dimension = CHARSET_DIMENSION (charset);
2622           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2623           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2624           EMIT_ONE_BYTE (leading_codes[0]);
               /* A zero second leading code means there is only one.  */
2625           if (leading_codes[1])
2626             EMIT_ONE_BYTE (leading_codes[1]);
               /* Position codes are written with the high bit set: one
                  byte for a dimension-1 charset, otherwise two bytes,
                  high byte first.  */
2627           if (dimension == 1)
2628             EMIT_ONE_BYTE (code | 0x80);
2629           else
2630             {
2631               code |= 0x8080;
2632               EMIT_ONE_BYTE (code >> 8);
2633               EMIT_ONE_BYTE (code & 0xFF);
2634             }
2635         }
2636     }
2637   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2638   coding->produced_char += produced_chars;
2639   coding->produced = dst - coding->destination;
2640   return 0;
2641 }
2642
2643 \f
2644 /*** 7. ISO2022 handlers ***/
2645
2646 /* The following note describes the coding system ISO2022 briefly.
2647    Since the intention of this note is to help understand the
2648    functions in this file, some parts are NOT ACCURATE or are OVERLY
2649    SIMPLIFIED.  For thorough understanding, please refer to the
2650    original document of ISO2022.  This is equivalent to the standard
2651    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2652
2653    ISO2022 provides many mechanisms to encode several character sets
2654    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2655    is encoded using bytes less than 128.  This may make the encoded
2656    text a little bit longer, but the text passes more easily through
2657    several types of gateway, some of which strip off the MSB (Most
2658    Significant Bit).
2659
2660    There are two kinds of character sets: control character sets and
2661    graphic character sets.  The former contain control characters such
2662    as `newline' and `escape' to provide control functions (control
2663    functions are also provided by escape sequences).  The latter
2664    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2665    two control character sets and many graphic character sets.
2666
2667    Graphic character sets are classified into one of the following
2668    four classes, according to the number of bytes (DIMENSION) and
2669    number of characters in one dimension (CHARS) of the set:
2670    - DIMENSION1_CHARS94
2671    - DIMENSION1_CHARS96
2672    - DIMENSION2_CHARS94
2673    - DIMENSION2_CHARS96
2674
2675    In addition, each character set is assigned an identification tag,
2676    unique for each set, called the "final character" (denoted as <F>
2677    hereafter).  The <F> of each character set is decided by ECMA(*)
2678    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2679    (0x30..0x3F are for private use only).
2680
2681    Note (*): ECMA = European Computer Manufacturers Association
2682
2683    Here are examples of graphic character sets [NAME(<F>)]:
2684         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2685         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2686         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2687         o DIMENSION2_CHARS96 -- none for the moment
2688
2689    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2690         C0 [0x00..0x1F] -- control character plane 0
2691         GL [0x20..0x7F] -- graphic character plane 0
2692         C1 [0x80..0x9F] -- control character plane 1
2693         GR [0xA0..0xFF] -- graphic character plane 1
2694
2695    A control character set is directly designated and invoked to C0 or
2696    C1 by an escape sequence.  The most common case is that:
2697    - ISO646's  control character set is designated/invoked to C0, and
2698    - ISO6429's control character set is designated/invoked to C1,
2699    and usually these designations/invocations are omitted in encoded
2700    text.  In a 7-bit environment, only C0 can be used, and a control
2701    character for C1 is encoded by an appropriate escape sequence to
2702    fit into the environment.  All control characters for C1 are
2703    defined to have corresponding escape sequences.
2704
2705    A graphic character set is at first designated to one of four
2706    graphic registers (G0 through G3), then these graphic registers are
2707    invoked to GL or GR.  These designations and invocations can be
2708    done independently.  The most common case is that G0 is invoked to
2709    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2710    these invocations and designations are omitted in encoded text.
2711    In a 7-bit environment, only GL can be used.
2712
2713    When a graphic character set of CHARS94 is invoked to GL, codes
2714    0x20 and 0x7F of the GL area work as control characters SPACE and
2715    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2716    be used.
2717
2718    There are two ways of invocation: locking-shift and single-shift.
2719    With locking-shift, the invocation lasts until the next different
2720    invocation, whereas with single-shift, the invocation affects the
2721    following character only and doesn't affect the locking-shift
2722    state.  Invocations are done by the following control characters or
2723    escape sequences:
2724
2725    ----------------------------------------------------------------------
2726    abbrev  function                  cntrl escape seq   description
2727    ----------------------------------------------------------------------
2728    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2729    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2730    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2731    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2732    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2733    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2734    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2735    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2736    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2737    ----------------------------------------------------------------------
2738    (*) These are not used by any known coding system.
2739
2740    Control characters for these functions are defined by macros
2741    ISO_CODE_XXX in `coding.h'.
2742
2743    Designations are done by the following escape sequences:
2744    ----------------------------------------------------------------------
2745    escape sequence      description
2746    ----------------------------------------------------------------------
2747    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2748    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2749    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2750    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2751    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2752    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2753    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2754    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2755    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2756    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2757    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2758    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2759    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2760    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2761    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2762    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2763    ----------------------------------------------------------------------
2764
2765    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2766    of dimension 1, chars 94, and final character <F>, etc...
2767
2768    Note (*): Although these designations are not allowed in ISO2022,
2769    Emacs accepts them on decoding, and produces them on encoding
2770    CHARS96 character sets in a coding system which is characterized as
2771    7-bit environment, non-locking-shift, and non-single-shift.
2772
2773    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2774    '(' must be omitted.  We refer to this as "short-form" hereafter.
2775
2776    Now you may notice that there are a lot of ways of encoding the
2777    same multilingual text in ISO2022.  Actually, there exist many
2778    coding systems such as Compound Text (used in X11's inter-client
2779    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2780    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2781    localized platforms), and all of these are variants of ISO2022.
2782
2783    In addition to the above, Emacs handles two more kinds of escape
2784    sequences: ISO6429's direction specification and Emacs' private
2785    sequence for specifying character composition.
2786
2787    ISO6429's direction specification takes the following form:
2788         o CSI ']'      -- end of the current direction
2789         o CSI '0' ']'  -- end of the current direction
2790         o CSI '1' ']'  -- start of left-to-right text
2791         o CSI '2' ']'  -- start of right-to-left text
2792    The control character CSI (0x9B: control sequence introducer) is
2793    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2794
2795    Character composition specification takes the following form:
2796         o ESC '0' -- start relative composition
2797         o ESC '1' -- end composition
2798         o ESC '2' -- start rule-base composition (*)
2799         o ESC '3' -- start relative composition with alternate chars  (**)
2800         o ESC '4' -- start rule-base composition with alternate chars  (**)
2801   Since these are not standard escape sequences of any ISO standard,
2802   the use of them with these meanings is restricted to Emacs only.
2803
2804   (*) This form is used only in Emacs 20.7 and older versions,
2805   but newer versions can safely decode it.
2806   (**) This form is used only in Emacs 21.1 and newer versions,
2807   and older versions can't decode it.
2808
2809   Here's a list of example usages of these composition escape
2810   sequences (categorized by `enum composition_method').
2811
2812   COMPOSITION_RELATIVE:
2813         ESC 0 CHAR [ CHAR ] ESC 1
2814   COMPOSITION_WITH_RULE:
2815         ESC 2 CHAR [ RULE CHAR ] ESC 1
2816   COMPOSITION_WITH_ALTCHARS:
2817         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2818   COMPOSITION_WITH_RULE_ALTCHARS:
2819         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2820
/* Table with one `enum iso_code_class_type' entry for each possible
   byte value (0..255).  NOTE(review): it is not referenced in the
   nearby code; presumably it classifies bytes for the ISO-2022
   handlers -- confirm at its users.  */
2821 static enum iso_code_class_type iso_code_class[256];
2822
/* Nonzero if the charset whose id is ID is marked safe in CODING,
   i.e. ID does not exceed CODING->max_charset_id and the byte
   CODING->safe_charsets[ID] is not 255.  255 is the value with which
   setup_iso_safe_charsets fills the table before it assigns
   registers.

   ID is evaluated twice and there is no lower-bound check, so the
   caller must pass a non-negative ID without side effects.  */
2823 #define SAFE_CHARSET_P(coding, id)      \
2824   ((id) <= (coding)->max_charset_id     \
2825    && (coding)->safe_charsets[id] != 255)
2826
2827 static void
2828 setup_iso_safe_charsets (Lisp_Object attrs)
2829 {
2830   Lisp_Object charset_list, safe_charsets;
2831   Lisp_Object request;
2832   Lisp_Object reg_usage;
2833   Lisp_Object tail;
2834   EMACS_INT reg94, reg96;
2835   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2836   int max_charset_id;
2837
2838   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2839   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2840       && ! EQ (charset_list, Viso_2022_charset_list))
2841     {
2842       charset_list = Viso_2022_charset_list;
2843       ASET (attrs, coding_attr_charset_list, charset_list);
2844       ASET (attrs, coding_attr_safe_charsets, Qnil);
2845     }
2846
2847   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2848     return;
2849
2850   max_charset_id = 0;
2851   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2852     {
2853       int id = XINT (XCAR (tail));
2854       if (max_charset_id < id)
2855         max_charset_id = id;
2856     }
2857
2858   safe_charsets = make_uninit_string (max_charset_id + 1);
2859   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2860   request = AREF (attrs, coding_attr_iso_request);
2861   reg_usage = AREF (attrs, coding_attr_iso_usage);
2862   reg94 = XINT (XCAR (reg_usage));
2863   reg96 = XINT (XCDR (reg_usage));
2864
2865   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2866     {
2867       Lisp_Object id;
2868       Lisp_Object reg;
2869       struct charset *charset;
2870
2871       id = XCAR (tail);
2872       charset = CHARSET_FROM_ID (XINT (id));
2873       reg = Fcdr (Fassq (id, request));
2874       if (! NILP (reg))
2875         SSET (safe_charsets, XINT (id), XINT (reg));
2876       else if (charset->iso_chars_96)
2877         {
2878           if (reg96 < 4)
2879             SSET (safe_charsets, XINT (id), reg96);
2880         }
2881       else
2882         {
2883           if (reg94 < 4)
2884             SSET (safe_charsets, XINT (id), reg94);
2885         }
2886     }
2887   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2888 }
2889
2890
2891 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2892    Return true if a text is encoded in one of ISO-2022 based coding
2893    systems.

        The bytes of CODING->source are examined one at a time, starting
        after the first CODING->head_ascii bytes.  Escape sequences,
        shift codes and 8-bit bytes add ISO category masks to the local
        sets FOUND and REJECTED.

        If every ISO category becomes rejected, the scan stops,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise control reaches the label `no_more_source',
        where REJECTED and (FOUND & ~REJECTED) are merged into
        DETECT_INFO and 1 is returned.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere.  It presumably
        reads through the locals SRC, SRC_END, CONSUMED_CHARS and
        MULTIBYTEP and jumps to `no_more_source' at the end of the
        input -- confirm before renaming or reordering anything.  */
2894
2895 static bool
2896 detect_coding_iso_2022 (struct coding_system *coding,
2897                         struct coding_detection_info *detect_info)
2898 {
2899   const unsigned char *src = coding->source, *src_base = src;
2900   const unsigned char *src_end = coding->source + coding->src_bytes;
2901   bool multibytep = coding->src_multibyte;
2902   bool single_shifting = 0;
2903   int id;
2904   int c, c1;
2905   ptrdiff_t consumed_chars = 0;
2906   int i;
2907   int rejected = 0;
2908   int found = 0;
       /* -1 while outside a composition; otherwise the number of
          components counted since the composition start sequence.  */
2909   int composition_count = -1;
2910
2911   detect_info->checked |= CATEGORY_MASK_ISO;
2912
       /* Refresh the safe-charset table of the coding system assigned
          to each ISO category, so that SAFE_CHARSET_P can be used on
          them below.  */
2913   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2914     {
2915       struct coding_system *this = &(coding_categories[i]);
2916       Lisp_Object attrs, val;
2917
2918       if (this->id < 0)
2919         continue;
2920       attrs = CODING_ID_ATTRS (this->id);
2921       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2922           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2923         setup_iso_safe_charsets (attrs);
2924       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2925       this->max_charset_id = SCHARS (val) - 1;
2926       this->safe_charsets = SDATA (val);
2927     }
2928
2929   /* A coding system of this category is always ASCII compatible.  */
2930   src += coding->head_ascii;
2931
2932   while (rejected != CATEGORY_MASK_ISO)
2933     {
2934       src_base = src;
2935       ONE_MORE_BYTE (c);
2936       switch (c)
2937         {
2938         case ISO_CODE_ESC:
2939           if (inhibit_iso_escape_detection)
2940             break;
2941           single_shifting = 0;
2942           ONE_MORE_BYTE (c);
2943           if (c == 'N' || c == 'O')
2944             {
2945               /* ESC <Fe> for SS2 or SS3.  */
2946               single_shifting = 1;
2947               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2948             }
2949           else if (c == '1')
2950             {
2951               /* End of composition.  */
2952               if (composition_count < 0
2953                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2954                 /* Invalid */
2955                 break;
2956               composition_count = -1;
2957               found |= CATEGORY_MASK_ISO;
2958             }
2959           else if (c >= '0' && c <= '4')
2960             {
2961               /* ESC <Fp> for start/end composition.  */
2962               composition_count = 0;
2963             }
2964           else
2965             {
2966               if (c >= '(' && c <= '/')
2967                 {
2968                   /* Designation sequence for a charset of dimension 1.  */
2969                   ONE_MORE_BYTE (c1);
2970                   if (c1 < ' ' || c1 >= 0x80
2971                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2972                     /* Invalid designation sequence.  Just ignore.  */
2973                     break;
2974                 }
2975               else if (c == '$')
2976                 {
2977                   /* Designation sequence for a charset of dimension 2.  */
2978                   ONE_MORE_BYTE (c);
2979                   if (c >= '@' && c <= 'B')
2980                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other branches, ID is
                            not checked for being negative here before it
                            reaches SAFE_CHARSET_P below -- confirm that
                            these three table entries are always valid.  */
2981                     id = iso_charset_table[1][0][c];
2982                   else if (c >= '(' && c <= '/')
2983                     {
2984                       ONE_MORE_BYTE (c1);
2985                       if (c1 < ' ' || c1 >= 0x80
2986                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2987                         /* Invalid designation sequence.  Just ignore.  */
2988                         break;
2989                     }
2990                   else
2991                     /* Invalid designation sequence.  Just ignore it.  */
2992                     break;
2993                 }
2994               else
2995                 {
2996                   /* Invalid escape sequence.  Just ignore it.  */
2997                   break;
2998                 }
2999
3000               /* We found a valid designation sequence for CHARSET.  */
                   /* Each 7-bit category and the 8-bit "else" category is
                      found if the designated charset is safe for it and
                      rejected otherwise.  */
3001               rejected |= CATEGORY_MASK_ISO_8BIT;
3002               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3003                                   id))
3004                 found |= CATEGORY_MASK_ISO_7;
3005               else
3006                 rejected |= CATEGORY_MASK_ISO_7;
3007               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3008                                   id))
3009                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3010               else
3011                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3012               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3013                                   id))
3014                 found |= CATEGORY_MASK_ISO_7_ELSE;
3015               else
3016                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3017               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3018                                   id))
3019                 found |= CATEGORY_MASK_ISO_8_ELSE;
3020               else
3021                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3022             }
3023           break;
3024
3025         case ISO_CODE_SO:
3026         case ISO_CODE_SI:
3027           /* Locking shift out/in.  */
3028           if (inhibit_iso_escape_detection)
3029             break;
3030           single_shifting = 0;
3031           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3032           break;
3033
3034         case ISO_CODE_CSI:
3035           /* Control sequence introducer.  */
3036           single_shifting = 0;
3037           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3038           found |= CATEGORY_MASK_ISO_8_ELSE;
               /* The label is inside the SS2/SS3 case below.  */
3039           goto check_extra_latin;
3040
3041         case ISO_CODE_SS2:
3042         case ISO_CODE_SS3:
3043           /* Single shift.   */
3044           if (inhibit_iso_escape_detection)
3045             break;
3046           single_shifting = 0;
3047           rejected |= CATEGORY_MASK_ISO_7BIT;
3048           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3049               & CODING_ISO_FLAG_SINGLE_SHIFT)
3050             {
3051               found |= CATEGORY_MASK_ISO_8_1;
3052               single_shifting = 1;
3053             }
3054           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3055               & CODING_ISO_FLAG_SINGLE_SHIFT)
3056             {
3057               found |= CATEGORY_MASK_ISO_8_2;
3058               single_shifting = 1;
3059             }
3060           if (single_shifting)
3061             break;
3062         check_extra_latin:
               /* The byte is acceptable only if Vlatin_extra_code_table
                  has a non-nil entry for it; otherwise every ISO
                  category is rejected, which ends the scan.  */
3063           if (! VECTORP (Vlatin_extra_code_table)
3064               || NILP (AREF (Vlatin_extra_code_table, c)))
3065             {
3066               rejected = CATEGORY_MASK_ISO;
3067               break;
3068             }
3069           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3070               & CODING_ISO_FLAG_LATIN_EXTRA)
3071             found |= CATEGORY_MASK_ISO_8_1;
3072           else
3073             rejected |= CATEGORY_MASK_ISO_8_1;
3074           rejected |= CATEGORY_MASK_ISO_8_2;
3075           break;
3076
3077         default:
3078           if (c < 0)
3079             continue;
3080           if (c < 0x80)
3081             {
3082               if (composition_count >= 0)
3083                 composition_count++;
3084               single_shifting = 0;
3085               break;
3086             }
3087           if (c >= 0xA0)
3088             {
3089               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3090               found |= CATEGORY_MASK_ISO_8_1;
3091               /* Check the length of succeeding codes of the range
3092                  0xA0..0xFF.  If the byte length is even, we include
3093                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3094                  only when we are not single shifting.  */
3095               if (! single_shifting
3096                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3097                 {
3098                   int len = 1;
3099                   while (src < src_end)
3100                     {
3101                       src_base = src;
3102                       ONE_MORE_BYTE (c);
3103                       if (c < 0xA0)
3104                         {
                               /* Put the terminating byte back so that
                                  the outer loop reads it again.  */
3105                           src = src_base;
3106                           break;
3107                         }
3108                       len++;
3109                     }
3110
                       /* An odd-length run rejects the two-byte category
                          only when the run did not reach the end of the
                          source.  */
3111                   if (len & 1 && src < src_end)
3112                     {
3113                       rejected |= CATEGORY_MASK_ISO_8_2;
3114                       if (composition_count >= 0)
3115                         composition_count += len;
3116                     }
3117                   else
3118                     {
3119                       found |= CATEGORY_MASK_ISO_8_2;
3120                       if (composition_count >= 0)
3121                         composition_count += len / 2;
3122                     }
3123                 }
3124               break;
3125             }
3126         }
3127     }
       /* The loop ended because every ISO category was rejected.  */
3128   detect_info->rejected |= CATEGORY_MASK_ISO;
3129   return 0;
3130
3131  no_more_source:
       /* Report what was rejected, and what was found but not also
          rejected.  */
3132   detect_info->rejected |= rejected;
3133   detect_info->found |= (found & ~rejected);
3134   return 1;
3135 }
3136
3137
3138 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3139    escape sequence should be kept.

        REG selects the designation slot of CODING that is updated.  DIM,
        CHARS_96 and FINAL are passed to ISO_CHARSET_TABLE to look up the
        charset id.

        If FINAL is outside '0'..127, the lookup yields a negative id, or
        the charset is not safe for CODING, the slot is set to -2,
        CHARS_96 is set to -1 and nothing else is done.  Otherwise the
        slot is set to the charset id, with two substitutions:
        charset_jisx0201_roman becomes charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978
        becomes charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        The macro refers to a variable named `coding' at the place of
        expansion, assigns to CHARS_96 (which therefore must be an
        lvalue), evaluates FINAL more than once, and declares the locals
        `id' and `prev', so arguments must not use those two names.  */
3140 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3141   do {                                                                  \
3142     int id, prev;                                                       \
3143                                                                         \
3144     if (final < '0' || final >= 128                                     \
3145         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3146         || !SAFE_CHARSET_P (coding, id))                                \
3147       {                                                                 \
3148         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3149         chars_96 = -1;                                                  \
3150         break;                                                          \
3151       }                                                                 \
3152     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3153     if (id == charset_jisx0201_roman)                                   \
3154       {                                                                 \
3155         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3156           id = charset_ascii;                                           \
3157       }                                                                 \
3158     else if (id == charset_jisx0208_1978)                               \
3159       {                                                                 \
3160         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3161           id = charset_jisx0208;                                        \
3162       }                                                                 \
3163     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3164     /* If there was an invalid designation to REG previously, and this  \
3165        designation is ASCII to REG, we should keep this designation     \
3166        sequence.  */                                                    \
3167     if (prev == -2 && id == charset_ascii)                              \
3168       chars_96 = -1;                                                    \
3169   } while (0)
3170
3171
3172 /* Handle these composition sequences (ALT: alternate char):
3173
3174    (1) relative composition: ESC 0 CHAR ... ESC 1
3175    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3176    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3177    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3178
3179    When the start sequence (ESC 0/2/3/4) is found, this annotation
3180    header is produced.
3181
3182         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3183
3184    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3185    produced until the end sequence (ESC 1) is found:
3186
3187    (1) CHAR ... CHAR
3188    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3189    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3190    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3191
3192    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3193    annotation header are updated as below:
3194
3195    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3196    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3197    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3198    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3199
3200    If an error is found while composing, the annotation header is
3201    changed to:
3202
3203         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3204
3205    and the sequence [ -2 DECODED-RULE ] is changed to the original
3206    byte sequence as below:
3207         o the original byte sequence is B: [ B -1 ]
3208         o the original byte sequence is B1 B2: [ B1 B2 ]
3209    and the sequence [ -1 -1 ] is changed to the original byte
3210    sequence:
3211         [ ESC '0' ]
3212 */
3213
3214 /* Decode a composition rule C1 and maybe one more byte from the
3215    source, and set RULE to the encoded composition rule.  If the rule
3216    is invalid, goto invalid_code.

        C1 is the variable `c1' and invalid_code a label of the expanding
        function; RULE must be an int lvalue.  The second byte is fetched
        with ONE_MORE_BYTE (defined elsewhere), so whatever that macro
        requires of the expanding scope applies here too.

        With N = C1 - 32:
          N < 0        invalid.
          0 <= N < 81  old one-byte format: N is GREF * 9 + NREF, where a
                       GREF or NREF of 4 stands for 10.
          N >= 81      new two-byte format: GREF is N - 81 and NREF is the
                       next byte minus 32; the pair is checked with
                       COMPOSITION_ENCODE_RULE_VALID.  0x100 is added to
                       the result, which is how ENCODE_COMPOSITION_RULE
                       later tells the two formats apart.  */
3217
3218 #define DECODE_COMPOSITION_RULE(rule)                                   \
3219   do {                                                                  \
3220     rule = c1 - 32;                                                     \
3221     if (rule < 0)                                                       \
3222       goto invalid_code;                                                \
3223     if (rule < 81)              /* old format (before ver.21) */        \
3224       {                                                                 \
3225         int gref = (rule) / 9;                                          \
3226         int nref = (rule) % 9;                                          \
3227         if (gref == 4) gref = 10;                                       \
3228         if (nref == 4) nref = 10;                                       \
3229         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3230       }                                                                 \
3231     else                        /* new format (after ver.21) */         \
3232       {                                                                 \
3233         int b;                                                          \
3234                                                                         \
3235         ONE_MORE_BYTE (b);                                              \
3236         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3237           goto invalid_code;                                            \
3238         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3239         rule += 0x100;   /* Distinguish it from the old format.  */     \
3240       }                                                                 \
3241   } while (0)
3242
     /* Turn the encoded composition rule RULE back into the byte(s) of
        the source form and store them in the two slots charbuf[idx] and
        charbuf[idx + 1].  This is presumably the exact inverse of
        DECODE_COMPOSITION_RULE, assuming COMPOSITION_ENCODE_RULE (GREF,
        NREF) is GREF * 12 + NREF -- confirm against its definition.

        `charbuf', `idx' and `new_chars' are variables of the expanding
        function.  RULE is expanded unparenthesized several times, but
        every use is evaluated before the stores, so it may be
        charbuf[idx + 1] itself.

        In both formats GREF is (RULE % 0x100) / 12 and NREF is
        (RULE % 0x100) % 12.
          RULE < 0x100 (old format): one byte 32 + GREF * 9 + NREF, with
            a GREF or NREF of 10 written as 4; the second slot is set to
            -1 and new_chars is incremented by 1.
          Otherwise (new format): two bytes 32 + 81 + GREF and 32 + NREF;
            new_chars is incremented by 2.  */
3243 #define ENCODE_COMPOSITION_RULE(rule)                           \
3244   do {                                                          \
3245     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3246                                                                 \
3247     if (rule < 0x100)           /* old format */                \
3248       {                                                         \
3249         if (gref == 10) gref = 4;                               \
3250         if (nref == 10) nref = 4;                               \
3251         charbuf[idx] = 32 + gref * 9 + nref;                    \
3252         charbuf[idx + 1] = -1;                                  \
3253         new_chars++;                                            \
3254       }                                                         \
3255     else                                /* new format */        \
3256       {                                                         \
3257         charbuf[idx] = 32 + 81 + gref;                          \
3258         charbuf[idx + 1] = 32 + nref;                           \
3259         new_chars += 2;                                         \
3260       }                                                         \
3261   } while (0)
3262
3263 /* Finish the current composition as invalid.

        CHARBUF points just past the last element stored for the
        composition, so the composition data, annotation header first,
        occupies CHARBUF[-CMP_STATUS->length] ... CHARBUF[-1].  The data
        is rewritten in place so that it stands for the original byte
        sequence again:

          o the five header slots become [ ESC DIGIT -2 0 -1 ], where
            DIGIT is the character that selects CMP_STATUS->method in a
            composition start sequence;
          o for methods from COMPOSITION_WITH_RULE upwards, each
            [ -2 RULE ] pair is re-encoded by ENCODE_COMPOSITION_RULE, and
            each -1 found at an element position is replaced, together
            with the slot after it, by [ ESC '0' ].

        CMP_STATUS->state is set to COMPOSING_NO.  Return
        CMP_STATUS->nchars plus the number of bytes restored for rules
        and for [ ESC '0' ] pairs.  */
3264
3265 static int
3266 finish_composition (int *charbuf, struct composition_status *cmp_status)
3267 {
3268   int idx = - cmp_status->length;  /* Index of the annotation header.  */
3269   int new_chars;
3270
3271   /* Recover the original ESC sequence */
3272   charbuf[idx++] = ISO_CODE_ESC;
3273   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3274                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3275                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3276                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3277                     : '4');
       /* The three remaining header slots.  */
3278   charbuf[idx++] = -2;
3279   charbuf[idx++] = 0;
3280   charbuf[idx++] = -1;
3281   new_chars = cmp_status->nchars;
       /* A relative composition holds only characters after the header,
          so there is nothing more to convert for it.  */
3282   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3283     for (; idx < 0; idx++)
3284       {
3285         int elt = charbuf[idx];
3286
3287         if (elt == -2)
3288           {
               /* [ -2 RULE ]: the macro overwrites both slots and counts
                  the restored byte(s) in new_chars; the extra increment
                  skips the second slot.  */
3289             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3290             idx++;
3291           }
3292         else if (elt == -1)
3293           {
               /* [ -1 -1 ]: put back the ESC 0 for which
                  DECODE_COMPOSITION_START stored this pair.  */
3294             charbuf[idx++] = ISO_CODE_ESC;
3295             charbuf[idx] = '0';
3296             new_chars += 2;
3297           }
3298       }
3299   cmp_status->state = COMPOSING_NO;
3300   return new_chars;
3301 }
3302
3303 /* If characters are under composition, finish the composition as
        invalid with finish_composition and add the count it returns to
        char_offset.  `cmp_status', `charbuf' and `char_offset' are
        variables of the expanding function.  */
3304 #define MAYBE_FINISH_COMPOSITION()                              \
3305   do {                                                          \
3306     if (cmp_status->state != COMPOSING_NO)                      \
3307       char_offset += finish_composition (charbuf, cmp_status);  \
3308   } while (0)
3309
3310 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3311
3312    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3313    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3314    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3315    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3316
3317    Produce this annotation sequence now:
3318
3319    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        (Five elements, as in the comment on composition sequences above
        and as the five header stores of finish_composition imply; the
        header itself is written by ADD_COMPOSITION_DATA, defined
        elsewhere -- confirm there.)

        C1 is the byte that followed ESC.  `cmp_status', `charbuf',
        `char_offset' and `coding' are variables of the expanding
        function.

        An ESC 0 that arrives while the alternate chars of an ESC 3 or
        ESC 4 composition are being read does not start a new
        composition: it only stores the separator [ -1 -1 ] and switches
        to reading the composed CHARs.  For ESC 4 this is accepted only
        in state COMPOSING_COMPONENT_RULE, i.e. where a RULE could come
        next.

        In every other case a composition still in progress is finished
        as invalid first, and then a new one is started: ESC 0 and ESC 2
        begin in state COMPOSING_CHAR, ESC 3 and ESC 4 in state
        COMPOSING_COMPONENT_CHAR.
3320 */
3321
3322 #define DECODE_COMPOSITION_START(c1)                                       \
3323   do {                                                                     \
3324     if (c1 == '0'                                                          \
3325         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3326              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3327             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3328                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3329       {                                                                    \
3330         *charbuf++ = -1;                                                   \
3331         *charbuf++= -1;                                                    \
3332         cmp_status->state = COMPOSING_CHAR;                                \
3333         cmp_status->length += 2;                                           \
3334       }                                                                    \
3335     else                                                                   \
3336       {                                                                    \
3337         MAYBE_FINISH_COMPOSITION ();                                       \
3338         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3339                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3340                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3341                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3342         cmp_status->state                                                  \
3343           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3344         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3345         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3346         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3347         coding->annotated = 1;                                             \
3348       }                                                                    \
3349   } while (0)
3350
3351
3352 /* Handle composition end sequence ESC 1.

        `cmp_status', `charbuf' and `char_offset' are variables and
        invalid_code is a label of the expanding function.

        The sequence is rejected when no composed CHAR has been stored,
        or when the state does not fit the method: with
        COMPOSITION_WITH_RULE the state must not be COMPOSING_CHAR (a
        CHAR, not a RULE, must be the last thing stored), with any other
        method it must be COMPOSING_CHAR.  On rejection the composition
        is finished as invalid and control goes to invalid_code.

        Otherwise the annotation header at charbuf[-cmp_status->length]
        is completed.  Its first slot holds the negated LENGTH, so the
        subtractions below enlarge LENGTH: by NCOMPS + 2 for
        COMPOSITION_WITH_ALTCHARS (the ALTs and the [ -1 -1 ] separator)
        and by NCOMPS * 3 for COMPOSITION_WITH_RULE_ALTCHARS (the ALTs,
        a two-slot rule after each ALT but the last, and the separator).
        The third slot receives NCHARS.  Finally NCHARS is added to
        char_offset and the state becomes COMPOSING_NO.  */
3353
3354 #define DECODE_COMPOSITION_END()                                        \
3355   do {                                                                  \
3356     if (cmp_status->nchars == 0                                         \
3357         || ((cmp_status->state == COMPOSING_CHAR)                       \
3358             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3359       {                                                                 \
3360         MAYBE_FINISH_COMPOSITION ();                                    \
3361         goto invalid_code;                                              \
3362       }                                                                 \
3363     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3364       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3365     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3366       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3367     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3368     char_offset += cmp_status->nchars;                                  \
3369     cmp_status->state = COMPOSING_NO;                                   \
3370   } while (0)
3371
3372 /* Store a composition rule RULE in charbuf, and update cmp_status.

        The rule is stored as the pair [ -2 RULE ], so the recorded
        length grows by 2.  The state is then decremented; together with
        the increment in STORE_COMPOSITION_CHAR this presumably moves
        from a ..._RULE state back to the matching ..._CHAR state, which
        relies on the order of the state enumerators declared elsewhere
        -- confirm there.  `charbuf' and `cmp_status' are variables of
        the expanding function.  */
3373
3374 #define STORE_COMPOSITION_RULE(rule)    \
3375   do {                                  \
3376     *charbuf++ = -2;                    \
3377     *charbuf++ = rule;                  \
3378     cmp_status->length += 2;            \
3379     cmp_status->state--;                \
3380   } while (0)
3381
3382 /* Store a composed char or a component char C in charbuf, and update
3383    cmp_status.

        The recorded length grows by 1.  In state COMPOSING_CHAR the char
        counts as a composed char (nchars), in any other state as a
        component (ncomps).

        When a RULE may follow -- always with COMPOSITION_WITH_RULE, and
        with COMPOSITION_WITH_RULE_ALTCHARS only while in state
        COMPOSING_COMPONENT_CHAR -- the state is incremented, presumably
        to the matching ..._RULE state; this relies on the order of the
        state enumerators declared elsewhere -- confirm there.
        `charbuf' and `cmp_status' are variables of the expanding
        function.  */
3384
3385 #define STORE_COMPOSITION_CHAR(c)                                       \
3386   do {                                                                  \
3387     *charbuf++ = (c);                                                   \
3388     cmp_status->length++;                                               \
3389     if (cmp_status->state == COMPOSING_CHAR)                            \
3390       cmp_status->nchars++;                                             \
3391     else                                                                \
3392       cmp_status->ncomps++;                                             \
3393     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3394         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3395             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3396       cmp_status->state++;                                              \
3397   } while (0)
3398
3399
3400 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3401
3402 static void
3403 decode_coding_iso_2022 (struct coding_system *coding)
3404 {
3405   const unsigned char *src = coding->source + coding->consumed;
3406   const unsigned char *src_end = coding->source + coding->src_bytes;
3407   const unsigned char *src_base;
3408   int *charbuf = coding->charbuf + coding->charbuf_used;
3409   /* We may produce two annotations (charset and composition) in one
3410      loop and one more charset annotation at the end.  */
3411   int *charbuf_end
3412     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3413   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3414   bool multibytep = coding->src_multibyte;
3415   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3416   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3417   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3418   int charset_id_2, charset_id_3;
3419   struct charset *charset;
3420   int c;
3421   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3422   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3423   ptrdiff_t char_offset = coding->produced_char;
3424   ptrdiff_t last_offset = char_offset;
3425   int last_id = charset_ascii;
3426   bool eol_dos
3427     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3428   int byte_after_cr = -1;
3429   int i;
3430
3431   setup_iso_safe_charsets (attrs);
3432   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3433
3434   if (cmp_status->state != COMPOSING_NO)
3435     {
3436       if (charbuf_end - charbuf < cmp_status->length)
3437         emacs_abort ();
3438       for (i = 0; i < cmp_status->length; i++)
3439         *charbuf++ = cmp_status->carryover[i];
3440       coding->annotated = 1;
3441     }
3442
3443   while (1)
3444     {
3445       int c1, c2, c3;
3446
3447       src_base = src;
3448       consumed_chars_base = consumed_chars;
3449
3450       if (charbuf >= charbuf_end)
3451         {
3452           if (byte_after_cr >= 0)
3453             src_base--;
3454           break;
3455         }
3456
3457       if (byte_after_cr >= 0)
3458         c1 = byte_after_cr, byte_after_cr = -1;
3459       else
3460         ONE_MORE_BYTE (c1);
3461       if (c1 < 0)
3462         goto invalid_code;
3463
3464       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3465         {
3466           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3467           char_offset++;
3468           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3469           continue;
3470         }
3471
3472       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3473         {
3474           if (c1 == ISO_CODE_ESC)
3475             {
3476               if (src + 1 >= src_end)
3477                 goto no_more_source;
3478               *charbuf++ = ISO_CODE_ESC;
3479               char_offset++;
3480               if (src[0] == '%' && src[1] == '@')
3481                 {
3482                   src += 2;
3483                   consumed_chars += 2;
3484                   char_offset += 2;
3485                   /* We are sure charbuf can contain two more chars. */
3486                   *charbuf++ = '%';
3487                   *charbuf++ = '@';
3488                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3489                 }
3490             }
3491           else
3492             {
3493               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3494               char_offset++;
3495             }
3496           continue;
3497         }
3498
3499       if ((cmp_status->state == COMPOSING_RULE
3500            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3501           && c1 != ISO_CODE_ESC)
3502         {
3503           int rule;
3504
3505           DECODE_COMPOSITION_RULE (rule);
3506           STORE_COMPOSITION_RULE (rule);
3507           continue;
3508         }
3509
3510       /* We produce at most one character.  */
3511       switch (iso_code_class [c1])
3512         {
3513         case ISO_0x20_or_0x7F:
3514           if (charset_id_0 < 0
3515               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3516             /* This is SPACE or DEL.  */
3517             charset = CHARSET_FROM_ID (charset_ascii);
3518           else
3519             charset = CHARSET_FROM_ID (charset_id_0);
3520           break;
3521
3522         case ISO_graphic_plane_0:
3523           if (charset_id_0 < 0)
3524             charset = CHARSET_FROM_ID (charset_ascii);
3525           else
3526             charset = CHARSET_FROM_ID (charset_id_0);
3527           break;
3528
3529         case ISO_0xA0_or_0xFF:
3530           if (charset_id_1 < 0
3531               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3532               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3533             goto invalid_code;
3534           /* This is a graphic character, we fall down ... */
3535
3536         case ISO_graphic_plane_1:
3537           if (charset_id_1 < 0)
3538             goto invalid_code;
3539           charset = CHARSET_FROM_ID (charset_id_1);
3540           break;
3541
3542         case ISO_control_0:
3543           if (eol_dos && c1 == '\r')
3544             ONE_MORE_BYTE (byte_after_cr);
3545           MAYBE_FINISH_COMPOSITION ();
3546           charset = CHARSET_FROM_ID (charset_ascii);
3547           break;
3548
3549         case ISO_control_1:
3550           goto invalid_code;
3551
3552         case ISO_shift_out:
3553           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3554               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3555             goto invalid_code;
3556           CODING_ISO_INVOCATION (coding, 0) = 1;
3557           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3558           continue;
3559
3560         case ISO_shift_in:
3561           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3562             goto invalid_code;
3563           CODING_ISO_INVOCATION (coding, 0) = 0;
3564           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3565           continue;
3566
3567         case ISO_single_shift_2_7:
3568           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3569             goto invalid_code;
3570         case ISO_single_shift_2:
3571           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3572             goto invalid_code;
3573           /* SS2 is handled as an escape sequence of ESC 'N' */
3574           c1 = 'N';
3575           goto label_escape_sequence;
3576
3577         case ISO_single_shift_3:
3578           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3579             goto invalid_code;
3580           /* SS3 is handled as an escape sequence of ESC 'O' */
3581           c1 = 'O';
3582           goto label_escape_sequence;
3583
3584         case ISO_control_sequence_introducer:
3585           /* CSI is handled as an escape sequence of ESC '[' ...  */
3586           c1 = '[';
3587           goto label_escape_sequence;
3588
3589         case ISO_escape:
3590           ONE_MORE_BYTE (c1);
3591         label_escape_sequence:
3592           /* Escape sequences handled here are invocation,
3593              designation, direction specification, and character
3594              composition specification.  */
3595           switch (c1)
3596             {
3597             case '&':           /* revision of following character set */
3598               ONE_MORE_BYTE (c1);
3599               if (!(c1 >= '@' && c1 <= '~'))
3600                 goto invalid_code;
3601               ONE_MORE_BYTE (c1);
3602               if (c1 != ISO_CODE_ESC)
3603                 goto invalid_code;
3604               ONE_MORE_BYTE (c1);
3605               goto label_escape_sequence;
3606
3607             case '$':           /* designation of 2-byte character set */
3608               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3609                 goto invalid_code;
3610               {
3611                 int reg, chars96;
3612
3613                 ONE_MORE_BYTE (c1);
3614                 if (c1 >= '@' && c1 <= 'B')
3615                   {     /* designation of JISX0208.1978, GB2312.1980,
3616                            or JISX0208.1980 */
3617                     reg = 0, chars96 = 0;
3618                   }
3619                 else if (c1 >= 0x28 && c1 <= 0x2B)
3620                   { /* designation of DIMENSION2_CHARS94 character set */
3621                     reg = c1 - 0x28, chars96 = 0;
3622                     ONE_MORE_BYTE (c1);
3623                   }
3624                 else if (c1 >= 0x2C && c1 <= 0x2F)
3625                   { /* designation of DIMENSION2_CHARS96 character set */
3626                     reg = c1 - 0x2C, chars96 = 1;
3627                     ONE_MORE_BYTE (c1);
3628                   }
3629                 else
3630                   goto invalid_code;
3631                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3632                 /* We must update these variables now.  */
3633                 if (reg == 0)
3634                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3635                 else if (reg == 1)
3636                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3637                 if (chars96 < 0)
3638                   goto invalid_code;
3639               }
3640               continue;
3641
3642             case 'n':           /* invocation of locking-shift-2 */
3643               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3644                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3645                 goto invalid_code;
3646               CODING_ISO_INVOCATION (coding, 0) = 2;
3647               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3648               continue;
3649
3650             case 'o':           /* invocation of locking-shift-3 */
3651               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3652                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3653                 goto invalid_code;
3654               CODING_ISO_INVOCATION (coding, 0) = 3;
3655               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3656               continue;
3657
3658             case 'N':           /* invocation of single-shift-2 */
3659               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3660                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3661                 goto invalid_code;
3662               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3663               if (charset_id_2 < 0)
3664                 charset = CHARSET_FROM_ID (charset_ascii);
3665               else
3666                 charset = CHARSET_FROM_ID (charset_id_2);
3667               ONE_MORE_BYTE (c1);
3668               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3669                 goto invalid_code;
3670               break;
3671
3672             case 'O':           /* invocation of single-shift-3 */
3673               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3674                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3675                 goto invalid_code;
3676               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3677               if (charset_id_3 < 0)
3678                 charset = CHARSET_FROM_ID (charset_ascii);
3679               else
3680                 charset = CHARSET_FROM_ID (charset_id_3);
3681               ONE_MORE_BYTE (c1);
3682               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3683                 goto invalid_code;
3684               break;
3685
3686             case '0': case '2': case '3': case '4': /* start composition */
3687               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3688                 goto invalid_code;
3689               if (last_id != charset_ascii)
3690                 {
3691                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3692                   last_id = charset_ascii;
3693                   last_offset = char_offset;
3694                 }
3695               DECODE_COMPOSITION_START (c1);
3696               continue;
3697
3698             case '1':           /* end composition */
3699               if (cmp_status->state == COMPOSING_NO)
3700                 goto invalid_code;
3701               DECODE_COMPOSITION_END ();
3702               continue;
3703
3704             case '[':           /* specification of direction */
3705               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3706                 goto invalid_code;
3707               /* For the moment, nested direction is not supported.
3708                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3709                  left-to-right, and nonzero means right-to-left.  */
3710               ONE_MORE_BYTE (c1);
3711               switch (c1)
3712                 {
3713                 case ']':       /* end of the current direction */
3714                   coding->mode &= ~CODING_MODE_DIRECTION;
3715
3716                 case '0':       /* end of the current direction */
3717                 case '1':       /* start of left-to-right direction */
3718                   ONE_MORE_BYTE (c1);
3719                   if (c1 == ']')
3720                     coding->mode &= ~CODING_MODE_DIRECTION;
3721                   else
3722                     goto invalid_code;
3723                   break;
3724
3725                 case '2':       /* start of right-to-left direction */
3726                   ONE_MORE_BYTE (c1);
3727                   if (c1 == ']')
3728                     coding->mode |= CODING_MODE_DIRECTION;
3729                   else
3730                     goto invalid_code;
3731                   break;
3732
3733                 default:
3734                   goto invalid_code;
3735                 }
3736               continue;
3737
3738             case '%':
3739               ONE_MORE_BYTE (c1);
3740               if (c1 == '/')
3741                 {
3742                   /* CTEXT extended segment:
3743                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3744                      We keep these bytes as is for the moment.
3745                      They may be decoded by post-read-conversion.  */
3746                   int dim, M, L;
3747                   int size;
3748
3749                   ONE_MORE_BYTE (dim);
3750                   if (dim < '0' || dim > '4')
3751                     goto invalid_code;
3752                   ONE_MORE_BYTE (M);
3753                   if (M < 128)
3754                     goto invalid_code;
3755                   ONE_MORE_BYTE (L);
3756                   if (L < 128)
3757                     goto invalid_code;
3758                   size = ((M - 128) * 128) + (L - 128);
3759                   if (charbuf + 6 > charbuf_end)
3760                     goto break_loop;
3761                   *charbuf++ = ISO_CODE_ESC;
3762                   *charbuf++ = '%';
3763                   *charbuf++ = '/';
3764                   *charbuf++ = dim;
3765                   *charbuf++ = BYTE8_TO_CHAR (M);
3766                   *charbuf++ = BYTE8_TO_CHAR (L);
3767                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3768                 }
3769               else if (c1 == 'G')
3770                 {
3771                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3772                      ESC % G --UTF-8-BYTES-- ESC % @
3773                      We keep these bytes as is for the moment.
3774                      They may be decoded by post-read-conversion.  */
3775                   if (charbuf + 3 > charbuf_end)
3776                     goto break_loop;
3777                   *charbuf++ = ISO_CODE_ESC;
3778                   *charbuf++ = '%';
3779                   *charbuf++ = 'G';
3780                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3781                 }
3782               else
3783                 goto invalid_code;
3784               continue;
3785               break;
3786
3787             default:
3788               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3789                 goto invalid_code;
3790               {
3791                 int reg, chars96;
3792
3793                 if (c1 >= 0x28 && c1 <= 0x2B)
3794                   { /* designation of DIMENSION1_CHARS94 character set */
3795                     reg = c1 - 0x28, chars96 = 0;
3796                     ONE_MORE_BYTE (c1);
3797                   }
3798                 else if (c1 >= 0x2C && c1 <= 0x2F)
3799                   { /* designation of DIMENSION1_CHARS96 character set */
3800                     reg = c1 - 0x2C, chars96 = 1;
3801                     ONE_MORE_BYTE (c1);
3802                   }
3803                 else
3804                   goto invalid_code;
3805                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3806                 /* We must update these variables now.  */
3807                 if (reg == 0)
3808                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3809                 else if (reg == 1)
3810                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3811                 if (chars96 < 0)
3812                   goto invalid_code;
3813               }
3814               continue;
3815             }
3816           break;
3817
3818         default:
3819           emacs_abort ();
3820         }
3821
3822       if (cmp_status->state == COMPOSING_NO
3823           && charset->id != charset_ascii
3824           && last_id != charset->id)
3825         {
3826           if (last_id != charset_ascii)
3827             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3828           last_id = charset->id;
3829           last_offset = char_offset;
3830         }
3831
3832       /* Now we know CHARSET and 1st position code C1 of a character.
3833          Produce a decoded character while getting 2nd and 3rd
3834          position codes C2, C3 if necessary.  */
3835       if (CHARSET_DIMENSION (charset) > 1)
3836         {
3837           ONE_MORE_BYTE (c2);
3838           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3839               || ((c1 & 0x80) != (c2 & 0x80)))
3840             /* C2 is not in a valid range.  */
3841             goto invalid_code;
3842           if (CHARSET_DIMENSION (charset) == 2)
3843             c1 = (c1 << 8) | c2;
3844           else
3845             {
3846               ONE_MORE_BYTE (c3);
3847               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3848                   || ((c1 & 0x80) != (c3 & 0x80)))
3849                 /* C3 is not in a valid range.  */
3850                 goto invalid_code;
3851               c1 = (c1 << 16) | (c2 << 8) | c2;
3852             }
3853         }
3854       c1 &= 0x7F7F7F;
3855       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3856       if (c < 0)
3857         {
3858           MAYBE_FINISH_COMPOSITION ();
3859           for (; src_base < src; src_base++, char_offset++)
3860             {
3861               if (ASCII_BYTE_P (*src_base))
3862                 *charbuf++ = *src_base;
3863               else
3864                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3865             }
3866         }
3867       else if (cmp_status->state == COMPOSING_NO)
3868         {
3869           *charbuf++ = c;
3870           char_offset++;
3871         }
3872       else if ((cmp_status->state == COMPOSING_CHAR
3873                 ? cmp_status->nchars
3874                 : cmp_status->ncomps)
3875                >= MAX_COMPOSITION_COMPONENTS)
3876         {
3877           /* Too long composition.  */
3878           MAYBE_FINISH_COMPOSITION ();
3879           *charbuf++ = c;
3880           char_offset++;
3881         }
3882       else
3883         STORE_COMPOSITION_CHAR (c);
3884       continue;
3885
3886     invalid_code:
3887       MAYBE_FINISH_COMPOSITION ();
3888       src = src_base;
3889       consumed_chars = consumed_chars_base;
3890       ONE_MORE_BYTE (c);
3891       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3892       char_offset++;
3893       coding->errors++;
3894       continue;
3895
3896     break_loop:
3897       break;
3898     }
3899
3900  no_more_source:
3901   if (cmp_status->state != COMPOSING_NO)
3902     {
3903       if (coding->mode & CODING_MODE_LAST_BLOCK)
3904         MAYBE_FINISH_COMPOSITION ();
3905       else
3906         {
3907           charbuf -= cmp_status->length;
3908           for (i = 0; i < cmp_status->length; i++)
3909             cmp_status->carryover[i] = charbuf[i];
3910         }
3911     }
3912   else if (last_id != charset_ascii)
3913     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3914   coding->consumed_char += consumed_chars_base;
3915   coding->consumed = src_base - coding->source;
3916   coding->charbuf_used = charbuf - coding->charbuf;
3917 }
3918
3919
3920 /* ISO2022 encoding stuff.  */
3921
3922 /*
3923    Just saying "ISO2022" is not enough for encoding; we have to
3924    specify more details.  In Emacs, each coding system of the ISO2022
3925    variant has the following specifications:
3926         1. Initial designation to G0 thru G3.
3927         2. Allows short-form designation?
3928         3. ASCII should be designated to G0 before control characters?
3929         4. ASCII should be designated to G0 at end of line?
3930         5. 7-bit environment or 8-bit environment?
3931         6. Use locking-shift?
3932         7. Use Single-shift?
3933    And the following two are only for Japanese:
3934         8. Use ASCII in place of JIS0201-1976-Roman?
3935         9. Use JISX0208-1983 in place of JISX0208-1978?
3936    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3937    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3938    details.
3939 */
3940
3941 /* Produce codes (escape sequence) for designating CHARSET to graphic
3942    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3943    '@', 'A', or 'B' and the coding system CODING allows, produce
3944    designation sequence of short-form.

        The codes are produced in this order:
          1. ESC '&' and the byte '@' + revision, but only when CODING has
             CODING_ISO_FLAG_REVISION set and CHARSET_ISO_REVISION
             (CHARSET) is not negative.
          2. ESC.
          3. When CHARSET_DIMENSION (CHARSET) is 1: the intermediate
             character selected by REG, taken from "()*+" if CHARSET is
             not a 96-character set and from ",-./" if it is.
             Otherwise: '$', followed by the intermediate character
             selected by REG in the same way.  For a set that is not a
             96-character set this intermediate character is left out
             (the short form) unless CODING_ISO_FLAG_LONG_FORM is set,
             REG is not 0, or the final character is outside '@'..'B'.
          4. The final character of CHARSET.

        Finally CHARSET_ID (CHARSET) is stored as the designation of REG
        in CODING.  REG indexes 4-character strings, so it must be in the
        range 0..3.  CHARSET, REG and CODING are each evaluated several
        times.  The EMIT_* macros are defined elsewhere; presumably they
        need further variables (such as DST) in the expanding scope --
        TODO confirm against their definitions.  */
3945
3946 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3947   do {                                                                  \
3948     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3949     const char *intermediate_char_94 = "()*+";                          \
3950     const char *intermediate_char_96 = ",-./";                          \
3951     int revision = -1;                                                  \
3952                                                                         \
3953     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3954       revision = CHARSET_ISO_REVISION (charset);                        \
3955                                                                         \
3956     if (revision >= 0)                                                  \
3957       {                                                                 \
3958         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3959         EMIT_ONE_BYTE ('@' + revision);                                 \
3960       }                                                                 \
3961     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3962     if (CHARSET_DIMENSION (charset) == 1)                               \
3963       {                                                                 \
3964         int b;                                                          \
3965         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3966           b = intermediate_char_94[reg];                                \
3967         else                                                            \
3968           b = intermediate_char_96[reg];                                \
3969         EMIT_ONE_ASCII_BYTE (b);                                        \
3970       }                                                                 \
3971     else                                                                \
3972       {                                                                 \
3973         EMIT_ONE_ASCII_BYTE ('$');                                      \
3974         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3975           {                                                             \
3976             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3977                 || reg != 0                                             \
3978                 || final_char < '@' || final_char > 'B')                \
3979               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3980           }                                                             \
3981         else                                                            \
3982           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3983       }                                                                 \
3984     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3985                                                                         \
3986     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3987   } while (0)
3988
3989
3990 /* The following two macros produce codes (control character or escape
3991    sequence) for ISO2022 single-shift functions (single-shift-2 and
3992    single-shift-3).

        When CODING_ISO_FLAG_SEVEN_BITS is set in CODING_ISO_FLAGS (coding),
        the two-byte escape sequence is produced (ESC 'N' for
        single-shift-2, ESC 'O' for single-shift-3); otherwise the single
        byte ISO_CODE_SS2 or ISO_CODE_SS3 is produced.  In both cases
        CODING_ISO_SINGLE_SHIFTING (coding) is then set to 1.

        Both macros take no arguments and refer to a variable named
        coding, which must be visible where they are expanded.  */
3993
3994 #define ENCODE_SINGLE_SHIFT_2                                           \
3995   do {                                                                  \
3996     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3997       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3998     else                                                                \
3999       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4000     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4001   } while (0)
4002
4003
4004 #define ENCODE_SINGLE_SHIFT_3                                           \
4005   do { /* Single-shift-3: ESC 'O' if 7-bit, else ISO_CODE_SS3.  */      \
4006     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4007       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4008     else                                                                \
4009       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4010     CODING_ISO_SINGLE_SHIFTING (coding) = 1; /* shift now pending */    \
4011   } while (0)
4012
4013
4014 /* The following four macros produce codes (control character or
4015    escape sequence) for ISO2022 locking-shift functions (shift-in,
4016    shift-out, locking-shift-2, and locking-shift-3).

        Besides producing the code, each macro stores in
        CODING_ISO_INVOCATION (coding, 0) the number of the graphic
        register it has just invoked: 0 for shift-in, 1 for shift-out,
        2 and 3 for the two locking shifts.  They take no arguments and
        refer to a variable named coding in the expanding scope.  */
4017
4018 #define ENCODE_SHIFT_IN                                 \
4019   do {                                                  \
4020     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4021     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4022   } while (0)
4023
4024
4025 #define ENCODE_SHIFT_OUT                                \
4026   do { /* Emit SO; register 1 is now on plane 0.  */    \
4027     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4028     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4029   } while (0)
4030
4031
4032 #define ENCODE_LOCKING_SHIFT_2                          \
4033   do { /* Emit ESC 'n'; register 2 now on plane 0.  */  \
4034     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4035     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4036   } while (0)
4037
4038
4039 #define ENCODE_LOCKING_SHIFT_3                          \
4040   do { /* Locking-shift-3 is ESC 'o'; ESC 'n' is LS2.  */ \
4041     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4042     CODING_ISO_INVOCATION (coding, 0) = 3; /* reg 3 on plane 0 */ \
4043   } while (0)
4044
4045
4046 /* Produce codes for a DIMENSION1 character whose character set is
4047    CHARSET and whose position-code is C1.  Designation and invocation
4048    sequences are also produced in advance if necessary.

        If CODING_ISO_FLAG_USE_ROMAN is set and CHARSET is the ASCII
        charset, CHARSET is first replaced by the charset whose ID is
        charset_jisx0201_roman.  Because of that assignment CHARSET must
        be an lvalue.

        The body is a loop that is left with `break' once the byte has
        been produced:
          - if a single shift is pending, C1 is produced as C1 & 0x7F
            (7-bit coding system) or C1 | 0x80, and the pending flag is
            cleared;
          - if CHARSET is the charset invoked to graphic plane 0, C1 is
            produced as C1 & 0x7F;
          - if CHARSET is the charset invoked to graphic plane 1, C1 is
            produced as C1 | 0x80;
          - otherwise encode_invocation_designation is called and the
            loop runs again.

        The macro refers to variables named coding, dst and
        produced_chars in the expanding scope.  */
4049
4050 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4051   do {                                                                  \
4052     int id = CHARSET_ID (charset);                                      \
4053                                                                         \
4054     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4055         && id == charset_ascii)                                         \
4056       {                                                                 \
4057         id = charset_jisx0201_roman;                                    \
4058         charset = CHARSET_FROM_ID (id);                                 \
4059       }                                                                 \
4060                                                                         \
4061     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4062       {                                                                 \
4063         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4064           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4065         else                                                            \
4066           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4067         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4068         break;                                                          \
4069       }                                                                 \
4070     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4071       {                                                                 \
4072         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4073         break;                                                          \
4074       }                                                                 \
4075     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4076       {                                                                 \
4077         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4078         break;                                                          \
4079       }                                                                 \
4080     else                                                                \
4081       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4082          must invoke it, or, at first, designate it to some graphic     \
4083          register.  Then repeat the loop to actually produce the        \
4084          character.  */                                                 \
4085       dst = encode_invocation_designation (charset, coding, dst,        \
4086                                            &produced_chars);            \
4087   } while (1)
4088
4089
4090 /* Produce codes for a DIMENSION2 character whose character set is
4091    CHARSET and whose position-codes are C1 and C2.  Designation and
4092    invocation codes are also produced in advance if necessary.

        If CODING_ISO_FLAG_USE_OLDJIS is set and CHARSET is the charset
        whose ID is charset_jisx0208, CHARSET is first replaced by the
        charset whose ID is charset_jisx0208_1978.  Because of that
        assignment CHARSET must be an lvalue.

        The body is a loop that is left with `break' once the two bytes
        have been produced:
          - if a single shift is pending, both codes are produced with
            the high bit cleared (7-bit coding system) or set, and the
            pending flag is cleared;
          - if CHARSET is the charset invoked to graphic plane 0, both
            codes are produced with the high bit cleared;
          - if CHARSET is the charset invoked to graphic plane 1, both
            codes are produced with the high bit set;
          - otherwise encode_invocation_designation is called and the
            loop runs again.

        The macro refers to variables named coding, dst and
        produced_chars in the expanding scope.  */
4093
4094 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4095   do {                                                                  \
4096     int id = CHARSET_ID (charset);                                      \
4097                                                                         \
4098     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4099         && id == charset_jisx0208)                                      \
4100       {                                                                 \
4101         id = charset_jisx0208_1978;                                     \
4102         charset = CHARSET_FROM_ID (id);                                 \
4103       }                                                                 \
4104                                                                         \
4105     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4106       {                                                                 \
4107         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4108           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4109         else                                                            \
4110           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4111         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4112         break;                                                          \
4113       }                                                                 \
4114     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4115       {                                                                 \
4116         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4117         break;                                                          \
4118       }                                                                 \
4119     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4120       {                                                                 \
4121         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4122         break;                                                          \
4123       }                                                                 \
4124     else                                                                \
4125       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4126          must invoke it, or, at first, designate it to some graphic     \
4127          register.  Then repeat the loop to actually produce the        \
4128          character.  */                                                 \
4129       dst = encode_invocation_designation (charset, coding, dst,        \
4130                                            &produced_chars);            \
4131   } while (1)
4132
4133
4134 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4135   do { /* CODE is filled in by CODING_ENCODE_CHAR from CHARSET and C.  */  \
4136     unsigned code;                                                         \
4137     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4138                                                                            \
4139     if (CHARSET_DIMENSION (charset) == 1)                                  \
4140       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4141     else /* C1 is CODE above its low byte, C2 is the low byte.  */         \
4142       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4143   } while (0)
4144
4145
4146 /* Produce designation and invocation codes at a place pointed by DST
4147    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4148    Return new DST.

        If CHARSET is not designated to any of the four graphic
        registers, it is designated to the register that
        CODING_ISO_REQUEST gives for it, or to register 0 when that value
        is negative.  Then, unless that register is already the one
        invoked to graphic plane 0 or 1, it is invoked: shift-in for
        register 0, shift-out for register 1, and for registers 2 and 3 a
        single shift when CODING_ISO_FLAG_SINGLE_SHIFT is set, a locking
        shift otherwise.

        *P_NCHARS is copied into the local produced_chars on entry and
        written back on return; presumably the emitting macros are what
        update that local in between -- TODO confirm against their
        definitions.  */
4149
4150 static unsigned char *
4151 encode_invocation_designation (struct charset *charset,
4152                                struct coding_system *coding,
4153                                unsigned char *dst, ptrdiff_t *p_nchars)
4154 {
       /* Not referenced directly in this function; presumably read by
          the emitting macros expanded below -- TODO confirm.  */
4155   bool multibytep = coding->dst_multibyte;
4156   ptrdiff_t produced_chars = *p_nchars;
4157   int reg;                      /* graphic register number */
4158   int id = CHARSET_ID (charset);
4159
4160   /* At first, check designations.  */
4161   for (reg = 0; reg < 4; reg++)
4162     if (id == CODING_ISO_DESIGNATION (coding, reg))
4163       break;
4164
       /* REG is 4 here when the loop above found no register holding
          CHARSET.  */
4165   if (reg >= 4)
4166     {
4167       /* CHARSET is not yet designated to any graphic registers.  */
4168       /* At first check the requested designation.  */
4169       reg = CODING_ISO_REQUEST (coding, id);
4170       if (reg < 0)
4171         /* Since CHARSET requests no special designation, designate it
4172            to graphic register 0.  */
4173         reg = 0;
4174
4175       ENCODE_DESIGNATION (charset, reg, coding);
4176     }
4177
4178   if (CODING_ISO_INVOCATION (coding, 0) != reg
4179       && CODING_ISO_INVOCATION (coding, 1) != reg)
4180     {
4181       /* Since the graphic register REG is not invoked to any graphic
4182          planes, invoke it to graphic plane 0.  */
4183       switch (reg)
4184         {
4185         case 0:                 /* graphic register 0 */
4186           ENCODE_SHIFT_IN;
4187           break;
4188
4189         case 1:                 /* graphic register 1 */
4190           ENCODE_SHIFT_OUT;
4191           break;
4192
4193         case 2:                 /* graphic register 2 */
4194           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4195             ENCODE_SINGLE_SHIFT_2;
4196           else
4197             ENCODE_LOCKING_SHIFT_2;
4198           break;
4199
4200         case 3:                 /* graphic register 3 */
4201           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4202             ENCODE_SINGLE_SHIFT_3;
4203           else
4204             ENCODE_LOCKING_SHIFT_3;
4205           break;
4206         }
4207     }
4208
       /* Hand the character count back to the caller.  */
4209   *p_nchars = produced_chars;
4210   return dst;
4211 }
4212
4213
4214 /* Produce codes for designation and invocation to reset the graphic
4215    planes and registers to initial state.

        First shift-in is produced if the register invoked to graphic
        plane 0 is not register 0.  Then, for each of the four graphic
        registers that has an initial charset (CODING_ISO_INITIAL is not
        negative) different from its current designation, a designation
        sequence for that initial charset is produced.

        The macro takes no arguments and refers to a variable named
        coding in the expanding scope.  */
4216 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4217   do {                                                                  \
4218     int reg;                                                            \
4219     struct charset *charset;                                            \
4220                                                                         \
4221     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4222       ENCODE_SHIFT_IN;                                                  \
4223     for (reg = 0; reg < 4; reg++)                                       \
4224       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4225           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4226               != CODING_ISO_INITIAL (coding, reg)))                     \
4227         {                                                               \
4228           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4229           ENCODE_DESIGNATION (charset, reg, coding);                    \
4230         }                                                               \
4231   } while (0)
4232
4233
4234 /* Produce designation sequences of charsets in the line started from
4235    CHARBUF to a place pointed by DST, and return the number of
4236    produced bytes.  DST should not directly point to a buffer text area
4237    which may be relocated by char_charset call.
4238
4239    If the current block ends before any end-of-line, we may fail to
4240    find all the necessary designations.

        The elements from CHARBUF up to CHARBUF_END are scanned until a
        newline is met or all four graphic registers have a candidate.
        For each character, the register requested for its charset
        (CODING_ISO_REQUEST) is looked up; the first charset seen for a
        register wins.  Afterwards a designation sequence is produced for
        every register whose candidate differs from the charset currently
        designated to it in CODING.  */
4241
4242 static ptrdiff_t
4243 encode_designation_at_bol (struct coding_system *coding,
4244                            int *charbuf, int *charbuf_end,
4245                            unsigned char *dst)
4246 {
4247   unsigned char *orig = dst;
4248   struct charset *charset;
4249   /* Table of charsets to be designated to each graphic register.  */
4250   int r[4];
4251   int c, found = 0, reg;
       /* The next two locals are not referenced directly below;
          presumably the emitting macros inside ENCODE_DESIGNATION use
          them -- TODO confirm.  */
4252   ptrdiff_t produced_chars = 0;
4253   bool multibytep = coding->dst_multibyte;
4254   Lisp_Object attrs;
4255   Lisp_Object charset_list;
4256
4257   attrs = CODING_ID_ATTRS (coding->id);
4258   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4259   if (EQ (charset_list, Qiso_2022))
4260     charset_list = Viso_2022_charset_list;
4261
       /* -1 marks a register for which no charset has been found yet.  */
4262   for (reg = 0; reg < 4; reg++)
4263     r[reg] = -1;
4264
4265   while (charbuf < charbuf_end && found < 4)
4266     {
4267       int id;
4268
4269       c = *charbuf++;
4270       if (c == '\n')
4271         break;
4272       charset = char_charset (c, charset_list, NULL);
4273       id = CHARSET_ID (charset);
4274       reg = CODING_ISO_REQUEST (coding, id);
           /* Only the first charset requesting a given register is
              recorded.  */
4275       if (reg >= 0 && r[reg] < 0)
4276         {
4277           found++;
4278           r[reg] = id;
4279         }
4280     }
4281
4282   if (found)
4283     {
4284       for (reg = 0; reg < 4; reg++)
4285         if (r[reg] >= 0
4286             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4287           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4288     }
4289
4290   return dst - orig;
4291 }
4292
4293 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the CODING->charbuf_used elements of CODING->charbuf as
        ISO2022, writing from CODING->destination + CODING->produced.
        A negative element C starts an annotation that occupies -C
        elements in all; a charset annotation selects a preferred charset
        for the characters that follow.  On exit the beginning-of-line
        designation state is saved with CODING_ISO_BOL, the result is
        recorded as CODING_RESULT_SUCCESS, and CODING->produced_char and
        CODING->produced are updated.  The return value is always 0.  */
4294
4295 static bool
4296 encode_coding_iso_2022 (struct coding_system *coding)
4297 {
4298   bool multibytep = coding->dst_multibyte;
4299   int *charbuf = coding->charbuf;
4300   int *charbuf_end = charbuf + coding->charbuf_used;
4301   unsigned char *dst = coding->destination + coding->produced;
4302   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each step; presumably the
          number of free bytes that must be available at DST -- TODO
          confirm against the macro definition.  */
4303   int safe_room = 16;
4304   bool bol_designation
4305     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4306        && CODING_ISO_BOL (coding));
4307   ptrdiff_t produced_chars = 0;
4308   Lisp_Object attrs, eol_type, charset_list;
4309   bool ascii_compatible;
4310   int c;
       /* Charset ID taken from the latest charset annotation, or -1.  */
4311   int preferred_charset_id = -1;
4312
4313   CODING_GET_INFO (coding, attrs, charset_list);
4314   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector EOL type is treated the same as Qunix here.  */
4315   if (VECTORP (eol_type))
4316     eol_type = Qunix;
4317
4318   setup_iso_safe_charsets (attrs);
4319   /* Charset list may have been changed.  */
4320   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4321   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4322
       /* ASCII may be emitted as is only when the coding system is
          marked ASCII compatible and uses neither designation nor
          locking shift.  */
4323   ascii_compatible
4324     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4325        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4326                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4327
4328   while (charbuf < charbuf_end)
4329     {
4330       ASSURE_DESTINATION (safe_room);
4331
4332       if (bol_designation)
4333         {
4334           /* We have to produce designation sequences if any now.  */
               /* The sequences go to a local buffer first because
                  encode_designation_at_bol must not be given a pointer
                  into buffer text that char_charset may relocate.
                  NOTE(review): ENCODE_DESIGNATION can emit up to seven
                  bytes per register when a revision prefix is produced,
                  so four designations could exceed 16 bytes -- confirm
                  that this cannot happen in practice.  */
4335           unsigned char desig_buf[16];
4336           int nbytes;
4337           ptrdiff_t offset;
4338
4339           charset_map_loaded = 0;
4340           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4341                                               desig_buf);
               /* If a charset map was loaded meanwhile, the destination
                  may have moved; shift DST and DST_END by the reported
                  offset.  */
4342           if (charset_map_loaded
4343               && (offset = coding_change_destination (coding)))
4344             {
4345               dst += offset;
4346               dst_end += offset;
4347             }
4348           memcpy (dst, desig_buf, nbytes);
4349           dst += nbytes;
4350           /* We are sure that designation sequences are all ASCII bytes.  */
4351           produced_chars += nbytes;
4352           bol_designation = 0;
4353           ASSURE_DESTINATION (safe_room);
4354         }
4355
4356       c = *charbuf++;
4357
4358       if (c < 0)
4359         {
4360           /* Handle an annotation.  */
               /* CHARBUF now points at the element after C, which holds
                  the annotation type.  */
4361           switch (*charbuf)
4362             {
4363             case CODING_ANNOTATE_COMPOSITION_MASK:
4364               /* Not yet implemented.  */
4365               break;
4366             case CODING_ANNOTATE_CHARSET_MASK:
4367               preferred_charset_id = charbuf[2];
                   /* Ignore a preferred charset that is not in the
                      charset list of this coding system.  */
4368               if (preferred_charset_id >= 0
4369                   && NILP (Fmemq (make_number (preferred_charset_id),
4370                                   charset_list)))
4371                 preferred_charset_id = -1;
4372               break;
4373             default:
4374               emacs_abort ();
4375             }
               /* Skip the rest of the annotation; together with C itself
                  it occupies -C elements.  */
4376           charbuf += -c - 1;
4377           continue;
4378         }
4379
4380       /* Now encode the character C.  */
4381       if (c < 0x20 || c == 0x7F)
4382         {
               /* Control character.  A newline, or a carriage return
                  when the EOL type is Qmac, ends the line.  */
4383           if (c == '\n'
4384               || (c == '\r' && EQ (eol_type, Qmac)))
4385             {
4386               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4387                 ENCODE_RESET_PLANE_AND_REGISTER ();
4388               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4389                 {
4390                   int i;
4391
                       /* Only the recorded designations are reset here;
                          no bytes are produced by this loop.  */
4392                   for (i = 0; i < 4; i++)
4393                     CODING_ISO_DESIGNATION (coding, i)
4394                       = CODING_ISO_INITIAL (coding, i);
4395                 }
4396               bol_designation = ((CODING_ISO_FLAGS (coding)
4397                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4398                                  != 0);
4399             }
4400           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4401             ENCODE_RESET_PLANE_AND_REGISTER ();
4402           EMIT_ONE_ASCII_BYTE (c);
4403         }
4404       else if (ASCII_CHAR_P (c))
4405         {
4406           if (ascii_compatible)
4407             EMIT_ONE_ASCII_BYTE (c);
4408           else
4409             {
4410               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4411               ENCODE_ISO_CHARACTER (charset, c);
4412             }
4413         }
4414       else if (CHAR_BYTE8_P (c))
4415         {
               /* A raw 8-bit byte is converted back with CHAR_TO_BYTE8
                  and emitted without any designation.  */
4416           c = CHAR_TO_BYTE8 (c);
4417           EMIT_ONE_BYTE (c);
4418         }
4419       else
4420         {
4421           struct charset *charset;
4422
               /* Try the preferred charset first, if any, and fall back
                  to a search of the charset list.  */
4423           if (preferred_charset_id >= 0)
4424             {
4425               bool result;
4426
4427               charset = CHARSET_FROM_ID (preferred_charset_id);
4428               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4429               if (! result)
4430                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4431                                      NULL, charset);
4432             }
4433           else
4434             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4435                                  NULL, charset);
               /* No charset was found for C: substitute another
                  character.  */
4436           if (!charset)
4437             {
4438               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4439                 {
4440                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4441                   charset = CHARSET_FROM_ID (charset_ascii);
4442                 }
4443               else
4444                 {
4445                   c = coding->default_char;
4446                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4447                                        charset_list, NULL, charset);
4448                 }
4449             }
4450           ENCODE_ISO_CHARACTER (charset, c);
4451         }
4452     }
4453
       /* At the end of the last block, return to the initial state if
          the coding system resets at end of line.  */
4454   if (coding->mode & CODING_MODE_LAST_BLOCK
4455       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4456     {
4457       ASSURE_DESTINATION (safe_room);
4458       ENCODE_RESET_PLANE_AND_REGISTER ();
4459     }
4460   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4461   CODING_ISO_BOL (coding) = bol_designation;
4462   coding->produced_char += produced_chars;
4463   coding->produced = dst - coding->destination;
4464   return 0;
4465 }
4466
4467 \f
4468 /*** 8,9. SJIS and BIG5 handlers ***/
4469
4470 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4471    quite widely.  So, for the moment, Emacs supports them in the bare
4472    C code.  But, in the future, they may be supported only by CCL.  */
4473
4474 /* SJIS is a coding system encoding three character sets: ASCII, right
4475    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4476    as is.  A character of charset katakana-jisx0201 is encoded by
4477    "position-code + 0x80".  A character of charset japanese-jisx0208
4478    is encoded in two bytes, but its two position-codes are divided and
4479    shifted so that they fit in the ranges below.
4480
4481    --- CODE RANGE of SJIS ---
4482    (character set)      (range)
4483    ASCII                0x00 .. 0x7F
4484    KATAKANA-JISX0201    0xA0 .. 0xDF
4485    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4486             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4487    -------------------------------
4488
4489 */
4490
4491 /* BIG5 is a coding system encoding two character sets: ASCII and
4492    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4493    character set and is encoded in two bytes.
4494
4495    --- CODE RANGE of BIG5 ---
4496    (character set)      (range)
4497    ASCII                0x00 .. 0x7F
4498    Big5 (1st byte)      0xA1 .. 0xFE
4499         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4500    --------------------------
4501
4502   */
4503
4504 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4505    Return true if a text is encoded in SJIS.

        The scan starts CODING->head_ascii bytes into the source.  Values
        below 0x80 are skipped.  A byte in 0x81..0x9F, or in 0xE0 up to
        0xEF (0xFC when the charset list of the coding system has more
        than three elements), must be followed by a byte in 0x40..0xFC
        other than 0x7F.  A byte in 0xA0..0xDF stands alone.  Anything
        else makes the function mark CATEGORY_MASK_SJIS as rejected and
        return false.

        When the source is exhausted, the category is rejected if bytes
        were consumed after the start of the current code and this is the
        last block; otherwise CATEGORY_MASK_SJIS is added to
        DETECT_INFO->found if a non-ASCII code was seen, and true is
        returned.  */
4506
4507 static bool
4508 detect_coding_sjis (struct coding_system *coding,
4509                     struct coding_detection_info *detect_info)
4510 {
4511   const unsigned char *src = coding->source, *src_base;
4512   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* The next two locals are not referenced directly below;
          presumably ONE_MORE_BYTE uses them -- TODO confirm.  */
4513   bool multibytep = coding->src_multibyte;
4514   ptrdiff_t consumed_chars = 0;
4515   int found = 0;
4516   int c;
4517   Lisp_Object attrs, charset_list;
4518   int max_first_byte_of_2_byte_code;
4519
4520   CODING_GET_INFO (coding, attrs, charset_list);
4521   max_first_byte_of_2_byte_code
4522     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4523
4524   detect_info->checked |= CATEGORY_MASK_SJIS;
4525   /* A coding system of this category is always ASCII compatible.  */
4526   src += coding->head_ascii;
4527
4528   while (1)
4529     {
           /* Remember where the current code starts.  The label
              no_more_source below is not the target of any visible goto;
              presumably ONE_MORE_BYTE jumps there when the source is
              exhausted -- TODO confirm.  */
4530       src_base = src;
4531       ONE_MORE_BYTE (c);
4532       if (c < 0x80)
4533         continue;
4534       if ((c >= 0x81 && c <= 0x9F)
4535           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4536         {
4537           ONE_MORE_BYTE (c);
               /* Invalid second byte: leave the loop and reject.  */
4538           if (c < 0x40 || c == 0x7F || c > 0xFC)
4539             break;
4540           found = CATEGORY_MASK_SJIS;
4541         }
4542       else if (c >= 0xA0 && c < 0xE0)
4543         found = CATEGORY_MASK_SJIS;
4544       else
4545         break;
4546     }
4547   detect_info->rejected |= CATEGORY_MASK_SJIS;
4548   return 0;
4549
4550  no_more_source:
4551   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4552     {
4553       detect_info->rejected |= CATEGORY_MASK_SJIS;
4554       return 0;
4555     }
4556   detect_info->found |= found;
4557   return 1;
4558 }
4559
4560 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4561    Return true if a text is encoded in BIG5.  */
4562
4563 static bool
4564 detect_coding_big5 (struct coding_system *coding,
4565                     struct coding_detection_info *detect_info)
4566 {
4567   const unsigned char *src = coding->source, *src_base;
4568   const unsigned char *src_end = coding->source + coding->src_bytes;
4569   bool multibytep = coding->src_multibyte;
4570   ptrdiff_t consumed_chars = 0;
4571   int found = 0;
4572   int c;
4573
4574   detect_info->checked |= CATEGORY_MASK_BIG5;
4575   /* A coding system of this category is always ASCII compatible.  */
4576   src += coding->head_ascii;
4577
4578   while (1)
4579     {
4580       src_base = src;
4581       ONE_MORE_BYTE (c);
4582       if (c < 0x80)
4583         continue;
4584       if (c >= 0xA1)
4585         {
4586           ONE_MORE_BYTE (c);
4587           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4588             return 0;
4589           found = CATEGORY_MASK_BIG5;
4590         }
4591       else
4592         break;
4593     }
4594   detect_info->rejected |= CATEGORY_MASK_BIG5;
4595   return 0;
4596
4597  no_more_source:
4598   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4599     {
4600       detect_info->rejected |= CATEGORY_MASK_BIG5;
4601       return 0;
4602     }
4603   detect_info->found |= found;
4604   return 1;
4605 }
4606
4607 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4608
4609 static void
4610 decode_coding_sjis (struct coding_system *coding)
4611 {
4612   const unsigned char *src = coding->source + coding->consumed;
4613   const unsigned char *src_end = coding->source + coding->src_bytes;
4614   const unsigned char *src_base;
4615   int *charbuf = coding->charbuf + coding->charbuf_used;
4616   /* We may produce one charset annotation in one loop and one more at
4617      the end.  */
4618   int *charbuf_end
4619     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4620   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4621   bool multibytep = coding->src_multibyte;
4622   struct charset *charset_roman, *charset_kanji, *charset_kana;
4623   struct charset *charset_kanji2;
4624   Lisp_Object attrs, charset_list, val;
4625   ptrdiff_t char_offset = coding->produced_char;
4626   ptrdiff_t last_offset = char_offset;
4627   int last_id = charset_ascii;
4628   bool eol_dos
4629     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4630   int byte_after_cr = -1;
4631
4632   CODING_GET_INFO (coding, attrs, charset_list);
4633
4634   val = charset_list;
4635   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4636   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4637   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4638   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4639
4640   while (1)
4641     {
4642       int c, c1;
4643       struct charset *charset;
4644
4645       src_base = src;
4646       consumed_chars_base = consumed_chars;
4647
4648       if (charbuf >= charbuf_end)
4649         {
4650           if (byte_after_cr >= 0)
4651             src_base--;
4652           break;
4653         }
4654
4655       if (byte_after_cr >= 0)
4656         c = byte_after_cr, byte_after_cr = -1;
4657       else
4658         ONE_MORE_BYTE (c);
4659       if (c < 0)
4660         goto invalid_code;
4661       if (c < 0x80)
4662         {
4663           if (eol_dos && c == '\r')
4664             ONE_MORE_BYTE (byte_after_cr);
4665           charset = charset_roman;
4666         }
4667       else if (c == 0x80 || c == 0xA0)
4668         goto invalid_code;
4669       else if (c >= 0xA1 && c <= 0xDF)
4670         {
4671           /* SJIS -> JISX0201-Kana */
4672           c &= 0x7F;
4673           charset = charset_kana;
4674         }
4675       else if (c <= 0xEF)
4676         {
4677           /* SJIS -> JISX0208 */
4678           ONE_MORE_BYTE (c1);
4679           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4680             goto invalid_code;
4681           c = (c << 8) | c1;
4682           SJIS_TO_JIS (c);
4683           charset = charset_kanji;
4684         }
4685       else if (c <= 0xFC && charset_kanji2)
4686         {
4687           /* SJIS -> JISX0213-2 */
4688           ONE_MORE_BYTE (c1);
4689           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4690             goto invalid_code;
4691           c = (c << 8) | c1;
4692           SJIS_TO_JIS2 (c);
4693           charset = charset_kanji2;
4694         }
4695       else
4696         goto invalid_code;
4697       if (charset->id != charset_ascii
4698           && last_id != charset->id)
4699         {
4700           if (last_id != charset_ascii)
4701             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4702           last_id = charset->id;
4703           last_offset = char_offset;
4704         }
4705       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4706       *charbuf++ = c;
4707       char_offset++;
4708       continue;
4709
4710     invalid_code:
4711       src = src_base;
4712       consumed_chars = consumed_chars_base;
4713       ONE_MORE_BYTE (c);
4714       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4715       char_offset++;
4716       coding->errors++;
4717     }
4718
4719  no_more_source:
4720   if (last_id != charset_ascii)
4721     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4722   coding->consumed_char += consumed_chars_base;
4723   coding->consumed = src_base - coding->source;
4724   coding->charbuf_used = charbuf - coding->charbuf;
4725 }
4726
4727 static void
4728 decode_coding_big5 (struct coding_system *coding)
4729 {
4730   const unsigned char *src = coding->source + coding->consumed;
4731   const unsigned char *src_end = coding->source + coding->src_bytes;
4732   const unsigned char *src_base;
4733   int *charbuf = coding->charbuf + coding->charbuf_used;
4734   /* We may produce one charset annotation in one loop and one more at
4735      the end.  */
4736   int *charbuf_end
4737     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4738   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4739   bool multibytep = coding->src_multibyte;
4740   struct charset *charset_roman, *charset_big5;
4741   Lisp_Object attrs, charset_list, val;
4742   ptrdiff_t char_offset = coding->produced_char;
4743   ptrdiff_t last_offset = char_offset;
4744   int last_id = charset_ascii;
4745   bool eol_dos
4746     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4747   int byte_after_cr = -1;
4748
4749   CODING_GET_INFO (coding, attrs, charset_list);
4750   val = charset_list;
4751   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4752   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4753
4754   while (1)
4755     {
4756       int c, c1;
4757       struct charset *charset;
4758
4759       src_base = src;
4760       consumed_chars_base = consumed_chars;
4761
4762       if (charbuf >= charbuf_end)
4763         {
4764           if (byte_after_cr >= 0)
4765             src_base--;
4766           break;
4767         }
4768
4769       if (byte_after_cr >= 0)
4770         c = byte_after_cr, byte_after_cr = -1;
4771       else
4772         ONE_MORE_BYTE (c);
4773
4774       if (c < 0)
4775         goto invalid_code;
4776       if (c < 0x80)
4777         {
4778           if (eol_dos && c == '\r')
4779             ONE_MORE_BYTE (byte_after_cr);
4780           charset = charset_roman;
4781         }
4782       else
4783         {
4784           /* BIG5 -> Big5 */
4785           if (c < 0xA1 || c > 0xFE)
4786             goto invalid_code;
4787           ONE_MORE_BYTE (c1);
4788           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4789             goto invalid_code;
4790           c = c << 8 | c1;
4791           charset = charset_big5;
4792         }
4793       if (charset->id != charset_ascii
4794           && last_id != charset->id)
4795         {
4796           if (last_id != charset_ascii)
4797             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4798           last_id = charset->id;
4799           last_offset = char_offset;
4800         }
4801       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4802       *charbuf++ = c;
4803       char_offset++;
4804       continue;
4805
4806     invalid_code:
4807       src = src_base;
4808       consumed_chars = consumed_chars_base;
4809       ONE_MORE_BYTE (c);
4810       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4811       char_offset++;
4812       coding->errors++;
4813     }
4814
4815  no_more_source:
4816   if (last_id != charset_ascii)
4817     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4818   coding->consumed_char += consumed_chars_base;
4819   coding->consumed = src_base - coding->source;
4820   coding->charbuf_used = charbuf - coding->charbuf;
4821 }
4822
4823 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4824    This function can encode charsets `ascii', `katakana-jisx0201',
4825    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4826    are sure that all these charsets are registered as official charset
4827    (i.e. do not have extended leading-codes).  Characters of other
4828    charsets are produced without any encoding.  */
4829
4830 static bool
4831 encode_coding_sjis (struct coding_system *coding)
4832 {
4833   bool multibytep = coding->dst_multibyte;
4834   int *charbuf = coding->charbuf;
4835   int *charbuf_end = charbuf + coding->charbuf_used;
4836   unsigned char *dst = coding->destination + coding->produced;
4837   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4838   int safe_room = 4;
4839   ptrdiff_t produced_chars = 0;
4840   Lisp_Object attrs, charset_list, val;
4841   bool ascii_compatible;
4842   struct charset *charset_kanji, *charset_kana;
4843   struct charset *charset_kanji2;
4844   int c;
4845
4846   CODING_GET_INFO (coding, attrs, charset_list);
4847   val = XCDR (charset_list);
4848   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4849   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4850   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4851
4852   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4853
4854   while (charbuf < charbuf_end)
4855     {
4856       ASSURE_DESTINATION (safe_room);
4857       c = *charbuf++;
4858       /* Now encode the character C.  */
4859       if (ASCII_CHAR_P (c) && ascii_compatible)
4860         EMIT_ONE_ASCII_BYTE (c);
4861       else if (CHAR_BYTE8_P (c))
4862         {
4863           c = CHAR_TO_BYTE8 (c);
4864           EMIT_ONE_BYTE (c);
4865         }
4866       else
4867         {
4868           unsigned code;
4869           struct charset *charset;
4870           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4871                                &code, charset);
4872
4873           if (!charset)
4874             {
4875               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4876                 {
4877                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4878                   charset = CHARSET_FROM_ID (charset_ascii);
4879                 }
4880               else
4881                 {
4882                   c = coding->default_char;
4883                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4884                                        charset_list, &code, charset);
4885                 }
4886             }
4887           if (code == CHARSET_INVALID_CODE (charset))
4888             emacs_abort ();
4889           if (charset == charset_kanji)
4890             {
4891               int c1, c2;
4892               JIS_TO_SJIS (code);
4893               c1 = code >> 8, c2 = code & 0xFF;
4894               EMIT_TWO_BYTES (c1, c2);
4895             }
4896           else if (charset == charset_kana)
4897             EMIT_ONE_BYTE (code | 0x80);
4898           else if (charset_kanji2 && charset == charset_kanji2)
4899             {
4900               int c1, c2;
4901
4902               c1 = code >> 8;
4903               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4904                   || c1 == 0x28
4905                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4906                 {
4907                   JIS_TO_SJIS2 (code);
4908                   c1 = code >> 8, c2 = code & 0xFF;
4909                   EMIT_TWO_BYTES (c1, c2);
4910                 }
4911               else
4912                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4913             }
4914           else
4915             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4916         }
4917     }
4918   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4919   coding->produced_char += produced_chars;
4920   coding->produced = dst - coding->destination;
4921   return 0;
4922 }
4923
4924 static bool
4925 encode_coding_big5 (struct coding_system *coding)
4926 {
4927   bool multibytep = coding->dst_multibyte;
4928   int *charbuf = coding->charbuf;
4929   int *charbuf_end = charbuf + coding->charbuf_used;
4930   unsigned char *dst = coding->destination + coding->produced;
4931   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4932   int safe_room = 4;
4933   ptrdiff_t produced_chars = 0;
4934   Lisp_Object attrs, charset_list, val;
4935   bool ascii_compatible;
4936   struct charset *charset_big5;
4937   int c;
4938
4939   CODING_GET_INFO (coding, attrs, charset_list);
4940   val = XCDR (charset_list);
4941   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4942   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4943
4944   while (charbuf < charbuf_end)
4945     {
4946       ASSURE_DESTINATION (safe_room);
4947       c = *charbuf++;
4948       /* Now encode the character C.  */
4949       if (ASCII_CHAR_P (c) && ascii_compatible)
4950         EMIT_ONE_ASCII_BYTE (c);
4951       else if (CHAR_BYTE8_P (c))
4952         {
4953           c = CHAR_TO_BYTE8 (c);
4954           EMIT_ONE_BYTE (c);
4955         }
4956       else
4957         {
4958           unsigned code;
4959           struct charset *charset;
4960           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4961                                &code, charset);
4962
4963           if (! charset)
4964             {
4965               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4966                 {
4967                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4968                   charset = CHARSET_FROM_ID (charset_ascii);
4969                 }
4970               else
4971                 {
4972                   c = coding->default_char;
4973                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4974                                        charset_list, &code, charset);
4975                 }
4976             }
4977           if (code == CHARSET_INVALID_CODE (charset))
4978             emacs_abort ();
4979           if (charset == charset_big5)
4980             {
4981               int c1, c2;
4982
4983               c1 = code >> 8, c2 = code & 0xFF;
4984               EMIT_TWO_BYTES (c1, c2);
4985             }
4986           else
4987             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4988         }
4989     }
4990   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4991   coding->produced_char += produced_chars;
4992   coding->produced = dst - coding->destination;
4993   return 0;
4994 }
4995
4996 \f
4997 /*** 10. CCL handlers ***/
4998
4999 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5000    Return true if a text is encoded in a coding system of which
5001    encoder/decoder are written in CCL program.  */
5002
5003 static bool
5004 detect_coding_ccl (struct coding_system *coding,
5005                    struct coding_detection_info *detect_info)
5006 {
5007   const unsigned char *src = coding->source, *src_base;
5008   const unsigned char *src_end = coding->source + coding->src_bytes;
5009   bool multibytep = coding->src_multibyte;
5010   ptrdiff_t consumed_chars = 0;
5011   int found = 0;
5012   unsigned char *valids;
5013   ptrdiff_t head_ascii = coding->head_ascii;
5014   Lisp_Object attrs;
5015
5016   detect_info->checked |= CATEGORY_MASK_CCL;
5017
5018   coding = &coding_categories[coding_category_ccl];
5019   valids = CODING_CCL_VALIDS (coding);
5020   attrs = CODING_ID_ATTRS (coding->id);
5021   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5022     src += head_ascii;
5023
5024   while (1)
5025     {
5026       int c;
5027
5028       src_base = src;
5029       ONE_MORE_BYTE (c);
5030       if (c < 0 || ! valids[c])
5031         break;
5032       if ((valids[c] > 1))
5033         found = CATEGORY_MASK_CCL;
5034     }
5035   detect_info->rejected |= CATEGORY_MASK_CCL;
5036   return 0;
5037
5038  no_more_source:
5039   detect_info->found |= found;
5040   return 1;
5041 }
5042
5043 static void
5044 decode_coding_ccl (struct coding_system *coding)
5045 {
5046   const unsigned char *src = coding->source + coding->consumed;
5047   const unsigned char *src_end = coding->source + coding->src_bytes;
5048   int *charbuf = coding->charbuf + coding->charbuf_used;
5049   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5050   ptrdiff_t consumed_chars = 0;
5051   bool multibytep = coding->src_multibyte;
5052   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5053   int source_charbuf[1024];
5054   int source_byteidx[1025];
5055   Lisp_Object attrs, charset_list;
5056
5057   CODING_GET_INFO (coding, attrs, charset_list);
5058
5059   while (1)
5060     {
5061       const unsigned char *p = src;
5062       ptrdiff_t offset;
5063       int i = 0;
5064
5065       if (multibytep)
5066         {
5067           while (i < 1024 && p < src_end)
5068             {
5069               source_byteidx[i] = p - src;
5070               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5071             }
5072           source_byteidx[i] = p - src;
5073         }
5074       else
5075         while (i < 1024 && p < src_end)
5076           source_charbuf[i++] = *p++;
5077
5078       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5079         ccl->last_block = 1;
5080       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5081       charset_map_loaded = 0;
5082       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5083                   charset_list);
5084       if (charset_map_loaded
5085           && (offset = coding_change_source (coding)))
5086         {
5087           p += offset;
5088           src += offset;
5089           src_end += offset;
5090         }
5091       charbuf += ccl->produced;
5092       if (multibytep)
5093         src += source_byteidx[ccl->consumed];
5094       else
5095         src += ccl->consumed;
5096       consumed_chars += ccl->consumed;
5097       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5098         break;
5099     }
5100
5101   switch (ccl->status)
5102     {
5103     case CCL_STAT_SUSPEND_BY_SRC:
5104       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5105       break;
5106     case CCL_STAT_SUSPEND_BY_DST:
5107       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5108       break;
5109     case CCL_STAT_QUIT:
5110     case CCL_STAT_INVALID_CMD:
5111       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5112       break;
5113     default:
5114       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5115       break;
5116     }
5117   coding->consumed_char += consumed_chars;
5118   coding->consumed = src - coding->source;
5119   coding->charbuf_used = charbuf - coding->charbuf;
5120 }
5121
5122 static bool
5123 encode_coding_ccl (struct coding_system *coding)
5124 {
5125   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5126   bool multibytep = coding->dst_multibyte;
5127   int *charbuf = coding->charbuf;
5128   int *charbuf_end = charbuf + coding->charbuf_used;
5129   unsigned char *dst = coding->destination + coding->produced;
5130   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5131   int destination_charbuf[1024];
5132   ptrdiff_t produced_chars = 0;
5133   int i;
5134   Lisp_Object attrs, charset_list;
5135
5136   CODING_GET_INFO (coding, attrs, charset_list);
5137   if (coding->consumed_char == coding->src_chars
5138       && coding->mode & CODING_MODE_LAST_BLOCK)
5139     ccl->last_block = 1;
5140
5141   do
5142     {
5143       ptrdiff_t offset;
5144
5145       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5146       charset_map_loaded = 0;
5147       ccl_driver (ccl, charbuf, destination_charbuf,
5148                   charbuf_end - charbuf, 1024, charset_list);
5149       if (charset_map_loaded
5150           && (offset = coding_change_destination (coding)))
5151         dst += offset;
5152       if (multibytep)
5153         {
5154           ASSURE_DESTINATION (ccl->produced * 2);
5155           for (i = 0; i < ccl->produced; i++)
5156             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5157         }
5158       else
5159         {
5160           ASSURE_DESTINATION (ccl->produced);
5161           for (i = 0; i < ccl->produced; i++)
5162             *dst++ = destination_charbuf[i] & 0xFF;
5163           produced_chars += ccl->produced;
5164         }
5165       charbuf += ccl->consumed;
5166       if (ccl->status == CCL_STAT_QUIT
5167           || ccl->status == CCL_STAT_INVALID_CMD)
5168         break;
5169     }
5170   while (charbuf < charbuf_end);
5171
5172   switch (ccl->status)
5173     {
5174     case CCL_STAT_SUSPEND_BY_SRC:
5175       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5176       break;
5177     case CCL_STAT_SUSPEND_BY_DST:
5178       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5179       break;
5180     case CCL_STAT_QUIT:
5181     case CCL_STAT_INVALID_CMD:
5182       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5183       break;
5184     default:
5185       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5186       break;
5187     }
5188
5189   coding->produced_char += produced_chars;
5190   coding->produced = dst - coding->destination;
5191   return 0;
5192 }
5193
5194 \f
5195 /*** 10, 11. no-conversion handlers ***/
5196
5197 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5198
5199 static void
5200 decode_coding_raw_text (struct coding_system *coding)
5201 {
5202   bool eol_dos
5203     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5204
5205   coding->chars_at_source = 1;
5206   coding->consumed_char = coding->src_chars;
5207   coding->consumed = coding->src_bytes;
5208   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5209     {
5210       coding->consumed_char--;
5211       coding->consumed--;
5212       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5213     }
5214   else
5215     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5216 }
5217
5218 static bool
5219 encode_coding_raw_text (struct coding_system *coding)
5220 {
5221   bool multibytep = coding->dst_multibyte;
5222   int *charbuf = coding->charbuf;
5223   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5224   unsigned char *dst = coding->destination + coding->produced;
5225   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5226   ptrdiff_t produced_chars = 0;
5227   int c;
5228
5229   if (multibytep)
5230     {
5231       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5232
5233       if (coding->src_multibyte)
5234         while (charbuf < charbuf_end)
5235           {
5236             ASSURE_DESTINATION (safe_room);
5237             c = *charbuf++;
5238             if (ASCII_CHAR_P (c))
5239               EMIT_ONE_ASCII_BYTE (c);
5240             else if (CHAR_BYTE8_P (c))
5241               {
5242                 c = CHAR_TO_BYTE8 (c);
5243                 EMIT_ONE_BYTE (c);
5244               }
5245             else
5246               {
5247                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5248
5249                 CHAR_STRING_ADVANCE (c, p1);
5250                 do
5251                   {
5252                     EMIT_ONE_BYTE (*p0);
5253                     p0++;
5254                   }
5255                 while (p0 < p1);
5256               }
5257           }
5258       else
5259         while (charbuf < charbuf_end)
5260           {
5261             ASSURE_DESTINATION (safe_room);
5262             c = *charbuf++;
5263             EMIT_ONE_BYTE (c);
5264           }
5265     }
5266   else
5267     {
5268       if (coding->src_multibyte)
5269         {
5270           int safe_room = MAX_MULTIBYTE_LENGTH;
5271
5272           while (charbuf < charbuf_end)
5273             {
5274               ASSURE_DESTINATION (safe_room);
5275               c = *charbuf++;
5276               if (ASCII_CHAR_P (c))
5277                 *dst++ = c;
5278               else if (CHAR_BYTE8_P (c))
5279                 *dst++ = CHAR_TO_BYTE8 (c);
5280               else
5281                 CHAR_STRING_ADVANCE (c, dst);
5282             }
5283         }
5284       else
5285         {
5286           ASSURE_DESTINATION (charbuf_end - charbuf);
5287           while (charbuf < charbuf_end && dst < dst_end)
5288             *dst++ = *charbuf++;
5289         }
5290       produced_chars = dst - (coding->destination + coding->produced);
5291     }
5292   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5293   coding->produced_char += produced_chars;
5294   coding->produced = dst - coding->destination;
5295   return 0;
5296 }
5297
5298 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5299    Return true if a text is encoded in a charset-based coding system.  */
5300
5301 static bool
5302 detect_coding_charset (struct coding_system *coding,
5303                        struct coding_detection_info *detect_info)
5304 {
5305   const unsigned char *src = coding->source, *src_base;
5306   const unsigned char *src_end = coding->source + coding->src_bytes;
5307   bool multibytep = coding->src_multibyte;
5308   ptrdiff_t consumed_chars = 0;
5309   Lisp_Object attrs, valids, name;
5310   int found = 0;
5311   ptrdiff_t head_ascii = coding->head_ascii;
5312   bool check_latin_extra = 0;
5313
5314   detect_info->checked |= CATEGORY_MASK_CHARSET;
5315
5316   coding = &coding_categories[coding_category_charset];
5317   attrs = CODING_ID_ATTRS (coding->id);
5318   valids = AREF (attrs, coding_attr_charset_valids);
5319   name = CODING_ID_NAME (coding->id);
5320   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5321                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5322       || strncmp (SSDATA (SYMBOL_NAME (name)),
5323                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5324     check_latin_extra = 1;
5325
5326   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5327     src += head_ascii;
5328
5329   while (1)
5330     {
5331       int c;
5332       Lisp_Object val;
5333       struct charset *charset;
5334       int dim, idx;
5335
5336       src_base = src;
5337       ONE_MORE_BYTE (c);
5338       if (c < 0)
5339         continue;
5340       val = AREF (valids, c);
5341       if (NILP (val))
5342         break;
5343       if (c >= 0x80)
5344         {
5345           if (c < 0xA0
5346               && check_latin_extra
5347               && (!VECTORP (Vlatin_extra_code_table)
5348                   || NILP (AREF (Vlatin_extra_code_table, c))))
5349             break;
5350           found = CATEGORY_MASK_CHARSET;
5351         }
5352       if (INTEGERP (val))
5353         {
5354           charset = CHARSET_FROM_ID (XFASTINT (val));
5355           dim = CHARSET_DIMENSION (charset);
5356           for (idx = 1; idx < dim; idx++)
5357             {
5358               if (src == src_end)
5359                 goto too_short;
5360               ONE_MORE_BYTE (c);
5361               if (c < charset->code_space[(dim - 1 - idx) * 4]
5362                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5363                 break;
5364             }
5365           if (idx < dim)
5366             break;
5367         }
5368       else
5369         {
5370           idx = 1;
5371           for (; CONSP (val); val = XCDR (val))
5372             {
5373               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5374               dim = CHARSET_DIMENSION (charset);
5375               while (idx < dim)
5376                 {
5377                   if (src == src_end)
5378                     goto too_short;
5379                   ONE_MORE_BYTE (c);
5380                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5381                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5382                     break;
5383                   idx++;
5384                 }
5385               if (idx == dim)
5386                 {
5387                   val = Qnil;
5388                   break;
5389                 }
5390             }
5391           if (CONSP (val))
5392             break;
5393         }
5394     }
5395  too_short:
5396   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5397   return 0;
5398
5399  no_more_source:
5400   detect_info->found |= found;
5401   return 1;
5402 }
5403
5404 static void
5405 decode_coding_charset (struct coding_system *coding)
5406 {
5407   const unsigned char *src = coding->source + coding->consumed;
5408   const unsigned char *src_end = coding->source + coding->src_bytes;
5409   const unsigned char *src_base;
5410   int *charbuf = coding->charbuf + coding->charbuf_used;
5411   /* We may produce one charset annotation in one loop and one more at
5412      the end.  */
5413   int *charbuf_end
5414     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5415   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5416   bool multibytep = coding->src_multibyte;
5417   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5418   Lisp_Object valids;
5419   ptrdiff_t char_offset = coding->produced_char;
5420   ptrdiff_t last_offset = char_offset;
5421   int last_id = charset_ascii;
5422   bool eol_dos
5423     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5424   int byte_after_cr = -1;
5425
5426   valids = AREF (attrs, coding_attr_charset_valids);
5427
5428   while (1)
5429     {
5430       int c;
5431       Lisp_Object val;
5432       struct charset *charset;
5433       int dim;
5434       int len = 1;
5435       unsigned code;
5436
5437       src_base = src;
5438       consumed_chars_base = consumed_chars;
5439
5440       if (charbuf >= charbuf_end)
5441         {
5442           if (byte_after_cr >= 0)
5443             src_base--;
5444           break;
5445         }
5446
5447       if (byte_after_cr >= 0)
5448         {
5449           c = byte_after_cr;
5450           byte_after_cr = -1;
5451         }
5452       else
5453         {
5454           ONE_MORE_BYTE (c);
5455           if (eol_dos && c == '\r')
5456             ONE_MORE_BYTE (byte_after_cr);
5457         }
5458       if (c < 0)
5459         goto invalid_code;
5460       code = c;
5461
5462       val = AREF (valids, c);
5463       if (! INTEGERP (val) && ! CONSP (val))
5464         goto invalid_code;
5465       if (INTEGERP (val))
5466         {
5467           charset = CHARSET_FROM_ID (XFASTINT (val));
5468           dim = CHARSET_DIMENSION (charset);
5469           while (len < dim)
5470             {
5471               ONE_MORE_BYTE (c);
5472               code = (code << 8) | c;
5473               len++;
5474             }
5475           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5476                               charset, code, c);
5477         }
5478       else
5479         {
5480           /* VAL is a list of charset IDs.  It is assured that the
5481              list is sorted by charset dimensions (smaller one
5482              comes first).  */
5483           while (CONSP (val))
5484             {
5485               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5486               dim = CHARSET_DIMENSION (charset);
5487               while (len < dim)
5488                 {
5489                   ONE_MORE_BYTE (c);
5490                   code = (code << 8) | c;
5491                   len++;
5492                 }
5493               CODING_DECODE_CHAR (coding, src, src_base,
5494                                   src_end, charset, code, c);
5495               if (c >= 0)
5496                 break;
5497               val = XCDR (val);
5498             }
5499         }
5500       if (c < 0)
5501         goto invalid_code;
5502       if (charset->id != charset_ascii
5503           && last_id != charset->id)
5504         {
5505           if (last_id != charset_ascii)
5506             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5507           last_id = charset->id;
5508           last_offset = char_offset;
5509         }
5510
5511       *charbuf++ = c;
5512       char_offset++;
5513       continue;
5514
5515     invalid_code:
5516       src = src_base;
5517       consumed_chars = consumed_chars_base;
5518       ONE_MORE_BYTE (c);
5519       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5520       char_offset++;
5521       coding->errors++;
5522     }
5523
5524  no_more_source:
5525   if (last_id != charset_ascii)
5526     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5527   coding->consumed_char += consumed_chars_base;
5528   coding->consumed = src_base - coding->source;
5529   coding->charbuf_used = charbuf - coding->charbuf;
5530 }
5531
5532 static bool
5533 encode_coding_charset (struct coding_system *coding)
5534 {
5535   bool multibytep = coding->dst_multibyte;
5536   int *charbuf = coding->charbuf;
5537   int *charbuf_end = charbuf + coding->charbuf_used;
5538   unsigned char *dst = coding->destination + coding->produced;
5539   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5540   int safe_room = MAX_MULTIBYTE_LENGTH;
5541   ptrdiff_t produced_chars = 0;
5542   Lisp_Object attrs, charset_list;
5543   bool ascii_compatible;
5544   int c;
5545
5546   CODING_GET_INFO (coding, attrs, charset_list);
5547   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5548
5549   while (charbuf < charbuf_end)
5550     {
5551       struct charset *charset;
5552       unsigned code;
5553
5554       ASSURE_DESTINATION (safe_room);
5555       c = *charbuf++;
5556       if (ascii_compatible && ASCII_CHAR_P (c))
5557         EMIT_ONE_ASCII_BYTE (c);
5558       else if (CHAR_BYTE8_P (c))
5559         {
5560           c = CHAR_TO_BYTE8 (c);
5561           EMIT_ONE_BYTE (c);
5562         }
5563       else
5564         {
5565           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5566                                &code, charset);
5567
5568           if (charset)
5569             {
5570               if (CHARSET_DIMENSION (charset) == 1)
5571                 EMIT_ONE_BYTE (code);
5572               else if (CHARSET_DIMENSION (charset) == 2)
5573                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5574               else if (CHARSET_DIMENSION (charset) == 3)
5575                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5576               else
5577                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5578                                  (code >> 8) & 0xFF, code & 0xFF);
5579             }
5580           else
5581             {
5582               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5583                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5584               else
5585                 c = coding->default_char;
5586               EMIT_ONE_BYTE (c);
5587             }
5588         }
5589     }
5590
5591   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5592   coding->produced_char += produced_chars;
5593   coding->produced = dst - coding->destination;
5594   return 0;
5595 }
5596
5597 \f
5598 /*** 10. C library functions ***/
5599
5600 /* Setup coding context CODING from information about CODING_SYSTEM.
5601    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5602    CODING_SYSTEM is invalid, signal an error.  */
5603
5604 void
5605 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5606 {
5607   Lisp_Object attrs;
5608   Lisp_Object eol_type;
5609   Lisp_Object coding_type;
5610   Lisp_Object val;
5611
5612   if (NILP (coding_system))
5613     coding_system = Qundecided;
5614
5615   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5616
5617   attrs = CODING_ID_ATTRS (coding->id);
5618   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5619
5620   coding->mode = 0;
5621   coding->head_ascii = -1;
5622   if (VECTORP (eol_type))
5623     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5624                             | CODING_REQUIRE_DETECTION_MASK);
5625   else if (! EQ (eol_type, Qunix))
5626     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5627                             | CODING_REQUIRE_ENCODING_MASK);
5628   else
5629     coding->common_flags = 0;
5630   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5631     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5632   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5633     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5634   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5635     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5636
5637   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5638   coding->max_charset_id = SCHARS (val) - 1;
5639   coding->safe_charsets = SDATA (val);
5640   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5641   coding->carryover_bytes = 0;
5642
5643   coding_type = CODING_ATTR_TYPE (attrs);
5644   if (EQ (coding_type, Qundecided))
5645     {
5646       coding->detector = NULL;
5647       coding->decoder = decode_coding_raw_text;
5648       coding->encoder = encode_coding_raw_text;
5649       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5650     }
5651   else if (EQ (coding_type, Qiso_2022))
5652     {
5653       int i;
5654       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5655
5656       /* Invoke graphic register 0 to plane 0.  */
5657       CODING_ISO_INVOCATION (coding, 0) = 0;
5658       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5659       CODING_ISO_INVOCATION (coding, 1)
5660         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5661       /* Setup the initial status of designation.  */
5662       for (i = 0; i < 4; i++)
5663         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5664       /* Not single shifting initially.  */
5665       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5666       /* Beginning of buffer should also be regarded as bol. */
5667       CODING_ISO_BOL (coding) = 1;
5668       coding->detector = detect_coding_iso_2022;
5669       coding->decoder = decode_coding_iso_2022;
5670       coding->encoder = encode_coding_iso_2022;
5671       if (flags & CODING_ISO_FLAG_SAFE)
5672         coding->mode |= CODING_MODE_SAFE_ENCODING;
5673       coding->common_flags
5674         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5675             | CODING_REQUIRE_FLUSHING_MASK);
5676       if (flags & CODING_ISO_FLAG_COMPOSITION)
5677         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5678       if (flags & CODING_ISO_FLAG_DESIGNATION)
5679         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5680       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5681         {
5682           setup_iso_safe_charsets (attrs);
5683           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5684           coding->max_charset_id = SCHARS (val) - 1;
5685           coding->safe_charsets = SDATA (val);
5686         }
5687       CODING_ISO_FLAGS (coding) = flags;
5688       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5689       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5690       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5691       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5692     }
5693   else if (EQ (coding_type, Qcharset))
5694     {
5695       coding->detector = detect_coding_charset;
5696       coding->decoder = decode_coding_charset;
5697       coding->encoder = encode_coding_charset;
5698       coding->common_flags
5699         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5700     }
5701   else if (EQ (coding_type, Qutf_8))
5702     {
5703       val = AREF (attrs, coding_attr_utf_bom);
5704       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5705                                    : EQ (val, Qt) ? utf_with_bom
5706                                    : utf_without_bom);
5707       coding->detector = detect_coding_utf_8;
5708       coding->decoder = decode_coding_utf_8;
5709       coding->encoder = encode_coding_utf_8;
5710       coding->common_flags
5711         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5712       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5713         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5714     }
5715   else if (EQ (coding_type, Qutf_16))
5716     {
5717       val = AREF (attrs, coding_attr_utf_bom);
5718       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5719                                     : EQ (val, Qt) ? utf_with_bom
5720                                     : utf_without_bom);
5721       val = AREF (attrs, coding_attr_utf_16_endian);
5722       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5723                                        : utf_16_little_endian);
5724       CODING_UTF_16_SURROGATE (coding) = 0;
5725       coding->detector = detect_coding_utf_16;
5726       coding->decoder = decode_coding_utf_16;
5727       coding->encoder = encode_coding_utf_16;
5728       coding->common_flags
5729         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5730       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5731         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5732     }
5733   else if (EQ (coding_type, Qccl))
5734     {
5735       coding->detector = detect_coding_ccl;
5736       coding->decoder = decode_coding_ccl;
5737       coding->encoder = encode_coding_ccl;
5738       coding->common_flags
5739         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5740             | CODING_REQUIRE_FLUSHING_MASK);
5741     }
5742   else if (EQ (coding_type, Qemacs_mule))
5743     {
5744       coding->detector = detect_coding_emacs_mule;
5745       coding->decoder = decode_coding_emacs_mule;
5746       coding->encoder = encode_coding_emacs_mule;
5747       coding->common_flags
5748         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5749       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5750           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5751         {
5752           Lisp_Object tail, safe_charsets;
5753           int max_charset_id = 0;
5754
5755           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5756                tail = XCDR (tail))
5757             if (max_charset_id < XFASTINT (XCAR (tail)))
5758               max_charset_id = XFASTINT (XCAR (tail));
5759           safe_charsets = make_uninit_string (max_charset_id + 1);
5760           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5761           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5762                tail = XCDR (tail))
5763             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5764           coding->max_charset_id = max_charset_id;
5765           coding->safe_charsets = SDATA (safe_charsets);
5766         }
5767       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5768       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5769     }
5770   else if (EQ (coding_type, Qshift_jis))
5771     {
5772       coding->detector = detect_coding_sjis;
5773       coding->decoder = decode_coding_sjis;
5774       coding->encoder = encode_coding_sjis;
5775       coding->common_flags
5776         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5777     }
5778   else if (EQ (coding_type, Qbig5))
5779     {
5780       coding->detector = detect_coding_big5;
5781       coding->decoder = decode_coding_big5;
5782       coding->encoder = encode_coding_big5;
5783       coding->common_flags
5784         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5785     }
5786   else                          /* EQ (coding_type, Qraw_text) */
5787     {
5788       coding->detector = NULL;
5789       coding->decoder = decode_coding_raw_text;
5790       coding->encoder = encode_coding_raw_text;
5791       if (! EQ (eol_type, Qunix))
5792         {
5793           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5794           if (! VECTORP (eol_type))
5795             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5796         }
5797
5798     }
5799
5800   return;
5801 }
5802
5803 /* Return a list of charsets supported by CODING.  */
5804
5805 Lisp_Object
5806 coding_charset_list (struct coding_system *coding)
5807 {
5808   Lisp_Object attrs, charset_list;
5809
5810   CODING_GET_INFO (coding, attrs, charset_list);
5811   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5812     {
5813       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5814
5815       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5816         charset_list = Viso_2022_charset_list;
5817     }
5818   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5819     {
5820       charset_list = Vemacs_mule_charset_list;
5821     }
5822   return charset_list;
5823 }
5824
5825
5826 /* Return a list of charsets supported by CODING-SYSTEM.  */
5827
5828 Lisp_Object
5829 coding_system_charset_list (Lisp_Object coding_system)
5830 {
5831   ptrdiff_t id;
5832   Lisp_Object attrs, charset_list;
5833
5834   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5835   attrs = CODING_ID_ATTRS (id);
5836
5837   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5838     {
5839       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5840
5841       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5842         charset_list = Viso_2022_charset_list;
5843       else
5844         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5845     }
5846   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5847     {
5848       charset_list = Vemacs_mule_charset_list;
5849     }
5850   else
5851     {
5852       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5853     }
5854   return charset_list;
5855 }
5856
5857
5858 /* Return raw-text or one of its subsidiaries that has the same
5859    eol_type as CODING-SYSTEM.  */
5860
5861 Lisp_Object
5862 raw_text_coding_system (Lisp_Object coding_system)
5863 {
5864   Lisp_Object spec, attrs;
5865   Lisp_Object eol_type, raw_text_eol_type;
5866
5867   if (NILP (coding_system))
5868     return Qraw_text;
5869   spec = CODING_SYSTEM_SPEC (coding_system);
5870   attrs = AREF (spec, 0);
5871
5872   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5873     return coding_system;
5874
5875   eol_type = AREF (spec, 2);
5876   if (VECTORP (eol_type))
5877     return Qraw_text;
5878   spec = CODING_SYSTEM_SPEC (Qraw_text);
5879   raw_text_eol_type = AREF (spec, 2);
5880   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5881           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5882           : AREF (raw_text_eol_type, 2));
5883 }
5884
5885
5886 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5887    the subsidiary that has the same eol-spec as PARENT (if it is not
5888    nil and specifies end-of-line format) or the system's setting
5889    (system_eol_type).  */
5890
5891 Lisp_Object
5892 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5893 {
5894   Lisp_Object spec, eol_type;
5895
5896   if (NILP (coding_system))
5897     coding_system = Qraw_text;
5898   spec = CODING_SYSTEM_SPEC (coding_system);
5899   eol_type = AREF (spec, 2);
5900   if (VECTORP (eol_type))
5901     {
5902       Lisp_Object parent_eol_type;
5903
5904       if (! NILP (parent))
5905         {
5906           Lisp_Object parent_spec;
5907
5908           parent_spec = CODING_SYSTEM_SPEC (parent);
5909           parent_eol_type = AREF (parent_spec, 2);
5910           if (VECTORP (parent_eol_type))
5911             parent_eol_type = system_eol_type;
5912         }
5913       else
5914         parent_eol_type = system_eol_type;
5915       if (EQ (parent_eol_type, Qunix))
5916         coding_system = AREF (eol_type, 0);
5917       else if (EQ (parent_eol_type, Qdos))
5918         coding_system = AREF (eol_type, 1);
5919       else if (EQ (parent_eol_type, Qmac))
5920         coding_system = AREF (eol_type, 2);
5921     }
5922   return coding_system;
5923 }
5924
5925
5926 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5927    decided for writing to a process.  If not, complement them, and
5928    return a new coding system.  */
5929
5930 Lisp_Object
5931 complement_process_encoding_system (Lisp_Object coding_system)
5932 {
5933   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5934   Lisp_Object spec, attrs;
5935   int i;
5936
5937   for (i = 0; i < 3; i++)
5938     {
5939       if (i == 1)
5940         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5941       else if (i == 2)
5942         coding_system = preferred_coding_system ();
5943       spec = CODING_SYSTEM_SPEC (coding_system);
5944       if (NILP (spec))
5945         continue;
5946       attrs = AREF (spec, 0);
5947       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5948         coding_base = CODING_ATTR_BASE_NAME (attrs);
5949       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5950         eol_base = coding_system;
5951       if (! NILP (coding_base) && ! NILP (eol_base))
5952         break;
5953     }
5954
5955   if (i > 0)
5956     /* The original CODING_SYSTEM didn't specify text-conversion or
5957        eol-conversion.  Be sure that we return a fully complemented
5958        coding system.  */
5959     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5960   return coding_system;
5961 }
5962
5963
5964 /* Emacs has a mechanism to automatically detect a coding system if it
5965    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5966    it's impossible to distinguish some coding systems accurately
5967    because they use the same range of codes.  So, at first, coding
5968    systems are categorized into the following categories:
5969
5970    o coding-category-emacs-mule
5971
5972         The category for a coding system which has the same code range
5973         as Emacs' internal format.  Assigned the coding-system (Lisp
5974         symbol) `emacs-mule' by default.
5975
5976    o coding-category-sjis
5977
5978         The category for a coding system which has the same code range
5979         as SJIS.  Assigned the coding-system (Lisp
5980         symbol) `japanese-shift-jis' by default.
5981
5982    o coding-category-iso-7
5983
5984         The category for a coding system which has the same code range
5985         as ISO2022 of 7-bit environment.  This doesn't use any locking
5986         shift and single shift functions.  This can encode/decode all
5987         charsets.  Assigned the coding-system (Lisp symbol)
5988         `iso-2022-7bit' by default.
5989
5990    o coding-category-iso-7-tight
5991
5992         Same as coding-category-iso-7 except that this can
5993         encode/decode only the specified charsets.
5994
5995    o coding-category-iso-8-1
5996
5997         The category for a coding system which has the same code range
5998         as ISO2022 of 8-bit environment and graphic plane 1 used only
5999         for DIMENSION1 charset.  This doesn't use any locking shift
6000         and single shift functions.  Assigned the coding-system (Lisp
6001         symbol) `iso-latin-1' by default.
6002
6003    o coding-category-iso-8-2
6004
6005         The category for a coding system which has the same code range
6006         as ISO2022 of 8-bit environment and graphic plane 1 used only
6007         for DIMENSION2 charset.  This doesn't use any locking shift
6008         and single shift functions.  Assigned the coding-system (Lisp
6009         symbol) `japanese-iso-8bit' by default.
6010
6011    o coding-category-iso-7-else
6012
6013         The category for a coding system which has the same code range
6014         as ISO2022 of 7-bit environment but uses locking shift or
6015         single shift functions.  Assigned the coding-system (Lisp
6016         symbol) `iso-2022-7bit-lock' by default.
6017
6018    o coding-category-iso-8-else
6019
6020         The category for a coding system which has the same code range
6021         as ISO2022 of 8-bit environment but uses locking shift or
6022         single shift functions.  Assigned the coding-system (Lisp
6023         symbol) `iso-2022-8bit-ss2' by default.
6024
6025    o coding-category-big5
6026
6027         The category for a coding system which has the same code range
6028         as BIG5.  Assigned the coding-system (Lisp symbol)
6029         `cn-big5' by default.
6030
6031    o coding-category-utf-8
6032
6033         The category for a coding system which has the same code range
6034         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6035         symbol) `utf-8' by default.
6036
6037    o coding-category-utf-16-be
6038
6039         The category for a coding system in which a text has an
6040         Unicode signature (cf. Unicode Standard) in the order of BIG
6041         endian at the head.  Assigned the coding-system (Lisp symbol)
6042         `utf-16-be' by default.
6043
6044    o coding-category-utf-16-le
6045
6046         The category for a coding system in which a text has an
6047         Unicode signature (cf. Unicode Standard) in the order of
6048         LITTLE endian at the head.  Assigned the coding-system (Lisp
6049         symbol) `utf-16-le' by default.
6050
6051    o coding-category-ccl
6052
6053         The category for a coding system of which encoder/decoder is
6054         written in CCL programs.  The default value is nil, i.e., no
6055         coding system is assigned.
6056
6057    o coding-category-binary
6058
6059         The category for a coding system not categorized in any of the
6060         above.  Assigned the coding-system (Lisp symbol)
6061         `no-conversion' by default.
6062
6063    Each of them is a Lisp symbol whose value is an actual
6064    `coding-system' (also a Lisp symbol) assigned by a user.
6065    What Emacs actually does is detect a category of coding system.
6066    Then, it uses the `coding-system' assigned to it.  If Emacs can't
6067    narrow the result down to a single category, it selects the category
6068    of the highest priority.  Priorities of categories are also specified
6069    by a user in a Lisp variable `coding-category-list'.
6070
6071 */
6072
6073 #define EOL_SEEN_NONE   0
6074 #define EOL_SEEN_LF     1
6075 #define EOL_SEEN_CR     2
6076 #define EOL_SEEN_CRLF   4
6077
6078 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6079    SOURCE is encoded.  If CATEGORY is one of
6080    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6081    two-byte, else they are encoded by one-byte.
6082
6083    Return one of EOL_SEEN_XXX.  */
6084
6085 #define MAX_EOL_CHECK_COUNT 3
6086
6087 static int
6088 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6089             enum coding_category category)
6090 {
6091   const unsigned char *src = source, *src_end = src + src_bytes;
6092   unsigned char c;
6093   int total  = 0;
6094   int eol_seen = EOL_SEEN_NONE;
6095
6096   if ((1 << category) & CATEGORY_MASK_UTF_16)
6097     {
6098       bool msb = category == (coding_category_utf_16_le
6099                               | coding_category_utf_16_le_nosig);
6100       bool lsb = !msb;
6101
6102       while (src + 1 < src_end)
6103         {
6104           c = src[lsb];
6105           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6106             {
6107               int this_eol;
6108
6109               if (c == '\n')
6110                 this_eol = EOL_SEEN_LF;
6111               else if (src + 3 >= src_end
6112                        || src[msb + 2] != 0
6113                        || src[lsb + 2] != '\n')
6114                 this_eol = EOL_SEEN_CR;
6115               else
6116                 {
6117                   this_eol = EOL_SEEN_CRLF;
6118                   src += 2;
6119                 }
6120
6121               if (eol_seen == EOL_SEEN_NONE)
6122                 /* This is the first end-of-line.  */
6123                 eol_seen = this_eol;
6124               else if (eol_seen != this_eol)
6125                 {
6126                   /* The found type is different from what found before.
6127                      Allow for stray ^M characters in DOS EOL files.  */
6128                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6129                       || (eol_seen == EOL_SEEN_CRLF
6130                           && this_eol == EOL_SEEN_CR))
6131                     eol_seen = EOL_SEEN_CRLF;
6132                   else
6133                     {
6134                       eol_seen = EOL_SEEN_LF;
6135                       break;
6136                     }
6137                 }
6138               if (++total == MAX_EOL_CHECK_COUNT)
6139                 break;
6140             }
6141           src += 2;
6142         }
6143     }
6144   else
6145     while (src < src_end)
6146       {
6147         c = *src++;
6148         if (c == '\n' || c == '\r')
6149           {
6150             int this_eol;
6151
6152             if (c == '\n')
6153               this_eol = EOL_SEEN_LF;
6154             else if (src >= src_end || *src != '\n')
6155               this_eol = EOL_SEEN_CR;
6156             else
6157               this_eol = EOL_SEEN_CRLF, src++;
6158
6159             if (eol_seen == EOL_SEEN_NONE)
6160               /* This is the first end-of-line.  */
6161               eol_seen = this_eol;
6162             else if (eol_seen != this_eol)
6163               {
6164                 /* The found type is different from what found before.
6165                    Allow for stray ^M characters in DOS EOL files.  */
6166                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6167                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6168                   eol_seen = EOL_SEEN_CRLF;
6169                 else
6170                   {
6171                     eol_seen = EOL_SEEN_LF;
6172                     break;
6173                   }
6174               }
6175             if (++total == MAX_EOL_CHECK_COUNT)
6176               break;
6177           }
6178       }
6179   return eol_seen;
6180 }
6181
6182
6183 static Lisp_Object
6184 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6185 {
6186   Lisp_Object eol_type;
6187
6188   eol_type = CODING_ID_EOL_TYPE (coding->id);
6189   if (eol_seen & EOL_SEEN_LF)
6190     {
6191       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6192       eol_type = Qunix;
6193     }
6194   else if (eol_seen & EOL_SEEN_CRLF)
6195     {
6196       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6197       eol_type = Qdos;
6198     }
6199   else if (eol_seen & EOL_SEEN_CR)
6200     {
6201       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6202       eol_type = Qmac;
6203     }
6204   return eol_type;
6205 }
6206
6207 /* Detect how a text specified in CODING is encoded.  If a coding
6208    system is detected, update fields of CODING by the detected coding
6209    system.  */
6210
6211 static void
6212 detect_coding (struct coding_system *coding)
6213 {
6214   const unsigned char *src, *src_end;
6215   unsigned int saved_mode = coding->mode;
6216
6217   coding->consumed = coding->consumed_char = 0;
6218   coding->produced = coding->produced_char = 0;
6219   coding_set_source (coding);
6220
6221   src_end = coding->source + coding->src_bytes;
6222   coding->head_ascii = 0;
6223
6224   /* If we have not yet decided the text encoding type, detect it
6225      now.  */
6226   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6227     {
6228       int c, i;
6229       struct coding_detection_info detect_info;
6230       bool null_byte_found = 0, eight_bit_found = 0;
6231
6232       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6233       for (src = coding->source; src < src_end; src++)
6234         {
6235           c = *src;
6236           if (c & 0x80)
6237             {
6238               eight_bit_found = 1;
6239               if (null_byte_found)
6240                 break;
6241             }
6242           else if (c < 0x20)
6243             {
6244               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6245                   && ! inhibit_iso_escape_detection
6246                   && ! detect_info.checked)
6247                 {
6248                   if (detect_coding_iso_2022 (coding, &detect_info))
6249                     {
6250                       /* We have scanned the whole data.  */
6251                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6252                         {
6253                           /* We didn't find an 8-bit code.  We may
6254                              have found a null-byte, but it's very
6255                              rare that a binary file conforms to
6256                              ISO-2022.  */
6257                           src = src_end;
6258                           coding->head_ascii = src - coding->source;
6259                         }
6260                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6261                       break;
6262                     }
6263                 }
6264               else if (! c && !inhibit_null_byte_detection)
6265                 {
6266                   null_byte_found = 1;
6267                   if (eight_bit_found)
6268                     break;
6269                 }
6270               if (! eight_bit_found)
6271                 coding->head_ascii++;
6272             }
6273           else if (! eight_bit_found)
6274             coding->head_ascii++;
6275         }
6276
6277       if (null_byte_found || eight_bit_found
6278           || coding->head_ascii < coding->src_bytes
6279           || detect_info.found)
6280         {
6281           enum coding_category category;
6282           struct coding_system *this;
6283
6284           if (coding->head_ascii == coding->src_bytes)
6285             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6286             for (i = 0; i < coding_category_raw_text; i++)
6287               {
6288                 category = coding_priorities[i];
6289                 this = coding_categories + category;
6290                 if (detect_info.found & (1 << category))
6291                   break;
6292               }
6293           else
6294             {
6295               if (null_byte_found)
6296                 {
6297                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6298                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6299                 }
6300               for (i = 0; i < coding_category_raw_text; i++)
6301                 {
6302                   category = coding_priorities[i];
6303                   this = coding_categories + category;
6304                   if (this->id < 0)
6305                     {
6306                       /* No coding system of this category is defined.  */
6307                       detect_info.rejected |= (1 << category);
6308                     }
6309                   else if (category >= coding_category_raw_text)
6310                     continue;
6311                   else if (detect_info.checked & (1 << category))
6312                     {
6313                       if (detect_info.found & (1 << category))
6314                         break;
6315                     }
6316                   else if ((*(this->detector)) (coding, &detect_info)
6317                            && detect_info.found & (1 << category))
6318                     {
6319                       if (category == coding_category_utf_16_auto)
6320                         {
6321                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6322                             category = coding_category_utf_16_le;
6323                           else
6324                             category = coding_category_utf_16_be;
6325                         }
6326                       break;
6327                     }
6328                 }
6329             }
6330
6331           if (i < coding_category_raw_text)
6332             setup_coding_system (CODING_ID_NAME (this->id), coding);
6333           else if (null_byte_found)
6334             setup_coding_system (Qno_conversion, coding);
6335           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6336                    == CATEGORY_MASK_ANY)
6337             setup_coding_system (Qraw_text, coding);
6338           else if (detect_info.rejected)
6339             for (i = 0; i < coding_category_raw_text; i++)
6340               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6341                 {
6342                   this = coding_categories + coding_priorities[i];
6343                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6344                   break;
6345                 }
6346         }
6347     }
6348   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6349            == coding_category_utf_8_auto)
6350     {
6351       Lisp_Object coding_systems;
6352       struct coding_detection_info detect_info;
6353
6354       coding_systems
6355         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6356       detect_info.found = detect_info.rejected = 0;
6357       coding->head_ascii = 0;
6358       if (CONSP (coding_systems)
6359           && detect_coding_utf_8 (coding, &detect_info))
6360         {
6361           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6362             setup_coding_system (XCAR (coding_systems), coding);
6363           else
6364             setup_coding_system (XCDR (coding_systems), coding);
6365         }
6366     }
6367   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6368            == coding_category_utf_16_auto)
6369     {
6370       Lisp_Object coding_systems;
6371       struct coding_detection_info detect_info;
6372
6373       coding_systems
6374         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6375       detect_info.found = detect_info.rejected = 0;
6376       coding->head_ascii = 0;
6377       if (CONSP (coding_systems)
6378           && detect_coding_utf_16 (coding, &detect_info))
6379         {
6380           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6381             setup_coding_system (XCAR (coding_systems), coding);
6382           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6383             setup_coding_system (XCDR (coding_systems), coding);
6384         }
6385     }
6386   coding->mode = saved_mode;
6387 }
6388
6389
6390 static void
6391 decode_eol (struct coding_system *coding)
6392 {
6393   Lisp_Object eol_type;
6394   unsigned char *p, *pbeg, *pend;
6395
6396   eol_type = CODING_ID_EOL_TYPE (coding->id);
6397   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6398     return;
6399
6400   if (NILP (coding->dst_object))
6401     pbeg = coding->destination;
6402   else
6403     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6404   pend = pbeg + coding->produced;
6405
6406   if (VECTORP (eol_type))
6407     {
6408       int eol_seen = EOL_SEEN_NONE;
6409
6410       for (p = pbeg; p < pend; p++)
6411         {
6412           if (*p == '\n')
6413             eol_seen |= EOL_SEEN_LF;
6414           else if (*p == '\r')
6415             {
6416               if (p + 1 < pend && *(p + 1) == '\n')
6417                 {
6418                   eol_seen |= EOL_SEEN_CRLF;
6419                   p++;
6420                 }
6421               else
6422                 eol_seen |= EOL_SEEN_CR;
6423             }
6424         }
6425       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6426       if ((eol_seen & EOL_SEEN_CRLF) != 0
6427           && (eol_seen & EOL_SEEN_CR) != 0
6428           && (eol_seen & EOL_SEEN_LF) == 0)
6429         eol_seen = EOL_SEEN_CRLF;
6430       else if (eol_seen != EOL_SEEN_NONE
6431           && eol_seen != EOL_SEEN_LF
6432           && eol_seen != EOL_SEEN_CRLF
6433           && eol_seen != EOL_SEEN_CR)
6434         eol_seen = EOL_SEEN_LF;
6435       if (eol_seen != EOL_SEEN_NONE)
6436         eol_type = adjust_coding_eol_type (coding, eol_seen);
6437     }
6438
6439   if (EQ (eol_type, Qmac))
6440     {
6441       for (p = pbeg; p < pend; p++)
6442         if (*p == '\r')
6443           *p = '\n';
6444     }
6445   else if (EQ (eol_type, Qdos))
6446     {
6447       ptrdiff_t n = 0;
6448
6449       if (NILP (coding->dst_object))
6450         {
6451           /* Start deleting '\r' from the tail to minimize the memory
6452              movement.  */
6453           for (p = pend - 2; p >= pbeg; p--)
6454             if (*p == '\r')
6455               {
6456                 memmove (p, p + 1, pend-- - p - 1);
6457                 n++;
6458               }
6459         }
6460       else
6461         {
6462           ptrdiff_t pos_byte = coding->dst_pos_byte;
6463           ptrdiff_t pos = coding->dst_pos;
6464           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6465
6466           while (pos < pos_end)
6467             {
6468               p = BYTE_POS_ADDR (pos_byte);
6469               if (*p == '\r' && p[1] == '\n')
6470                 {
6471                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6472                   n++;
6473                   pos_end--;
6474                 }
6475               pos++;
6476               if (coding->dst_multibyte)
6477                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6478               else
6479                 pos_byte++;
6480             }
6481         }
6482       coding->produced -= n;
6483       coding->produced_char -= n;
6484     }
6485 }
6486
6487
6488 /* Return a translation table (or list of them) from coding system
6489    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6490    not ENCODEP). */
6491
6492 static Lisp_Object
6493 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6494 {
6495   Lisp_Object standard, translation_table;
6496   Lisp_Object val;
6497
6498   if (NILP (Venable_character_translation))
6499     {
6500       if (max_lookup)
6501         *max_lookup = 0;
6502       return Qnil;
6503     }
6504   if (encodep)
6505     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6506       standard = Vstandard_translation_table_for_encode;
6507   else
6508     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6509       standard = Vstandard_translation_table_for_decode;
6510   if (NILP (translation_table))
6511     translation_table = standard;
6512   else
6513     {
6514       if (SYMBOLP (translation_table))
6515         translation_table = Fget (translation_table, Qtranslation_table);
6516       else if (CONSP (translation_table))
6517         {
6518           translation_table = Fcopy_sequence (translation_table);
6519           for (val = translation_table; CONSP (val); val = XCDR (val))
6520             if (SYMBOLP (XCAR (val)))
6521               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6522         }
6523       if (CHAR_TABLE_P (standard))
6524         {
6525           if (CONSP (translation_table))
6526             translation_table = nconc2 (translation_table,
6527                                         Fcons (standard, Qnil));
6528           else
6529             translation_table = Fcons (translation_table,
6530                                        Fcons (standard, Qnil));
6531         }
6532     }
6533
6534   if (max_lookup)
6535     {
6536       *max_lookup = 1;
6537       if (CHAR_TABLE_P (translation_table)
6538           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6539         {
6540           val = XCHAR_TABLE (translation_table)->extras[1];
6541           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6542             *max_lookup = XFASTINT (val);
6543         }
6544       else if (CONSP (translation_table))
6545         {
6546           Lisp_Object tail;
6547
6548           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6549             if (CHAR_TABLE_P (XCAR (tail))
6550                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6551               {
6552                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6553                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6554                   *max_lookup = XFASTINT (tailval);
6555               }
6556         }
6557     }
6558   return translation_table;
6559 }
6560
/* Look up character C in TABLE, which is a char-table or a list whose
   char-table elements are consulted in order (other elements are
   skipped).  When an entry is itself a character, C is replaced by
   it; in a list the lookup then continues with the new C in the next
   table.  TRANS is left Qnil in that case and when nothing is found.
   Any other non-nil entry is stored in TRANS, and in a list the scan
   stops there.

   C and TRANS must be lvalues.  All arguments may be evaluated more
   than once.  The expansion declares a local named `tail'.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          {                                                     \
            c = XFASTINT (trans);                               \
            trans = Qnil;                                       \
          }                                                     \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        tail = table;                                           \
        while (CONSP (tail))                                    \
          {                                                     \
            if (CHAR_TABLE_P (XCAR (tail)))                     \
              {                                                 \
                trans = CHAR_TABLE_REF (XCAR (tail), c);        \
                if (CHARACTERP (trans))                         \
                  {                                             \
                    c = XFASTINT (trans);                       \
                    trans = Qnil;                               \
                  }                                             \
                else if (! NILP (trans))                        \
                  break;                                        \
              }                                                 \
            tail = XCDR (tail);                                 \
          }                                                     \
      }                                                         \
  } while (0)
6585
6586
6587 /* Return a translation of character(s) at BUF according to TRANS.
6588    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6589    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6590    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6591    translation is found, and Qnil if not found..
6592    If BUF is too short to lookup characters in FROM, return Qt.  */
6593
6594 static Lisp_Object
6595 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6596 {
6597
6598   if (INTEGERP (trans))
6599     return trans;
6600   for (; CONSP (trans); trans = XCDR (trans))
6601     {
6602       Lisp_Object val = XCAR (trans);
6603       Lisp_Object from = XCAR (val);
6604       ptrdiff_t len = ASIZE (from);
6605       ptrdiff_t i;
6606
6607       for (i = 0; i < len; i++)
6608         {
6609           if (buf + i == buf_end)
6610             return Qt;
6611           if (XINT (AREF (from, i)) != buf[i])
6612             break;
6613         }
6614       if (i == len)
6615         return val;
6616     }
6617   return Qnil;
6618 }
6619
6620
6621 static int
6622 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6623                bool last_block)
6624 {
6625   unsigned char *dst = coding->destination + coding->produced;
6626   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6627   ptrdiff_t produced;
6628   ptrdiff_t produced_chars = 0;
6629   int carryover = 0;
6630
6631   if (! coding->chars_at_source)
6632     {
6633       /* Source characters are in coding->charbuf.  */
6634       int *buf = coding->charbuf;
6635       int *buf_end = buf + coding->charbuf_used;
6636
6637       if (EQ (coding->src_object, coding->dst_object))
6638         {
6639           coding_set_source (coding);
6640           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6641         }
6642
6643       while (buf < buf_end)
6644         {
6645           int c = *buf;
6646           ptrdiff_t i;
6647
6648           if (c >= 0)
6649             {
6650               ptrdiff_t from_nchars = 1, to_nchars = 1;
6651               Lisp_Object trans = Qnil;
6652
6653               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6654               if (! NILP (trans))
6655                 {
6656                   trans = get_translation (trans, buf, buf_end);
6657                   if (INTEGERP (trans))
6658                     c = XINT (trans);
6659                   else if (CONSP (trans))
6660                     {
6661                       from_nchars = ASIZE (XCAR (trans));
6662                       trans = XCDR (trans);
6663                       if (INTEGERP (trans))
6664                         c = XINT (trans);
6665                       else
6666                         {
6667                           to_nchars = ASIZE (trans);
6668                           c = XINT (AREF (trans, 0));
6669                         }
6670                     }
6671                   else if (EQ (trans, Qt) && ! last_block)
6672                     break;
6673                 }
6674
6675               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6676                 {
6677                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6678                        / MAX_MULTIBYTE_LENGTH)
6679                       < to_nchars)
6680                     memory_full (SIZE_MAX);
6681                   dst = alloc_destination (coding,
6682                                            buf_end - buf
6683                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6684                                            dst);
6685                   if (EQ (coding->src_object, coding->dst_object))
6686                     {
6687                       coding_set_source (coding);
6688                       dst_end = (((unsigned char *) coding->source)
6689                                  + coding->consumed);
6690                     }
6691                   else
6692                     dst_end = coding->destination + coding->dst_bytes;
6693                 }
6694
6695               for (i = 0; i < to_nchars; i++)
6696                 {
6697                   if (i > 0)
6698                     c = XINT (AREF (trans, i));
6699                   if (coding->dst_multibyte
6700                       || ! CHAR_BYTE8_P (c))
6701                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6702                   else
6703                     *dst++ = CHAR_TO_BYTE8 (c);
6704                 }
6705               produced_chars += to_nchars;
6706               buf += from_nchars;
6707             }
6708           else
6709             /* This is an annotation datum.  (-C) is the length.  */
6710             buf += -c;
6711         }
6712       carryover = buf_end - buf;
6713     }
6714   else
6715     {
6716       /* Source characters are at coding->source.  */
6717       const unsigned char *src = coding->source;
6718       const unsigned char *src_end = src + coding->consumed;
6719
6720       if (EQ (coding->dst_object, coding->src_object))
6721         dst_end = (unsigned char *) src;
6722       if (coding->src_multibyte != coding->dst_multibyte)
6723         {
6724           if (coding->src_multibyte)
6725             {
6726               bool multibytep = 1;
6727               ptrdiff_t consumed_chars = 0;
6728
6729               while (1)
6730                 {
6731                   const unsigned char *src_base = src;
6732                   int c;
6733
6734                   ONE_MORE_BYTE (c);
6735                   if (dst == dst_end)
6736                     {
6737                       if (EQ (coding->src_object, coding->dst_object))
6738                         dst_end = (unsigned char *) src;
6739                       if (dst == dst_end)
6740                         {
6741                           ptrdiff_t offset = src - coding->source;
6742
6743                           dst = alloc_destination (coding, src_end - src + 1,
6744                                                    dst);
6745                           dst_end = coding->destination + coding->dst_bytes;
6746                           coding_set_source (coding);
6747                           src = coding->source + offset;
6748                           src_end = coding->source + coding->consumed;
6749                           if (EQ (coding->src_object, coding->dst_object))
6750                             dst_end = (unsigned char *) src;
6751                         }
6752                     }
6753                   *dst++ = c;
6754                   produced_chars++;
6755                 }
6756             no_more_source:
6757               ;
6758             }
6759           else
6760             while (src < src_end)
6761               {
6762                 bool multibytep = 1;
6763                 int c = *src++;
6764
6765                 if (dst >= dst_end - 1)
6766                   {
6767                     if (EQ (coding->src_object, coding->dst_object))
6768                       dst_end = (unsigned char *) src;
6769                     if (dst >= dst_end - 1)
6770                       {
6771                         ptrdiff_t offset = src - coding->source;
6772                         ptrdiff_t more_bytes;
6773
6774                         if (EQ (coding->src_object, coding->dst_object))
6775                           more_bytes = ((src_end - src) / 2) + 2;
6776                         else
6777                           more_bytes = src_end - src + 2;
6778                         dst = alloc_destination (coding, more_bytes, dst);
6779                         dst_end = coding->destination + coding->dst_bytes;
6780                         coding_set_source (coding);
6781                         src = coding->source + offset;
6782                         src_end = coding->source + coding->consumed;
6783                         if (EQ (coding->src_object, coding->dst_object))
6784                           dst_end = (unsigned char *) src;
6785                       }
6786                   }
6787                 EMIT_ONE_BYTE (c);
6788               }
6789         }
6790       else
6791         {
6792           if (!EQ (coding->src_object, coding->dst_object))
6793             {
6794               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6795
6796               if (require > 0)
6797                 {
6798                   ptrdiff_t offset = src - coding->source;
6799
6800                   dst = alloc_destination (coding, require, dst);
6801                   coding_set_source (coding);
6802                   src = coding->source + offset;
6803                   src_end = coding->source + coding->consumed;
6804                 }
6805             }
6806           produced_chars = coding->consumed_char;
6807           while (src < src_end)
6808             *dst++ = *src++;
6809         }
6810     }
6811
6812   produced = dst - (coding->destination + coding->produced);
6813   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6814     insert_from_gap (produced_chars, produced);
6815   coding->produced += produced;
6816   coding->produced_char += produced_chars;
6817   return carryover;
6818 }
6819
6820 /* Compose text in CODING->object according to the annotation data at
6821    CHARBUF.  CHARBUF is an array:
6822      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6823  */
6824
6825 static inline void
6826 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6827 {
6828   int len;
6829   ptrdiff_t to;
6830   enum composition_method method;
6831   Lisp_Object components;
6832
6833   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6834   to = pos + charbuf[2];
6835   method = (enum composition_method) (charbuf[4]);
6836
6837   if (method == COMPOSITION_RELATIVE)
6838     components = Qnil;
6839   else
6840     {
6841       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6842       int i, j;
6843
6844       if (method == COMPOSITION_WITH_RULE)
6845         len = charbuf[2] * 3 - 2;
6846       charbuf += MAX_ANNOTATION_LENGTH;
6847       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6848       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6849         {
6850           if (charbuf[i] >= 0)
6851             args[j] = make_number (charbuf[i]);
6852           else
6853             {
6854               i++;
6855               args[j] = make_number (charbuf[i] % 0x100);
6856             }
6857         }
6858       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6859     }
6860   compose_text (pos, to, components, Qnil, coding->dst_object);
6861 }
6862
6863
6864 /* Put `charset' property on text in CODING->object according to
6865    the annotation data at CHARBUF.  CHARBUF is an array:
6866      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6867  */
6868
6869 static inline void
6870 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6871 {
6872   ptrdiff_t from = pos - charbuf[2];
6873   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6874
6875   Fput_text_property (make_number (from), make_number (pos),
6876                       Qcharset, CHARSET_NAME (charset),
6877                       coding->dst_object);
6878 }
6879
6880
/* Number of `int' elements first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and set CODING->charbuf_size
   to its length in elements.  CHARBUF_SIZE elements are tried first;
   each time alloca yields a null pointer the size is halved, as long
   as it stays above 1024.  If no allocation succeeded,
   CODING_RESULT_INSUFFICIENT_MEM is recorded and the macro executes
   `return;', so it can only be used in a function returning void.
   The memory comes from alloca in the calling function.

   NOTE(review): alloca normally has no way of reporting failure by
   returning NULL, so the retry and failure paths may be unreachable
   -- confirm for the supported platforms.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size;                                                           \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (size = CHARBUF_SIZE; size > 1024; size >>= 1)                  \
      {                                                                 \
        coding->charbuf = alloca (sizeof (int) * size);                 \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return;                                                         \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6902
6903
6904 static void
6905 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6906 {
6907   int *charbuf = coding->charbuf;
6908   int *charbuf_end = charbuf + coding->charbuf_used;
6909
6910   if (NILP (coding->dst_object))
6911     return;
6912
6913   while (charbuf < charbuf_end)
6914     {
6915       if (*charbuf >= 0)
6916         pos++, charbuf++;
6917       else
6918         {
6919           int len = -*charbuf;
6920
6921           if (len > 2)
6922             switch (charbuf[1])
6923               {
6924               case CODING_ANNOTATE_COMPOSITION_MASK:
6925                 produce_composition (coding, charbuf, pos);
6926                 break;
6927               case CODING_ANNOTATE_CHARSET_MASK:
6928                 produce_charset (coding, charbuf, pos);
6929                 break;
6930               }
6931           charbuf += len;
6932         }
6933     }
6934 }
6935
6936 /* Decode the data at CODING->src_object into CODING->dst_object.
6937    CODING->src_object is a buffer, a string, or nil.
6938    CODING->dst_object is a buffer.
6939
6940    If CODING->src_object is a buffer, it must be the current buffer.
6941    In this case, if CODING->src_pos is positive, it is a position of
6942    the source text in the buffer, otherwise, the source text is in the
6943    gap area of the buffer, and CODING->src_pos specifies the offset of
6944    the text from GPT (which must be the same as PT).  If this is the
6945    same buffer as CODING->dst_object, CODING->src_pos must be
6946    negative.
6947
6948    If CODING->src_object is a string, CODING->src_pos is an index to
6949    that string.
6950
6951    If CODING->src_object is nil, CODING->source must already point to
6952    the non-relocatable memory area.  In this case, CODING->src_pos is
6953    an offset from CODING->source.
6954
6955    The decoded data is inserted at the current point of the buffer
6956    CODING->dst_object.
6957 */
6958
6959 static void
6960 decode_coding (struct coding_system *coding)
6961 {
6962   Lisp_Object attrs;
6963   Lisp_Object undo_list;
6964   Lisp_Object translation_table;
6965   struct ccl_spec cclspec;
6966   int carryover;
6967   int i;
6968
6969   if (BUFFERP (coding->src_object)
6970       && coding->src_pos > 0
6971       && coding->src_pos < GPT
6972       && coding->src_pos + coding->src_chars > GPT)
6973     move_gap_both (coding->src_pos, coding->src_pos_byte);
6974
6975   undo_list = Qt;
6976   if (BUFFERP (coding->dst_object))
6977     {
6978       set_buffer_internal (XBUFFER (coding->dst_object));
6979       if (GPT != PT)
6980         move_gap_both (PT, PT_BYTE);
6981
6982       /* We must disable undo_list in order to record the whole insert
6983          transaction via record_insert at the end.  But doing so also
6984          disables the recording of the first change to the undo_list.
6985          Therefore we check for first change here and record it via
6986          record_first_change if needed.  */
6987       if (MODIFF <= SAVE_MODIFF)
6988         record_first_change ();
6989
6990       undo_list = BVAR (current_buffer, undo_list);
6991       bset_undo_list (current_buffer, Qt);
6992     }
6993
6994   coding->consumed = coding->consumed_char = 0;
6995   coding->produced = coding->produced_char = 0;
6996   coding->chars_at_source = 0;
6997   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6998   coding->errors = 0;
6999
7000   ALLOC_CONVERSION_WORK_AREA (coding);
7001
7002   attrs = CODING_ID_ATTRS (coding->id);
7003   translation_table = get_translation_table (attrs, 0, NULL);
7004
7005   carryover = 0;
7006   if (coding->decoder == decode_coding_ccl)
7007     {
7008       coding->spec.ccl = &cclspec;
7009       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7010     }
7011   do
7012     {
7013       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7014
7015       coding_set_source (coding);
7016       coding->annotated = 0;
7017       coding->charbuf_used = carryover;
7018       (*(coding->decoder)) (coding);
7019       coding_set_destination (coding);
7020       carryover = produce_chars (coding, translation_table, 0);
7021       if (coding->annotated)
7022         produce_annotation (coding, pos);
7023       for (i = 0; i < carryover; i++)
7024         coding->charbuf[i]
7025           = coding->charbuf[coding->charbuf_used - carryover + i];
7026     }
7027   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7028          || (coding->consumed < coding->src_bytes
7029              && (coding->result == CODING_RESULT_SUCCESS
7030                  || coding->result == CODING_RESULT_INVALID_SRC)));
7031
7032   if (carryover > 0)
7033     {
7034       coding_set_destination (coding);
7035       coding->charbuf_used = carryover;
7036       produce_chars (coding, translation_table, 1);
7037     }
7038
7039   coding->carryover_bytes = 0;
7040   if (coding->consumed < coding->src_bytes)
7041     {
7042       int nbytes = coding->src_bytes - coding->consumed;
7043       const unsigned char *src;
7044
7045       coding_set_source (coding);
7046       coding_set_destination (coding);
7047       src = coding->source + coding->consumed;
7048
7049       if (coding->mode & CODING_MODE_LAST_BLOCK)
7050         {
7051           /* Flush out unprocessed data as binary chars.  We are sure
7052              that the number of data is less than the size of
7053              coding->charbuf.  */
7054           coding->charbuf_used = 0;
7055           coding->chars_at_source = 0;
7056
7057           while (nbytes-- > 0)
7058             {
7059               int c = *src++;
7060
7061               if (c & 0x80)
7062                 c = BYTE8_TO_CHAR (c);
7063               coding->charbuf[coding->charbuf_used++] = c;
7064             }
7065           produce_chars (coding, Qnil, 1);
7066         }
7067       else
7068         {
7069           /* Record unprocessed bytes in coding->carryover.  We are
7070              sure that the number of data is less than the size of
7071              coding->carryover.  */
7072           unsigned char *p = coding->carryover;
7073
7074           if (nbytes > sizeof coding->carryover)
7075             nbytes = sizeof coding->carryover;
7076           coding->carryover_bytes = nbytes;
7077           while (nbytes-- > 0)
7078             *p++ = *src++;
7079         }
7080       coding->consumed = coding->src_bytes;
7081     }
7082
7083   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7084       && !inhibit_eol_conversion)
7085     decode_eol (coding);
7086   if (BUFFERP (coding->dst_object))
7087     {
7088       bset_undo_list (current_buffer, undo_list);
7089       record_insert (coding->dst_pos, coding->produced_char);
7090     }
7091 }
7092
7093
7094 /* Extract an annotation datum from a composition starting at POS and
7095    ending before LIMIT of CODING->src_object (buffer or string), store
7096    the data in BUF, set *STOP to a starting position of the next
7097    composition (if any) or to LIMIT, and return the address of the
7098    next element of BUF.
7099
7100    If such an annotation is not found, set *STOP to a starting
7101    position of a composition after POS (if any) or to LIMIT, and
7102    return BUF.  */
7103
7104 static inline int *
7105 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7106                                struct coding_system *coding, int *buf,
7107                                ptrdiff_t *stop)
7108 {
7109   ptrdiff_t start, end;
7110   Lisp_Object prop;
7111
7112   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7113       || end > limit)
7114     *stop = limit;
7115   else if (start > pos)
7116     *stop = start;
7117   else
7118     {
7119       if (start == pos)
7120         {
7121           /* We found a composition.  Store the corresponding
7122              annotation data in BUF.  */
7123           int *head = buf;
7124           enum composition_method method = COMPOSITION_METHOD (prop);
7125           int nchars = COMPOSITION_LENGTH (prop);
7126
7127           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7128           if (method != COMPOSITION_RELATIVE)
7129             {
7130               Lisp_Object components;
7131               ptrdiff_t i, len, i_byte;
7132
7133               components = COMPOSITION_COMPONENTS (prop);
7134               if (VECTORP (components))
7135                 {
7136                   len = ASIZE (components);
7137                   for (i = 0; i < len; i++)
7138                     *buf++ = XINT (AREF (components, i));
7139                 }
7140               else if (STRINGP (components))
7141                 {
7142                   len = SCHARS (components);
7143                   i = i_byte = 0;
7144                   while (i < len)
7145                     {
7146                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7147                       buf++;
7148                     }
7149                 }
7150               else if (INTEGERP (components))
7151                 {
7152                   len = 1;
7153                   *buf++ = XINT (components);
7154                 }
7155               else if (CONSP (components))
7156                 {
7157                   for (len = 0; CONSP (components);
7158                        len++, components = XCDR (components))
7159                     *buf++ = XINT (XCAR (components));
7160                 }
7161               else
7162                 emacs_abort ();
7163               *head -= len;
7164             }
7165         }
7166
7167       if (find_composition (end, limit, &start, &end, &prop,
7168                             coding->src_object)
7169           && end <= limit)
7170         *stop = start;
7171       else
7172         *stop = limit;
7173     }
7174   return buf;
7175 }
7176
7177
7178 /* Extract an annotation datum from a text property `charset' at POS of
7179    CODING->src_object (buffer of string), store the data in BUF, set
7180    *STOP to the position where the value of `charset' property changes
7181    (limiting by LIMIT), and return the address of the next element of
7182    BUF.
7183
7184    If the property value is nil, set *STOP to the position where the
7185    property value is non-nil (limiting by LIMIT), and return BUF.  */
7186
7187 static inline int *
7188 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7189                            struct coding_system *coding, int *buf,
7190                            ptrdiff_t *stop)
7191 {
7192   Lisp_Object val, next;
7193   int id;
7194
7195   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7196   if (! NILP (val) && CHARSETP (val))
7197     id = XINT (CHARSET_SYMBOL_ID (val));
7198   else
7199     id = -1;
7200   ADD_CHARSET_DATA (buf, 0, id);
7201   next = Fnext_single_property_change (make_number (pos), Qcharset,
7202                                        coding->src_object,
7203                                        make_number (limit));
7204   *stop = XINT (next);
7205   return buf;
7206 }
7207
7208
/* Fill CODING->charbuf with characters read from CODING->source.

   Reading resumes at byte offset CODING->consumed (character offset
   CODING->consumed_char) and goes on until the source range is
   exhausted or the work buffer is nearly full.  While reading:

   - charset annotations are inserted when CODING->common_flags asks
     for them and CODING->src_object is non-nil (composition
     annotation is switched off below);
   - '\r' becomes '\n' when CODING_MODE_SELECTIVE_DISPLAY is set;
   - '\n' is rewritten according to the coding system's eol type,
     unless inhibit_eol_conversion is set;
   - characters are mapped through TRANSLATION_TABLE when it is
     non-nil.  MAX_LOOKUP is the number of ints allocated for the
     lookahead buffer handed to get_translation.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect the progress made, and
   CODING->chars_at_source is cleared.  */

static void
consume_chars (struct coding_system *coding, Lisp_Object translation_table,
               int max_lookup)
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  ptrdiff_t pos = coding->src_pos + coding->consumed_char;
  ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
  bool multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  /* STOP is the nearest position at which annotations must be
     reconsidered; it is the smaller of the two per-kind stops.  */
  ptrdiff_t stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The lookahead buffer is only needed when translating.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* A vector eol type gets no conversion here (presumably it means
     the eol format is still undecided -- confirm).  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  /* Without a source object there are no text properties to inspect,
     so no annotation stop is needed before the end.  */
  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  Keeping this much slack
     lets one iteration store an extra '\r' and annotation data
     without checking for overflow again.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          /* Emit whichever annotations are due here and compute the
             next stop.  */
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      if (! multibytep)
        {
          int bytes;

          /* Unibyte source.  The raw-text and CCL encoders get each
             byte as is.  Otherwise a byte sequence that forms a
             multibyte character is read as that character (POS
             advances by its byte length), and any other byte is
             converted with BYTE8_TO_CHAR.  */
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* For the dos eol type a '\r' is stored in front of the
                 '\n'; for any other non-unix type the '\n' itself is
                 replaced by '\r'.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* FROM_NCHARS source characters are replaced by TO_NCHARS
             characters.  */
          ptrdiff_t from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters.
             P is a separate cursor, so SRC is not advanced by this
             lookahead.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              /* The car is a vector whose length is the number of
                 source characters covered; the cdr is either one
                 replacement character or a vector of them.  */
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* Stop filling when the replacement does not fit in
                     what is left of the work buffer.  */
                  if (buf_end - buf < to_nchars)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters that took part in
             the translation.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7343
7344
7345 /* Encode the text at CODING->src_object into CODING->dst_object.
7346    CODING->src_object is a buffer or a string.
7347    CODING->dst_object is a buffer or nil.
7348
7349    If CODING->src_object is a buffer, it must be the current buffer.
7350    In this case, if CODING->src_pos is positive, it is a position of
7351    the source text in the buffer, otherwise. the source text is in the
7352    gap area of the buffer, and coding->src_pos specifies the offset of
7353    the text from GPT (which must be the same as PT).  If this is the
7354    same buffer as CODING->dst_object, CODING->src_pos must be
7355    negative and CODING should not have `pre-write-conversion'.
7356
7357    If CODING->src_object is a string, CODING should not have
7358    `pre-write-conversion'.
7359
7360    If CODING->dst_object is a buffer, the encoded data is inserted at
7361    the current point of that buffer.
7362
7363    If CODING->dst_object is nil, the encoded data is placed at the
7364    memory area specified by CODING->destination.  */
7365
7366 static void
7367 encode_coding (struct coding_system *coding)
7368 {
7369   Lisp_Object attrs;
7370   Lisp_Object translation_table;
7371   int max_lookup;
7372   struct ccl_spec cclspec;
7373
7374   attrs = CODING_ID_ATTRS (coding->id);
7375   if (coding->encoder == encode_coding_raw_text)
7376     translation_table = Qnil, max_lookup = 0;
7377   else
7378     translation_table = get_translation_table (attrs, 1, &max_lookup);
7379
7380   if (BUFFERP (coding->dst_object))
7381     {
7382       set_buffer_internal (XBUFFER (coding->dst_object));
7383       coding->dst_multibyte
7384         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7385     }
7386
7387   coding->consumed = coding->consumed_char = 0;
7388   coding->produced = coding->produced_char = 0;
7389   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7390   coding->errors = 0;
7391
7392   ALLOC_CONVERSION_WORK_AREA (coding);
7393
7394   if (coding->encoder == encode_coding_ccl)
7395     {
7396       coding->spec.ccl = &cclspec;
7397       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7398     }
7399   do {
7400     coding_set_source (coding);
7401     consume_chars (coding, translation_table, max_lookup);
7402     coding_set_destination (coding);
7403     (*(coding->encoder)) (coding);
7404   } while (coding->consumed_char < coding->src_chars);
7405
7406   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7407     insert_from_gap (coding->produced_char, coding->produced);
7408 }
7409
7410
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  It has the
   name Vcode_conversion_workbuf_name and is kept around for reuse
   instead of being killed after each conversion;
   make_conversion_work_buffer creates it again if it is found not to
   be live.  The other working buffers are destroyed after the use is
   finished, and their names are modified versions of
   Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   make_conversion_work_buffer and cleared by
   code_conversion_restore.  */
static bool reused_workbuf_in_use;
7423
7424
7425 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7426    multibyteness of returning buffer.  */
7427
7428 static Lisp_Object
7429 make_conversion_work_buffer (bool multibyte)
7430 {
7431   Lisp_Object name, workbuf;
7432   struct buffer *current;
7433
7434   if (reused_workbuf_in_use)
7435     {
7436       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7437       workbuf = Fget_buffer_create (name);
7438     }
7439   else
7440     {
7441       reused_workbuf_in_use = 1;
7442       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7443         Vcode_conversion_reused_workbuf
7444           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7445       workbuf = Vcode_conversion_reused_workbuf;
7446     }
7447   current = current_buffer;
7448   set_buffer_internal (XBUFFER (workbuf));
7449   /* We can't allow modification hooks to run in the work buffer.  For
7450      instance, directory_files_internal assumes that file decoding
7451      doesn't compile new regexps.  */
7452   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7453   Ferase_buffer ();
7454   bset_undo_list (current_buffer, Qt);
7455   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7456   set_buffer_internal (current);
7457   return workbuf;
7458 }
7459
7460
7461 static Lisp_Object
7462 code_conversion_restore (Lisp_Object arg)
7463 {
7464   Lisp_Object current, workbuf;
7465   struct gcpro gcpro1;
7466
7467   GCPRO1 (arg);
7468   current = XCAR (arg);
7469   workbuf = XCDR (arg);
7470   if (! NILP (workbuf))
7471     {
7472       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7473         reused_workbuf_in_use = 0;
7474       else
7475         Fkill_buffer (workbuf);
7476     }
7477   set_buffer_internal (XBUFFER (current));
7478   UNGCPRO;
7479   return Qnil;
7480 }
7481
7482 Lisp_Object
7483 code_conversion_save (bool with_work_buf, bool multibyte)
7484 {
7485   Lisp_Object workbuf = Qnil;
7486
7487   if (with_work_buf)
7488     workbuf = make_conversion_work_buffer (multibyte);
7489   record_unwind_protect (code_conversion_restore,
7490                          Fcons (Fcurrent_buffer (), workbuf));
7491   return workbuf;
7492 }
7493
7494 void
7495 decode_coding_gap (struct coding_system *coding,
7496                    ptrdiff_t chars, ptrdiff_t bytes)
7497 {
7498   ptrdiff_t count = SPECPDL_INDEX ();
7499   Lisp_Object attrs;
7500
7501   code_conversion_save (0, 0);
7502
7503   coding->src_object = Fcurrent_buffer ();
7504   coding->src_chars = chars;
7505   coding->src_bytes = bytes;
7506   coding->src_pos = -chars;
7507   coding->src_pos_byte = -bytes;
7508   coding->src_multibyte = chars < bytes;
7509   coding->dst_object = coding->src_object;
7510   coding->dst_pos = PT;
7511   coding->dst_pos_byte = PT_BYTE;
7512   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7513
7514   if (CODING_REQUIRE_DETECTION (coding))
7515     detect_coding (coding);
7516
7517   coding->mode |= CODING_MODE_LAST_BLOCK;
7518   current_buffer->text->inhibit_shrinking = 1;
7519   decode_coding (coding);
7520   current_buffer->text->inhibit_shrinking = 0;
7521
7522   attrs = CODING_ID_ATTRS (coding->id);
7523   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7524     {
7525       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7526       Lisp_Object val;
7527
7528       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7529       val = call1 (CODING_ATTR_POST_READ (attrs),
7530                    make_number (coding->produced_char));
7531       CHECK_NATNUM (val);
7532       coding->produced_char += Z - prev_Z;
7533       coding->produced += Z_BYTE - prev_Z_BYTE;
7534     }
7535
7536   unbind_to (count, Qnil);
7537 }
7538
7539
/* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or Qnil.

   If it is a buffer, FROM and TO are positions in the buffer.  The
   gap is moved to FROM before decoding.

   If it is a string, FROM and TO are indices to the string.

   If it is nil, the text is at coding->source.  FROM and TO are
   indices to coding->source.

   DST_OBJECT is a buffer, Qt, or Qnil.

   If it is a buffer, the decoded text is inserted at point of the
   buffer.  If the buffer is the same as SRC_OBJECT, the source text
   is deleted, and point and the markers that were at the edges of
   the source range are repositioned afterwards.

   If it is Qt, a string is made from the decoded text, and
   set in CODING->dst_object.

   If it is Qnil, the decoded text is stored at CODING->destination.
   The caller must allocate CODING->dst_bytes bytes at
   CODING->destination by xmalloc.  If the decoded text is longer than
   CODING->dst_bytes, CODING->destination is relocated by xrealloc.

   If the coding system has a post-read function, it is called with
   the number of decoded characters, with point at the beginning of
   the decoded text.

   Vdeactivate_mark is preserved across the call.
 */

void
decode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  unsigned char *destination IF_LINT (= NULL);
  ptrdiff_t dst_bytes IF_LINT (= 0);
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless source and destination are the
     same buffer; it doubles as the flag for the fix-up code at the
     end.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
  bool need_marker_adjustment = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  /* Remember the caller-supplied memory area; it is only consulted
     again in the Qnil-destination case below.  */
  if (NILP (dst_object))
    {
      destination = coding->destination;
      dst_bytes = coding->dst_bytes;
    }

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  coding->src_multibyte = chars < bytes;

  if (STRINGP (src_object))
    {
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      set_buffer_internal (XBUFFER (src_object));
      if (from != GPT)
        move_gap_both (from, from_byte);
      if (EQ (src_object, dst_object))
        {
          struct Lisp_Marker *tail;

          /* Flag the markers sitting exactly at the edge of the range
             (FROM for markers with insertion_type set, TO for the
             others) so that they can be put back by hand once the
             replacement text is in place.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            {
              tail->need_adjustment
                = tail->charpos == (tail->insertion_type ? from : to);
              need_marker_adjustment |= tail->need_adjustment;
            }
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          TEMP_SET_PT_BOTH (from, from_byte);
          current_buffer->text->inhibit_shrinking = 1;
          del_range_both (from, from_byte, to, to_byte, 1);
          /* The source range has been deleted from the buffer; the
             source is now described by negative positions.  */
          coding->src_pos = -chars;
          coding->src_pos_byte = -bytes;
        }
      else
        {
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }

  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);

  /* Decode into a work buffer when a string is wanted, or when a
     post-read function has to run on the result of decoding to
     memory.  */
  if (EQ (dst_object, Qt)
      || (! NILP (CODING_ATTR_POST_READ (attrs))
          && NILP (dst_object)))
    {
      coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
      coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
      coding->dst_pos = BEG;
      coding->dst_pos_byte = BEG_BYTE;
    }
  else if (BUFFERP (dst_object))
    {
      code_conversion_save (0, 0);
      coding->dst_object = dst_object;
      coding->dst_pos = BUF_PT (XBUFFER (dst_object));
      coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else
    {
      code_conversion_save (0, 0);
      coding->dst_object = Qnil;
      /* Most callers presume this will return a multibyte result, and they
         won't use `binary' or `raw-text' anyway, so let's not worry about
         CODING_FOR_UNIBYTE.  */
      coding->dst_multibyte = 1;
    }

  decode_coding (coding);

  if (BUFFERP (coding->dst_object))
    set_buffer_internal (XBUFFER (coding->dst_object));

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;

      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
              old_deactivate_mark);
      val = safe_call1 (CODING_ATTR_POST_READ (attrs),
                        make_number (coding->produced_char));
      UNGCPRO;
      CHECK_NATNUM (val);
      /* The post-read function may have changed the amount of text;
         fold the difference in buffer size into the produced counts.  */
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
    }

  if (EQ (dst_object, Qt))
    {
      coding->dst_object = Fbuffer_string ();
    }
  else if (NILP (dst_object) && BUFFERP (coding->dst_object))
    {
      /* The text was decoded into a work buffer although the caller
         asked for it in memory.  */
      set_buffer_internal (XBUFFER (coding->dst_object));
      /* NOTE(review): the copy below happens only when the caller's
         area had to be enlarged; nothing is copied when DST_BYTES is
         already large enough -- confirm this is intended.  */
      if (dst_bytes < coding->produced)
        {
          destination = xrealloc (destination, coding->produced);
          /* NOTE(review): if xrealloc never returns NULL in this code
             base, this branch is dead -- confirm.  */
          if (! destination)
            {
              record_conversion_result (coding,
                                        CODING_RESULT_INSUFFICIENT_MEM);
              unbind_to (count, Qnil);
              return;
            }
          /* Move the gap out of the way if it lies inside the
             produced text, so that one memcpy can take all of it.  */
          if (BEGV < GPT && GPT < BEGV + coding->produced_char)
            move_gap_both (BEGV, BEGV_BYTE);
          memcpy (destination, BEGV_ADDR, coding->produced);
          coding->destination = destination;
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      current_buffer->text->inhibit_shrinking = 0;
      /* Point before the range is unaffected; point inside the range
         goes to its start; point after the range is shifted by the
         change in size.  In a unibyte buffer the byte difference is
         used for the character position as well.  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the markers flagged earlier back at the edges of the
             new text: markers with insertion_type set go to its
             start, the others to its end.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, coding->dst_object);
}
7757
7758
/* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
   SRC_OBJECT into DST_OBJECT by coding context CODING.

   SRC_OBJECT is a buffer, a string, or something else; in the last
   case the pre-write path below reads the text from CODING->source.

   If the coding system has a pre-write function, the source text is
   first copied into a work buffer and the function is called there
   with the buffer bounds as arguments; whatever buffer is current
   when the function returns becomes the source of the encoding.

   DST_OBJECT is a buffer, Qt, or something else.

   If it is a buffer, the destination position is point of that
   buffer, or FROM when it is the same buffer as SRC_OBJECT.  In the
   latter case the source range is deleted, and point and the markers
   that were at the edges of the range are repositioned afterwards.

   If it is Qt, a unibyte string is made from the encoded bytes and
   set in CODING->dst_object.

   Otherwise CODING->dst_object is set to Qnil before encoding.

   Vdeactivate_mark is preserved across the call.  */

void
encode_coding_object (struct coding_system *coding,
                      Lisp_Object src_object,
                      ptrdiff_t from, ptrdiff_t from_byte,
                      ptrdiff_t to, ptrdiff_t to_byte,
                      Lisp_Object dst_object)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  ptrdiff_t chars = to - from;
  ptrdiff_t bytes = to_byte - from_byte;
  Lisp_Object attrs;
  /* SAVED_PT stays negative unless source and destination are the
     same buffer; it doubles as the flag for the fix-up code at the
     end.  */
  ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
  bool need_marker_adjustment = 0;
  bool kill_src_buffer = 0;
  Lisp_Object old_deactivate_mark;

  old_deactivate_mark = Vdeactivate_mark;

  coding->src_object = src_object;
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  coding->src_multibyte = chars < bytes;

  attrs = CODING_ID_ATTRS (coding->id);

  if (EQ (src_object, dst_object))
    {
      struct Lisp_Marker *tail;

      /* Flag the markers sitting exactly at the edge of the range
         (FROM for markers with insertion_type set, TO for the others)
         so that they can be put back by hand afterwards.  Note that
         the markers of the current buffer are scanned.  */
      for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
        {
          tail->need_adjustment
            = tail->charpos == (tail->insertion_type ? from : to);
          need_marker_adjustment |= tail->need_adjustment;
        }
    }

  if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
    {
      /* Copy the source text into a work buffer for the pre-write
         function to operate on.  */
      coding->src_object = code_conversion_save (1, coding->src_multibyte);
      set_buffer_internal (XBUFFER (coding->src_object));
      if (STRINGP (src_object))
        insert_from_string (src_object, from, from_byte, chars, bytes, 0);
      else if (BUFFERP (src_object))
        insert_from_buffer (XBUFFER (src_object), from, chars, 0);
      else
        insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);

      if (EQ (src_object, dst_object))
        {
          /* The original text is going to be replaced; remove it now
             that a copy is in the work buffer.  */
          set_buffer_internal (XBUFFER (src_object));
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          del_range_both (from, from_byte, to, to_byte, 1);
          set_buffer_internal (XBUFFER (coding->src_object));
        }

      {
        struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;

        GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
                old_deactivate_mark);
        safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
                    make_number (BEG), make_number (Z));
        UNGCPRO;
      }
      /* If the pre-write function left a different buffer current,
         that buffer becomes the source and is killed at the end.  */
      if (XBUFFER (coding->src_object) != current_buffer)
        kill_src_buffer = 1;
      coding->src_object = Fcurrent_buffer ();
      if (BEG != GPT)
        move_gap_both (BEG, BEG_BYTE);
      /* Encode the whole of that buffer.  */
      coding->src_chars = Z - BEG;
      coding->src_bytes = Z_BYTE - BEG_BYTE;
      coding->src_pos = BEG;
      coding->src_pos_byte = BEG_BYTE;
      coding->src_multibyte = Z < Z_BYTE;
    }
  else if (STRINGP (src_object))
    {
      code_conversion_save (0, 0);
      coding->src_pos = from;
      coding->src_pos_byte = from_byte;
    }
  else if (BUFFERP (src_object))
    {
      code_conversion_save (0, 0);
      set_buffer_internal (XBUFFER (src_object));
      if (EQ (src_object, dst_object))
        {
          saved_pt = PT, saved_pt_byte = PT_BYTE;
          /* The value returned by del_range_1 becomes the source
             (presumably the deleted text as a string -- confirm).  */
          coding->src_object = del_range_1 (from, to, 1, 1);
          coding->src_pos = 0;
          coding->src_pos_byte = 0;
        }
      else
        {
          /* Move the gap to FROM when it lies inside the range
             (TO included).  */
          if (from < GPT && to >= GPT)
            move_gap_both (from, from_byte);
          coding->src_pos = from;
          coding->src_pos_byte = from_byte;
        }
    }
  else
    code_conversion_save (0, 0);

  if (BUFFERP (dst_object))
    {
      coding->dst_object = dst_object;
      if (EQ (src_object, dst_object))
        {
          coding->dst_pos = from;
          coding->dst_pos_byte = from_byte;
        }
      else
        {
          struct buffer *current = current_buffer;

          /* Visit the destination buffer just long enough to read
             point and to bring the gap there.  */
          set_buffer_temp (XBUFFER (dst_object));
          coding->dst_pos = PT;
          coding->dst_pos_byte = PT_BYTE;
          move_gap_both (coding->dst_pos, coding->dst_pos_byte);
          set_buffer_temp (current);
        }
      coding->dst_multibyte
        = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
    }
  else if (EQ (dst_object, Qt))
    {
      /* Encode into freshly allocated memory, initially one byte per
         source character (at least one byte); it is turned into a
         string and freed below.  */
      ptrdiff_t dst_bytes = max (1, coding->src_chars);
      coding->dst_object = Qnil;
      coding->destination = xmalloc (dst_bytes);
      coding->dst_bytes = dst_bytes;
      coding->dst_multibyte = 0;
    }
  else
    {
      coding->dst_object = Qnil;
      coding->dst_multibyte = 0;
    }

  encode_coding (coding);

  if (EQ (dst_object, Qt))
    {
      if (BUFFERP (coding->dst_object))
        coding->dst_object = Fbuffer_string ();
      else
        {
          coding->dst_object
            = make_unibyte_string ((char *) coding->destination,
                                   coding->produced);
          xfree (coding->destination);
        }
    }

  if (saved_pt >= 0)
    {
      /* This is the case of:
         (BUFFERP (src_object) && EQ (src_object, dst_object))
         As we have moved PT while replacing the original buffer
         contents, we must recover it now.  */
      set_buffer_internal (XBUFFER (src_object));
      /* Point before the range is unaffected; point inside the range
         goes to its start; point after the range is shifted by the
         change in size.  In a unibyte buffer the byte difference is
         used for the character position as well.  */
      if (saved_pt < from)
        TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
      else if (saved_pt < from + chars)
        TEMP_SET_PT_BOTH (from, from_byte);
      else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
                          saved_pt_byte + (coding->produced - bytes));
      else
        TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
                          saved_pt_byte + (coding->produced - bytes));

      if (need_marker_adjustment)
        {
          struct Lisp_Marker *tail;

          /* Put the markers flagged earlier back at the edges of the
             new text: markers with insertion_type set go to its
             start, the others to its end.  */
          for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
            if (tail->need_adjustment)
              {
                tail->need_adjustment = 0;
                if (tail->insertion_type)
                  {
                    tail->bytepos = from_byte;
                    tail->charpos = from;
                  }
                else
                  {
                    tail->bytepos = from_byte + coding->produced;
                    tail->charpos
                      = (NILP (BVAR (current_buffer, enable_multibyte_characters))
                         ? tail->bytepos : from + coding->produced_char);
                  }
              }
        }
    }

  if (kill_src_buffer)
    Fkill_buffer (coding->src_object);

  Vdeactivate_mark = old_deactivate_mark;
  unbind_to (count, Qnil);
}
7961
7962
7963 Lisp_Object
7964 preferred_coding_system (void)
7965 {
7966   int id = coding_categories[coding_priorities[0]].id;
7967
7968   return CODING_ID_NAME (id);
7969 }
7970
7971 \f
7972 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
7974
7975 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7976        doc: /* Return t if OBJECT is nil or a coding-system.
7977 See the documentation of `define-coding-system' for information
7978 about coding-system objects.  */)
7979   (Lisp_Object object)
7980 {
7981   if (NILP (object)
7982       || CODING_SYSTEM_ID (object) >= 0)
7983     return Qt;
7984   if (! SYMBOLP (object)
7985       || NILP (Fget (object, Qcoding_system_define_form)))
7986     return Qnil;
7987   return Qt;
7988 }
7989
7990 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7991        Sread_non_nil_coding_system, 1, 1, 0,
7992        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7993   (Lisp_Object prompt)
7994 {
7995   Lisp_Object val;
7996   do
7997     {
7998       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7999                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8000     }
8001   while (SCHARS (val) == 0);
8002   return (Fintern (val, Qnil));
8003 }
8004
8005 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8006        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8007 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8008 Ignores case when completing coding systems (all Emacs coding systems
8009 are lower-case).  */)
8010   (Lisp_Object prompt, Lisp_Object default_coding_system)
8011 {
8012   Lisp_Object val;
8013   ptrdiff_t count = SPECPDL_INDEX ();
8014
8015   if (SYMBOLP (default_coding_system))
8016     default_coding_system = SYMBOL_NAME (default_coding_system);
8017   specbind (Qcompletion_ignore_case, Qt);
8018   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8019                           Qt, Qnil, Qcoding_system_history,
8020                           default_coding_system, Qnil);
8021   unbind_to (count, Qnil);
8022   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8023 }
8024
8025 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8026        1, 1, 0,
8027        doc: /* Check validity of CODING-SYSTEM.
8028 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8029 It is valid if it is nil or a symbol defined as a coding system by the
8030 function `define-coding-system'.  */)
8031   (Lisp_Object coding_system)
8032 {
8033   Lisp_Object define_form;
8034
8035   define_form = Fget (coding_system, Qcoding_system_define_form);
8036   if (! NILP (define_form))
8037     {
8038       Fput (coding_system, Qcoding_system_define_form, Qnil);
8039       safe_eval (define_form);
8040     }
8041   if (!NILP (Fcoding_system_p (coding_system)))
8042     return coding_system;
8043   xsignal1 (Qcoding_system_error, coding_system);
8044 }
8045
8046 \f
8047 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8048    HIGHEST, return the coding system of the highest
8049    priority among the detected coding systems.  Otherwise return a
8050    list of detected coding systems sorted by their priorities.  If
8051    MULTIBYTEP, it is assumed that the bytes are in correct
8052    multibyte form but contain only ASCII and eight-bit chars.
8053    Otherwise, the bytes are raw bytes.
8054
8055    CODING-SYSTEM controls the detection as below:
8056
8057    If it is nil, detect both text-format and eol-format.  If the
8058    text-format part of CODING-SYSTEM is already specified
8059    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8060    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8061    detect only text-format.

        SRC_CHARS is recorded as the character count of the source.
        With HIGHEST, the value is nil when no coding system was
        detected.  */
8062
8063 Lisp_Object
8064 detect_coding_system (const unsigned char *src,
8065                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8066                       bool highest, bool multibytep,
8067                       Lisp_Object coding_system)
8068 {
8069   const unsigned char *src_end = src + src_bytes;
8070   Lisp_Object attrs, eol_type;
8071   Lisp_Object val = Qnil;
8072   struct coding_system coding;
8073   ptrdiff_t id;
8074   struct coding_detection_info detect_info;
8075   enum coding_category base_category;
8076   bool null_byte_found = 0, eight_bit_found = 0;
8077
       /* A nil CODING_SYSTEM is treated as `undecided'.  */
8078   if (NILP (coding_system))
8079     coding_system = Qundecided;
8080   setup_coding_system (coding_system, &coding);
8081   attrs = CODING_ID_ATTRS (coding.id);
8082   eol_type = CODING_ID_EOL_TYPE (coding.id);
8083   coding_system = CODING_ATTR_BASE_NAME (attrs);
8084
       /* Describe the source to the detectors; the whole input is
          handed over as one block flagged as the last one.  */
8085   coding.source = src;
8086   coding.src_chars = src_chars;
8087   coding.src_bytes = src_bytes;
8088   coding.src_multibyte = multibytep;
8089   coding.consumed = 0;
8090   coding.mode |= CODING_MODE_LAST_BLOCK;
8091   coding.head_ascii = 0;
8092
8093   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8094
8095   /* At first, detect text-format if necessary.  */
8096   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8097   if (base_category == coding_category_undecided)
8098     {
8099       enum coding_category category IF_LINT (= 0);
8100       struct coding_system *this IF_LINT (= NULL);
8101       int c, i;
8102
8103       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8104       for (; src < src_end; src++)
8105         {
8106           c = *src;
8107           if (c & 0x80)
8108             {
8109               eight_bit_found = 1;
8110               if (null_byte_found)
8111                 break;
8112             }
8113           else if (c < 0x20)
8114             {
8115               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8116                   && ! inhibit_iso_escape_detection
8117                   && ! detect_info.checked)
8118                 {
8119                   if (detect_coding_iso_2022 (&coding, &detect_info))
8120                     {
8121                       /* We have scanned the whole data.  */
8122                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8123                         {
8124                           /* We didn't find an 8-bit code.  We may
8125                              have found a null-byte, but it's very
8126                              rare that a binary file conforms to
8127                              ISO-2022.  */
8128                           src = src_end;
8129                           coding.head_ascii = src - coding.source;
8130                         }
8131                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8132                       break;
8133                     }
8134                 }
8135               else if (! c && !inhibit_null_byte_detection)
8136                 {
8137                   null_byte_found = 1;
8138                   if (eight_bit_found)
8139                     break;
8140                 }
                   /* head_ascii counts the leading bytes seen before
                      the first eight-bit byte.  */
8141               if (! eight_bit_found)
8142                 coding.head_ascii++;
8143             }
8144           else if (! eight_bit_found)
8145             coding.head_ascii++;
8146         }
8147
           /* Consult the per-category detectors only when the scan
              above saw something beyond plain ASCII, stopped early, or
              ISO-2022 detection already reported a match.  */
8148       if (null_byte_found || eight_bit_found
8149           || coding.head_ascii < coding.src_bytes
8150           || detect_info.found)
8151         {
8152           if (coding.head_ascii == coding.src_bytes)
8153             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8154             for (i = 0; i < coding_category_raw_text; i++)
8155               {
8156                 category = coding_priorities[i];
8157                 this = coding_categories + category;
8158                 if (detect_info.found & (1 << category))
8159                   break;
8160               }
8161           else
8162             {
                   /* A null byte was seen: mark every category outside
                      CATEGORY_MASK_UTF_16 as already checked and
                      rejected, so that only those detectors may still
                      run in the loop below.  */
8163               if (null_byte_found)
8164                 {
8165                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8166                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8167                 }
                   /* Try the categories in priority order.  With
                      HIGHEST, stop at the first one that is found.  */
8168               for (i = 0; i < coding_category_raw_text; i++)
8169                 {
8170                   category = coding_priorities[i];
8171                   this = coding_categories + category;
8172
8173                   if (this->id < 0)
8174                     {
8175                       /* No coding system of this category is defined.  */
8176                       detect_info.rejected |= (1 << category);
8177                     }
8178                   else if (category >= coding_category_raw_text)
8179                     continue;
8180                   else if (detect_info.checked & (1 << category))
8181                     {
8182                       if (highest
8183                           && (detect_info.found & (1 << category)))
8184                         break;
8185                     }
8186                   else if ((*(this->detector)) (&coding, &detect_info)
8187                            && highest
8188                            && (detect_info.found & (1 << category)))
8189                     {
                           /* For the auto category, narrow CATEGORY to
                              the byte order that was actually found.  */
8190                       if (category == coding_category_utf_16_auto)
8191                         {
8192                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8193                             category = coding_category_utf_16_le;
8194                           else
8195                             category = coding_category_utf_16_be;
8196                         }
8197                       break;
8198                     }
8199                 }
8200             }
8201         }
8202
           /* Turn the detection result into VAL, a list of coding
              system ids.  The eol-format pass below replaces the ids
              by coding system names.  */
8203       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8204           || null_byte_found)
8205         {
               /* Every category was rejected, or a null byte was seen:
                  report `no-conversion'.  */
8206           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8207           id = CODING_SYSTEM_ID (Qno_conversion);
8208           val = Fcons (make_number (id), Qnil);
8209         }
8210       else if (! detect_info.rejected && ! detect_info.found)
8211         {
               /* Nothing rejected and nothing found: report the coding
                  system of the undecided category.  */
8212           detect_info.found = CATEGORY_MASK_ANY;
8213           id = coding_categories[coding_category_undecided].id;
8214           val = Fcons (make_number (id), Qnil);
8215         }
8216       else if (highest)
8217         {
8218           if (detect_info.found)
8219             {
8220               detect_info.found = 1 << category;
8221               val = Fcons (make_number (this->id), Qnil);
8222             }
8223           else
                 /* Nothing was positively found: take the highest
                    priority category that was not rejected.  */
8224             for (i = 0; i < coding_category_raw_text; i++)
8225               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8226                 {
8227                   detect_info.found = 1 << coding_priorities[i];
8228                   id = coding_categories[coding_priorities[i]].id;
8229                   val = Fcons (make_number (id), Qnil);
8230                   break;
8231                 }
8232         }
8233       else
8234         {
8235           int mask = detect_info.rejected | detect_info.found;
8236           int found = 0;
8237
               /* Both loops walk the priorities from lowest to highest
                  and cons onto the front, so VAL ends up in priority
                  order.  The first loop collects the categories that
                  were neither rejected nor found; the second puts the
                  positively found ones in front of them.  */
8238           for (i = coding_category_raw_text - 1; i >= 0; i--)
8239             {
8240               category = coding_priorities[i];
8241               if (! (mask & (1 << category)))
8242                 {
8243                   found |= 1 << category;
8244                   id = coding_categories[category].id;
8245                   if (id >= 0)
8246                     val = Fcons (make_number (id), val);
8247                 }
8248             }
8249           for (i = coding_category_raw_text - 1; i >= 0; i--)
8250             {
8251               category = coding_priorities[i];
8252               if (detect_info.found & (1 << category))
8253                 {
8254                   id = coding_categories[category].id;
8255                   val = Fcons (make_number (id), val);
8256                 }
8257             }
8258           detect_info.found |= found;
8259         }
8260     }
8261   else if (base_category == coding_category_utf_8_auto)
8262     {
           /* Only the presence of a signature has to be decided.  */
8263       if (detect_coding_utf_8 (&coding, &detect_info))
8264         {
8265           struct coding_system *this;
8266
8267           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8268             this = coding_categories + coding_category_utf_8_sig;
8269           else
8270             this = coding_categories + coding_category_utf_8_nosig;
8271           val = Fcons (make_number (this->id), Qnil);
8272         }
8273     }
8274   else if (base_category == coding_category_utf_16_auto)
8275     {
           /* Pick the UTF-16 variant from what was found; when neither
              byte order was found, choose a nosig variant according to
              which one was rejected.  */
8276       if (detect_coding_utf_16 (&coding, &detect_info))
8277         {
8278           struct coding_system *this;
8279
8280           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8281             this = coding_categories + coding_category_utf_16_le;
8282           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8283             this = coding_categories + coding_category_utf_16_be;
8284           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8285             this = coding_categories + coding_category_utf_16_be_nosig;
8286           else
8287             this = coding_categories + coding_category_utf_16_le_nosig;
8288           val = Fcons (make_number (this->id), Qnil);
8289         }
8290     }
8291   else
8292     {
           /* The text-format is already specified; use it as is.  */
8293       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8294       val = Fcons (make_number (coding.id), Qnil);
8295     }
8296
8297   /* Then, detect eol-format if necessary.  */
8298   {
         /* One eol result per family.  A value that matches none of the
            EOL_SEEN_* constants tested further down (such as the
            initial -1) makes the entry keep the name given by
            CODING_ID_NAME.  */
8299     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8300     Lisp_Object tail;
8301
8302     if (VECTORP (eol_type))
8303       {
8304         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8305           {
                 /* With a null byte present, eol detection is skipped
                    and LF is used.  */
8306             if (null_byte_found)
8307               normal_eol = EOL_SEEN_LF;
8308             else
8309               normal_eol = detect_eol (coding.source, src_bytes,
8310                                        coding_category_raw_text);
8311           }
8312         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8313                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8314           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8315                                       coding_category_utf_16_be);
8316         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8317                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8318           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8319                                       coding_category_utf_16_le);
8320       }
8321     else
8322       {
             /* The eol-format of CODING-SYSTEM is already specified.  */
8323         if (EQ (eol_type, Qunix))
8324           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8325         else if (EQ (eol_type, Qdos))
8326           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8327         else
8328           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8329       }
8330
         /* Replace each id in VAL by a coding system name, taking the
            eol-specific entry from the eol-type vector when the eol
            format of its family is known.  */
8331     for (tail = val; CONSP (tail); tail = XCDR (tail))
8332       {
8333         enum coding_category category;
8334         int this_eol;
8335
8336         id = XINT (XCAR (tail));
8337         attrs = CODING_ID_ATTRS (id);
8338         category = XINT (CODING_ATTR_CATEGORY (attrs));
8339         eol_type = CODING_ID_EOL_TYPE (id);
8340         if (VECTORP (eol_type))
8341           {
8342             if (category == coding_category_utf_16_be
8343                 || category == coding_category_utf_16_be_nosig)
8344               this_eol = utf_16_be_eol;
8345             else if (category == coding_category_utf_16_le
8346                      || category == coding_category_utf_16_le_nosig)
8347               this_eol = utf_16_le_eol;
8348             else
8349               this_eol = normal_eol;
8350
8351             if (this_eol == EOL_SEEN_LF)
8352               XSETCAR (tail, AREF (eol_type, 0));
8353             else if (this_eol == EOL_SEEN_CRLF)
8354               XSETCAR (tail, AREF (eol_type, 1));
8355             else if (this_eol == EOL_SEEN_CR)
8356               XSETCAR (tail, AREF (eol_type, 2));
8357             else
8358               XSETCAR (tail, CODING_ID_NAME (id));
8359           }
8360         else
8361           XSETCAR (tail, CODING_ID_NAME (id));
8362       }
8363   }
8364
8365   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8366 }
8367
8368
8369 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8370        2, 3, 0,
8371        doc: /* Detect coding system of the text in the region between START and END.
8372 Return a list of possible coding systems ordered by priority.
8373 The coding systems to try and their priorities follows what
8374 the function `coding-system-priority-list' (which see) returns.
8375
8376 If only ASCII characters are found (except for such ISO-2022 control
8377 characters as ESC), it returns a list of single element `undecided'
8378 or its subsidiary coding system according to a detected end-of-line
8379 format.
8380
8381 If optional argument HIGHEST is non-nil, return the coding system of
8382 highest priority.  */)
8383   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8384 {
8385   ptrdiff_t from, to;
8386   ptrdiff_t from_byte, to_byte;
8387
8388   CHECK_NUMBER_COERCE_MARKER (start);
8389   CHECK_NUMBER_COERCE_MARKER (end);
8390
8391   validate_region (&start, &end);
8392   from = XINT (start), to = XINT (end);
8393   from_byte = CHAR_TO_BYTE (from);
8394   to_byte = CHAR_TO_BYTE (to);
8395
8396   if (from < GPT && to >= GPT)
8397     move_gap_both (to, to_byte);
8398
8399   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8400                                to - from, to_byte - from_byte,
8401                                !NILP (highest),
8402                                !NILP (BVAR (current_buffer
8403                                       , enable_multibyte_characters)),
8404                                Qnil);
8405 }
8406
8407 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8408        1, 2, 0,
8409        doc: /* Detect coding system of the text in STRING.
8410 Return a list of possible coding systems ordered by priority.
8411 The coding systems to try and their priorities follows what
8412 the function `coding-system-priority-list' (which see) returns.
8413
8414 If only ASCII characters are found (except for such ISO-2022 control
8415 characters as ESC), it returns a list of single element `undecided'
8416 or its subsidiary coding system according to a detected end-of-line
8417 format.
8418
8419 If optional argument HIGHEST is non-nil, return the coding system of
8420 highest priority.  */)
8421   (Lisp_Object string, Lisp_Object highest)
8422 {
8423   CHECK_STRING (string);
8424
8425   return detect_coding_system (SDATA (string),
8426                                SCHARS (string), SBYTES (string),
8427                                !NILP (highest), STRING_MULTIBYTE (string),
8428                                Qnil);
8429 }
8430
8431
8432 static inline bool
8433 char_encodable_p (int c, Lisp_Object attrs)
8434 {
8435   Lisp_Object tail;
8436   struct charset *charset;
8437   Lisp_Object translation_table;
8438
8439   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8440   if (! NILP (translation_table))
8441     c = translate_char (translation_table, c);
8442   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8443        CONSP (tail); tail = XCDR (tail))
8444     {
8445       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8446       if (CHAR_CHARSET_P (c, charset))
8447         break;
8448     }
8449   return (! NILP (tail));
8450 }
8451
8452
8453 /* Return a list of coding systems that safely encode the text between
8454    START and END.  If EXCLUDE is non-nil, it is a list of coding
8455    systems not to check.  The returned list doesn't contain any such
8456    coding systems.  In any case, if the text contains only ASCII or is
8457    unibyte, return t.  */
8458
8459 DEFUN ("find-coding-systems-region-internal",
8460        Ffind_coding_systems_region_internal,
8461        Sfind_coding_systems_region_internal, 2, 3, 0,
8462        doc: /* Internal use only.  */)
8463   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8464 {
8465   Lisp_Object coding_attrs_list, safe_codings;
8466   ptrdiff_t start_byte, end_byte;
8467   const unsigned char *p, *pbeg, *pend;
8468   int c;
8469   Lisp_Object tail, elt, work_table;
8470
8471   if (STRINGP (start))
8472     {
8473       if (!STRING_MULTIBYTE (start)
8474           || SCHARS (start) == SBYTES (start))
8475         return Qt;
8476       start_byte = 0;
8477       end_byte = SBYTES (start);
8478     }
8479   else
8480     {
8481       CHECK_NUMBER_COERCE_MARKER (start);
8482       CHECK_NUMBER_COERCE_MARKER (end);
8483       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8484         args_out_of_range (start, end);
8485       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8486         return Qt;
8487       start_byte = CHAR_TO_BYTE (XINT (start));
8488       end_byte = CHAR_TO_BYTE (XINT (end));
8489       if (XINT (end) - XINT (start) == end_byte - start_byte)
8490         return Qt;
8491
8492       if (XINT (start) < GPT && XINT (end) > GPT)
8493         {
8494           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8495             move_gap_both (XINT (start), start_byte);
8496           else
8497             move_gap_both (XINT (end), end_byte);
8498         }
8499     }
8500
8501   coding_attrs_list = Qnil;
8502   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8503     if (NILP (exclude)
8504         || NILP (Fmemq (XCAR (tail), exclude)))
8505       {
8506         Lisp_Object attrs;
8507
8508         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8509         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8510             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8511           {
8512             ASET (attrs, coding_attr_trans_tbl,
8513                   get_translation_table (attrs, 1, NULL));
8514             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8515           }
8516       }
8517
8518   if (STRINGP (start))
8519     p = pbeg = SDATA (start);
8520   else
8521     p = pbeg = BYTE_POS_ADDR (start_byte);
8522   pend = p + (end_byte - start_byte);
8523
8524   while (p < pend && ASCII_BYTE_P (*p)) p++;
8525   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8526
8527   work_table = Fmake_char_table (Qnil, Qnil);
8528   while (p < pend)
8529     {
8530       if (ASCII_BYTE_P (*p))
8531         p++;
8532       else
8533         {
8534           c = STRING_CHAR_ADVANCE (p);
8535           if (!NILP (char_table_ref (work_table, c)))
8536             /* This character was already checked.  Ignore it.  */
8537             continue;
8538
8539           charset_map_loaded = 0;
8540           for (tail = coding_attrs_list; CONSP (tail);)
8541             {
8542               elt = XCAR (tail);
8543               if (NILP (elt))
8544                 tail = XCDR (tail);
8545               else if (char_encodable_p (c, elt))
8546                 tail = XCDR (tail);
8547               else if (CONSP (XCDR (tail)))
8548                 {
8549                   XSETCAR (tail, XCAR (XCDR (tail)));
8550                   XSETCDR (tail, XCDR (XCDR (tail)));
8551                 }
8552               else
8553                 {
8554                   XSETCAR (tail, Qnil);
8555                   tail = XCDR (tail);
8556                 }
8557             }
8558           if (charset_map_loaded)
8559             {
8560               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8561
8562               if (STRINGP (start))
8563                 pbeg = SDATA (start);
8564               else
8565                 pbeg = BYTE_POS_ADDR (start_byte);
8566               p = pbeg + p_offset;
8567               pend = pbeg + pend_offset;
8568             }
8569           char_table_set (work_table, c, Qt);
8570         }
8571     }
8572
8573   safe_codings = list2 (Qraw_text, Qno_conversion);
8574   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8575     if (! NILP (XCAR (tail)))
8576       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8577
8578   return safe_codings;
8579 }
8580
8581
8582 DEFUN ("unencodable-char-position", Funencodable_char_position,
8583        Sunencodable_char_position, 3, 5, 0,
8584        doc: /*
8585 Return position of first un-encodable character in a region.
8586 START and END specify the region and CODING-SYSTEM specifies the
8587 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8588
8589 If optional 4th argument COUNT is non-nil, it specifies at most how
8590 many un-encodable characters to search.  In this case, the value is a
8591 list of positions.
8592
8593 If optional 5th argument STRING is non-nil, it is a string to search
8594 for un-encodable characters.  In that case, START and END are indexes
8595 to the string.  */)
8596   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8597 {
8598   EMACS_INT n;
8599   struct coding_system coding;
8600   Lisp_Object attrs, charset_list, translation_table;
8601   Lisp_Object positions;
8602   ptrdiff_t from, to;
8603   const unsigned char *p, *stop, *pend;
8604   bool ascii_compatible;
8605
8606   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8607   attrs = CODING_ID_ATTRS (coding.id);
8608   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8609     return Qnil;
8610   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8611   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8612   translation_table = get_translation_table (attrs, 1, NULL);
8613
8614   if (NILP (string))
8615     {
8616       validate_region (&start, &end);
8617       from = XINT (start);
8618       to = XINT (end);
8619       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8620           || (ascii_compatible
8621               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8622         return Qnil;
8623       p = CHAR_POS_ADDR (from);
8624       pend = CHAR_POS_ADDR (to);
8625       if (from < GPT && to >= GPT)
8626         stop = GPT_ADDR;
8627       else
8628         stop = pend;
8629     }
8630   else
8631     {
8632       CHECK_STRING (string);
8633       CHECK_NATNUM (start);
8634       CHECK_NATNUM (end);
8635       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8636         args_out_of_range_3 (string, start, end);
8637       from = XINT (start);
8638       to = XINT (end);
8639       if (! STRING_MULTIBYTE (string))
8640         return Qnil;
8641       p = SDATA (string) + string_char_to_byte (string, from);
8642       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8643       if (ascii_compatible && (to - from) == (pend - p))
8644         return Qnil;
8645     }
8646
8647   if (NILP (count))
8648     n = 1;
8649   else
8650     {
8651       CHECK_NATNUM (count);
8652       n = XINT (count);
8653     }
8654
8655   positions = Qnil;
8656   charset_map_loaded = 0;
8657   while (1)
8658     {
8659       int c;
8660
8661       if (ascii_compatible)
8662         while (p < stop && ASCII_BYTE_P (*p))
8663           p++, from++;
8664       if (p >= stop)
8665         {
8666           if (p >= pend)
8667             break;
8668           stop = pend;
8669           p = GAP_END_ADDR;
8670         }
8671
8672       c = STRING_CHAR_ADVANCE (p);
8673       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8674           && ! char_charset (translate_char (translation_table, c),
8675                              charset_list, NULL))
8676         {
8677           positions = Fcons (make_number (from), positions);
8678           n--;
8679           if (n == 0)
8680             break;
8681         }
8682
8683       from++;
8684       if (charset_map_loaded && NILP (string))
8685         {
8686           p = CHAR_POS_ADDR (from);
8687           pend = CHAR_POS_ADDR (to);
8688           if (from < GPT && to >= GPT)
8689             stop = GPT_ADDR;
8690           else
8691             stop = pend;
8692           charset_map_loaded = 0;
8693         }
8694     }
8695
8696   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8697 }
8698
8699
8700 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8701        Scheck_coding_systems_region, 3, 3, 0,
8702        doc: /* Check if the region is encodable by coding systems.
8703
8704 START and END are buffer positions specifying the region.
8705 CODING-SYSTEM-LIST is a list of coding systems to check.
8706
8707 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8708 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8709 whole region, POS0, POS1, ... are buffer positions where non-encodable
8710 characters are found.
8711
8712 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8713 value is nil.
8714
8715 START may be a string.  In that case, check if the string is
8716 encodable, and the value contains indices to the string instead of
8717 buffer positions.  END is ignored.
8718
8719 If the current buffer (or START if it is a string) is unibyte, the value
8720 is nil.  */)
8721   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8722 {
8723   Lisp_Object list;
8724   ptrdiff_t start_byte, end_byte;
8725   ptrdiff_t pos;
8726   const unsigned char *p, *pbeg, *pend;
8727   int c;
8728   Lisp_Object tail, elt, attrs;
8729
8730   if (STRINGP (start))
8731     {
8732       if (!STRING_MULTIBYTE (start)
8733           || SCHARS (start) == SBYTES (start))
8734         return Qnil;
8735       start_byte = 0;
8736       end_byte = SBYTES (start);
8737       pos = 0;
8738     }
8739   else
8740     {
8741       CHECK_NUMBER_COERCE_MARKER (start);
8742       CHECK_NUMBER_COERCE_MARKER (end);
8743       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8744         args_out_of_range (start, end);
8745       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8746         return Qnil;
8747       start_byte = CHAR_TO_BYTE (XINT (start));
8748       end_byte = CHAR_TO_BYTE (XINT (end));
8749       if (XINT (end) - XINT (start) == end_byte - start_byte)
8750         return Qnil;
8751
8752       if (XINT (start) < GPT && XINT (end) > GPT)
8753         {
8754           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8755             move_gap_both (XINT (start), start_byte);
8756           else
8757             move_gap_both (XINT (end), end_byte);
8758         }
8759       pos = XINT (start);
8760     }
8761
8762   list = Qnil;
8763   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8764     {
8765       elt = XCAR (tail);
8766       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8767       ASET (attrs, coding_attr_trans_tbl,
8768             get_translation_table (attrs, 1, NULL));
8769       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8770     }
8771
8772   if (STRINGP (start))
8773     p = pbeg = SDATA (start);
8774   else
8775     p = pbeg = BYTE_POS_ADDR (start_byte);
8776   pend = p + (end_byte - start_byte);
8777
8778   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8779   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8780
8781   while (p < pend)
8782     {
8783       if (ASCII_BYTE_P (*p))
8784         p++;
8785       else
8786         {
8787           c = STRING_CHAR_ADVANCE (p);
8788
8789           charset_map_loaded = 0;
8790           for (tail = list; CONSP (tail); tail = XCDR (tail))
8791             {
8792               elt = XCDR (XCAR (tail));
8793               if (! char_encodable_p (c, XCAR (elt)))
8794                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8795             }
8796           if (charset_map_loaded)
8797             {
8798               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8799
8800               if (STRINGP (start))
8801                 pbeg = SDATA (start);
8802               else
8803                 pbeg = BYTE_POS_ADDR (start_byte);
8804               p = pbeg + p_offset;
8805               pend = pbeg + pend_offset;
8806             }
8807         }
8808       pos++;
8809     }
8810
8811   tail = list;
8812   list = Qnil;
8813   for (; CONSP (tail); tail = XCDR (tail))
8814     {
8815       elt = XCAR (tail);
8816       if (CONSP (XCDR (XCDR (elt))))
8817         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8818                       list);
8819     }
8820
8821   return list;
8822 }
8823
8824
8825 static Lisp_Object
8826 code_convert_region (Lisp_Object start, Lisp_Object end,
8827                      Lisp_Object coding_system, Lisp_Object dst_object,
8828                      bool encodep, bool norecord)
8829 {
8830   struct coding_system coding;
8831   ptrdiff_t from, from_byte, to, to_byte;
8832   Lisp_Object src_object;
8833
8834   CHECK_NUMBER_COERCE_MARKER (start);
8835   CHECK_NUMBER_COERCE_MARKER (end);
8836   if (NILP (coding_system))
8837     coding_system = Qno_conversion;
8838   else
8839     CHECK_CODING_SYSTEM (coding_system);
8840   src_object = Fcurrent_buffer ();
8841   if (NILP (dst_object))
8842     dst_object = src_object;
8843   else if (! EQ (dst_object, Qt))
8844     CHECK_BUFFER (dst_object);
8845
8846   validate_region (&start, &end);
8847   from = XFASTINT (start);
8848   from_byte = CHAR_TO_BYTE (from);
8849   to = XFASTINT (end);
8850   to_byte = CHAR_TO_BYTE (to);
8851
8852   setup_coding_system (coding_system, &coding);
8853   coding.mode |= CODING_MODE_LAST_BLOCK;
8854
8855   if (encodep)
8856     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8857                           dst_object);
8858   else
8859     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8860                           dst_object);
8861   if (! norecord)
8862     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8863
8864   return (BUFFERP (dst_object)
8865           ? make_number (coding.produced_char)
8866           : coding.dst_object);
8867 }
8868
8869
8870 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8871        3, 4, "r\nzCoding system: ",
8872        doc: /* Decode the current region from the specified coding system.
8873 When called from a program, takes four arguments:
8874         START, END, CODING-SYSTEM, and DESTINATION.
8875 START and END are buffer positions.
8876
8877 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8878 If nil, the region between START and END is replaced by the decoded text.
8879 If buffer, the decoded text is inserted in that buffer after point (point
8880 does not move).
8881 In those cases, the length of the decoded text is returned.
8882 If DESTINATION is t, the decoded text is returned.
8883
8884 This function sets `last-coding-system-used' to the precise coding system
8885 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8886 not fully specified.)  */)
8887   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8888 {
8889   return code_convert_region (start, end, coding_system, destination, 0, 0);
8890 }
8891
8892 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8893        3, 4, "r\nzCoding system: ",
8894        doc: /* Encode the current region by specified coding system.
8895 When called from a program, takes four arguments:
8896         START, END, CODING-SYSTEM and DESTINATION.
8897 START and END are buffer positions.
8898
8899 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8900 If nil, the region between START and END is replace by the encoded text.
8901 If buffer, the encoded text is inserted in that buffer after point (point
8902 does not move).
8903 In those cases, the length of the encoded text is returned.
8904 If DESTINATION is t, the encoded text is returned.
8905
8906 This function sets `last-coding-system-used' to the precise coding system
8907 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8908 not fully specified.)  */)
8909   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8910 {
8911   return code_convert_region (start, end, coding_system, destination, 1, 0);
8912 }
8913
8914 Lisp_Object
8915 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8916                      Lisp_Object dst_object, bool encodep, bool nocopy,
8917                      bool norecord)
8918 {
8919   struct coding_system coding;
8920   ptrdiff_t chars, bytes;
8921
8922   CHECK_STRING (string);
8923   if (NILP (coding_system))
8924     {
8925       if (! norecord)
8926         Vlast_coding_system_used = Qno_conversion;
8927       if (NILP (dst_object))
8928         return (nocopy ? Fcopy_sequence (string) : string);
8929     }
8930
8931   if (NILP (coding_system))
8932     coding_system = Qno_conversion;
8933   else
8934     CHECK_CODING_SYSTEM (coding_system);
8935   if (NILP (dst_object))
8936     dst_object = Qt;
8937   else if (! EQ (dst_object, Qt))
8938     CHECK_BUFFER (dst_object);
8939
8940   setup_coding_system (coding_system, &coding);
8941   coding.mode |= CODING_MODE_LAST_BLOCK;
8942   chars = SCHARS (string);
8943   bytes = SBYTES (string);
8944   if (encodep)
8945     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8946   else
8947     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8948   if (! norecord)
8949     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8950
8951   return (BUFFERP (dst_object)
8952           ? make_number (coding.produced_char)
8953           : coding.dst_object);
8954 }
8955
8956
8957 /* Encode or decode STRING according to CODING_SYSTEM.
8958    Do not set Vlast_coding_system_used.
8959
8960    This function is called only from macros DECODE_FILE and
8961    ENCODE_FILE, thus we ignore character composition.  */
8962
8963 Lisp_Object
8964 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
8965                               bool encodep)
8966 {
8967   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8968 }
8969
8970
8971 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8972        2, 4, 0,
8973        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8974
8975 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8976 if the decoding operation is trivial.
8977
8978 Optional fourth arg BUFFER non-nil means that the decoded text is
8979 inserted in that buffer after point (point does not move).  In this
8980 case, the return value is the length of the decoded text.
8981
8982 This function sets `last-coding-system-used' to the precise coding system
8983 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8984 not fully specified.)  */)
8985   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
8986 {
8987   return code_convert_string (string, coding_system, buffer,
8988                               0, ! NILP (nocopy), 0);
8989 }
8990
8991 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8992        2, 4, 0,
8993        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8994
8995 Optional third arg NOCOPY non-nil means it is OK to return STRING
8996 itself if the encoding operation is trivial.
8997
8998 Optional fourth arg BUFFER non-nil means that the encoded text is
8999 inserted in that buffer after point (point does not move).  In this
9000 case, the return value is the length of the encoded text.
9001
9002 This function sets `last-coding-system-used' to the precise coding system
9003 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9004 not fully specified.)  */)
9005   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9006 {
9007   return code_convert_string (string, coding_system, buffer,
9008                               1, ! NILP (nocopy), 0);
9009 }
9010
9011 \f
9012 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9013        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9014 Return the corresponding character.  */)
9015   (Lisp_Object code)
9016 {
9017   Lisp_Object spec, attrs, val;
9018   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9019   EMACS_INT ch;
9020   int c;
9021
9022   CHECK_NATNUM (code);
9023   ch = XFASTINT (code);
9024   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9025   attrs = AREF (spec, 0);
9026
9027   if (ASCII_BYTE_P (ch)
9028       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9029     return code;
9030
9031   val = CODING_ATTR_CHARSET_LIST (attrs);
9032   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9033   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9034   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9035
9036   if (ch <= 0x7F)
9037     {
9038       c = ch;
9039       charset = charset_roman;
9040     }
9041   else if (ch >= 0xA0 && ch < 0xDF)
9042     {
9043       c = ch - 0x80;
9044       charset = charset_kana;
9045     }
9046   else
9047     {
9048       EMACS_INT c1 = ch >> 8;
9049       int c2 = ch & 0xFF;
9050
9051       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9052           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9053         error ("Invalid code: %"pI"d", ch);
9054       c = ch;
9055       SJIS_TO_JIS (c);
9056       charset = charset_kanji;
9057     }
9058   c = DECODE_CHAR (charset, c);
9059   if (c < 0)
9060     error ("Invalid code: %"pI"d", ch);
9061   return make_number (c);
9062 }
9063
9064
9065 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9066        doc: /* Encode a Japanese character CH to shift_jis encoding.
9067 Return the corresponding code in SJIS.  */)
9068   (Lisp_Object ch)
9069 {
9070   Lisp_Object spec, attrs, charset_list;
9071   int c;
9072   struct charset *charset;
9073   unsigned code;
9074
9075   CHECK_CHARACTER (ch);
9076   c = XFASTINT (ch);
9077   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9078   attrs = AREF (spec, 0);
9079
9080   if (ASCII_CHAR_P (c)
9081       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9082     return ch;
9083
9084   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9085   charset = char_charset (c, charset_list, &code);
9086   if (code == CHARSET_INVALID_CODE (charset))
9087     error ("Can't encode by shift_jis encoding: %c", c);
9088   JIS_TO_SJIS (code);
9089
9090   return make_number (code);
9091 }
9092
9093 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9094        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9095 Return the corresponding character.  */)
9096   (Lisp_Object code)
9097 {
9098   Lisp_Object spec, attrs, val;
9099   struct charset *charset_roman, *charset_big5, *charset;
9100   EMACS_INT ch;
9101   int c;
9102
9103   CHECK_NATNUM (code);
9104   ch = XFASTINT (code);
9105   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9106   attrs = AREF (spec, 0);
9107
9108   if (ASCII_BYTE_P (ch)
9109       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9110     return code;
9111
9112   val = CODING_ATTR_CHARSET_LIST (attrs);
9113   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9114   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9115
9116   if (ch <= 0x7F)
9117     {
9118       c = ch;
9119       charset = charset_roman;
9120     }
9121   else
9122     {
9123       EMACS_INT b1 = ch >> 8;
9124       int b2 = ch & 0x7F;
9125       if (b1 < 0xA1 || b1 > 0xFE
9126           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9127         error ("Invalid code: %"pI"d", ch);
9128       c = ch;
9129       charset = charset_big5;
9130     }
9131   c = DECODE_CHAR (charset, c);
9132   if (c < 0)
9133     error ("Invalid code: %"pI"d", ch);
9134   return make_number (c);
9135 }
9136
9137 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9138        doc: /* Encode the Big5 character CH to BIG5 coding system.
9139 Return the corresponding character code in Big5.  */)
9140   (Lisp_Object ch)
9141 {
9142   Lisp_Object spec, attrs, charset_list;
9143   struct charset *charset;
9144   int c;
9145   unsigned code;
9146
9147   CHECK_CHARACTER (ch);
9148   c = XFASTINT (ch);
9149   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9150   attrs = AREF (spec, 0);
9151   if (ASCII_CHAR_P (c)
9152       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9153     return ch;
9154
9155   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9156   charset = char_charset (c, charset_list, &code);
9157   if (code == CHARSET_INVALID_CODE (charset))
9158     error ("Can't encode by Big5 encoding: %c", c);
9159
9160   return make_number (code);
9161 }
9162
9163 \f
9164 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9165        Sset_terminal_coding_system_internal, 1, 2, 0,
9166        doc: /* Internal use only.  */)
9167   (Lisp_Object coding_system, Lisp_Object terminal)
9168 {
9169   struct terminal *term = get_terminal (terminal, 1);
9170   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9171   CHECK_SYMBOL (coding_system);
9172   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9173   /* We had better not send unsafe characters to terminal.  */
9174   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9175   /* Character composition should be disabled.  */
9176   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9177   terminal_coding->src_multibyte = 1;
9178   terminal_coding->dst_multibyte = 0;
9179   tset_charset_list
9180     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9181             ? coding_charset_list (terminal_coding)
9182             : Fcons (make_number (charset_ascii), Qnil)));
9183   return Qnil;
9184 }
9185
9186 DEFUN ("set-safe-terminal-coding-system-internal",
9187        Fset_safe_terminal_coding_system_internal,
9188        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9189        doc: /* Internal use only.  */)
9190   (Lisp_Object coding_system)
9191 {
9192   CHECK_SYMBOL (coding_system);
9193   setup_coding_system (Fcheck_coding_system (coding_system),
9194                        &safe_terminal_coding);
9195   /* Character composition should be disabled.  */
9196   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9197   safe_terminal_coding.src_multibyte = 1;
9198   safe_terminal_coding.dst_multibyte = 0;
9199   return Qnil;
9200 }
9201
9202 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9203        Sterminal_coding_system, 0, 1, 0,
9204        doc: /* Return coding system specified for terminal output on the given terminal.
9205 TERMINAL may be a terminal object, a frame, or nil for the selected
9206 frame's terminal device.  */)
9207   (Lisp_Object terminal)
9208 {
9209   struct coding_system *terminal_coding
9210     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9211   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9212
9213   /* For backward compatibility, return nil if it is `undecided'.  */
9214   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9215 }
9216
9217 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9218        Sset_keyboard_coding_system_internal, 1, 2, 0,
9219        doc: /* Internal use only.  */)
9220   (Lisp_Object coding_system, Lisp_Object terminal)
9221 {
9222   struct terminal *t = get_terminal (terminal, 1);
9223   CHECK_SYMBOL (coding_system);
9224   if (NILP (coding_system))
9225     coding_system = Qno_conversion;
9226   else
9227     Fcheck_coding_system (coding_system);
9228   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9229   /* Character composition should be disabled.  */
9230   TERMINAL_KEYBOARD_CODING (t)->common_flags
9231     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9232   return Qnil;
9233 }
9234
9235 DEFUN ("keyboard-coding-system",
9236        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9237        doc: /* Return coding system specified for decoding keyboard input.  */)
9238   (Lisp_Object terminal)
9239 {
9240   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9241                          (get_terminal (terminal, 1))->id);
9242 }
9243
9244 \f
9245 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9246        Sfind_operation_coding_system,  1, MANY, 0,
9247        doc: /* Choose a coding system for an operation based on the target name.
9248 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9249 DECODING-SYSTEM is the coding system to use for decoding
9250 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9251 for encoding (in case OPERATION does encoding).
9252
9253 The first argument OPERATION specifies an I/O primitive:
9254   For file I/O, `insert-file-contents' or `write-region'.
9255   For process I/O, `call-process', `call-process-region', or `start-process'.
9256   For network I/O, `open-network-stream'.
9257
9258 The remaining arguments should be the same arguments that were passed
9259 to the primitive.  Depending on which primitive, one of those arguments
9260 is selected as the TARGET.  For example, if OPERATION does file I/O,
9261 whichever argument specifies the file name is TARGET.
9262
9263 TARGET has a meaning which depends on OPERATION:
9264   For file I/O, TARGET is a file name (except for the special case below).
9265   For process I/O, TARGET is a process name.
9266   For network I/O, TARGET is a service name or a port number.
9267
9268 This function looks up what is specified for TARGET in
9269 `file-coding-system-alist', `process-coding-system-alist',
9270 or `network-coding-system-alist' depending on OPERATION.
9271 They may specify a coding system, a cons of coding systems,
9272 or a function symbol to call.
9273 In the last case, we call the function with one argument,
9274 which is a list of all the arguments given to this function.
9275 If the function can't decide a coding system, it can return
9276 `undecided' so that the normal code-detection is performed.
9277
9278 If OPERATION is `insert-file-contents', the argument corresponding to
9279 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9280 file name to look up, and BUFFER is a buffer that contains the file's
9281 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9282 function to call for FILENAME, that function should examine the
9283 contents of BUFFER instead of reading the file.
9284
9285 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9286   (ptrdiff_t nargs, Lisp_Object *args)
9287 {
9288   Lisp_Object operation, target_idx, target, val;
9289   register Lisp_Object chain;
9290
9291   if (nargs < 2)
9292     error ("Too few arguments");
9293   operation = args[0];
9294   if (!SYMBOLP (operation)
9295       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9296     error ("Invalid first argument");
9297   if (nargs <= 1 + XFASTINT (target_idx))
9298     error ("Too few arguments for operation `%s'",
9299            SDATA (SYMBOL_NAME (operation)));
9300   target = args[XFASTINT (target_idx) + 1];
9301   if (!(STRINGP (target)
9302         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9303             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9304         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9305     error ("Invalid argument %"pI"d of operation `%s'",
9306            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9307   if (CONSP (target))
9308     target = XCAR (target);
9309
9310   chain = ((EQ (operation, Qinsert_file_contents)
9311             || EQ (operation, Qwrite_region))
9312            ? Vfile_coding_system_alist
9313            : (EQ (operation, Qopen_network_stream)
9314               ? Vnetwork_coding_system_alist
9315               : Vprocess_coding_system_alist));
9316   if (NILP (chain))
9317     return Qnil;
9318
9319   for (; CONSP (chain); chain = XCDR (chain))
9320     {
9321       Lisp_Object elt;
9322
9323       elt = XCAR (chain);
9324       if (CONSP (elt)
9325           && ((STRINGP (target)
9326                && STRINGP (XCAR (elt))
9327                && fast_string_match (XCAR (elt), target) >= 0)
9328               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9329         {
9330           val = XCDR (elt);
9331           /* Here, if VAL is both a valid coding system and a valid
9332              function symbol, we return VAL as a coding system.  */
9333           if (CONSP (val))
9334             return val;
9335           if (! SYMBOLP (val))
9336             return Qnil;
9337           if (! NILP (Fcoding_system_p (val)))
9338             return Fcons (val, val);
9339           if (! NILP (Ffboundp (val)))
9340             {
9341               /* We use call1 rather than safe_call1
9342                  so as to get bug reports about functions called here
9343                  which don't handle the current interface.  */
9344               val = call1 (val, Flist (nargs, args));
9345               if (CONSP (val))
9346                 return val;
9347               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9348                 return Fcons (val, val);
9349             }
9350           return Qnil;
9351         }
9352     }
9353   return Qnil;
9354 }
9355
9356 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9357        Sset_coding_system_priority, 0, MANY, 0,
9358        doc: /* Assign higher priority to the coding systems given as arguments.
9359 If multiple coding systems belong to the same category,
9360 all but the first one are ignored.
9361
9362 usage: (set-coding-system-priority &rest coding-systems)  */)
9363   (ptrdiff_t nargs, Lisp_Object *args)
9364 {
9365   ptrdiff_t i, j;
9366   bool changed[coding_category_max];
9367   enum coding_category priorities[coding_category_max];
9368
9369   memset (changed, 0, sizeof changed);
9370
9371   for (i = j = 0; i < nargs; i++)
9372     {
9373       enum coding_category category;
9374       Lisp_Object spec, attrs;
9375
9376       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9377       attrs = AREF (spec, 0);
9378       category = XINT (CODING_ATTR_CATEGORY (attrs));
9379       if (changed[category])
9380         /* Ignore this coding system because a coding system of the
9381            same category already had a higher priority.  */
9382         continue;
9383       changed[category] = 1;
9384       priorities[j++] = category;
9385       if (coding_categories[category].id >= 0
9386           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9387         setup_coding_system (args[i], &coding_categories[category]);
9388       Fset (AREF (Vcoding_category_table, category), args[i]);
9389     }
9390
9391   /* Now we have decided top J priorities.  Reflect the order of the
9392      original priorities to the remaining priorities.  */
9393
9394   for (i = j, j = 0; i < coding_category_max; i++, j++)
9395     {
9396       while (j < coding_category_max
9397              && changed[coding_priorities[j]])
9398         j++;
9399       if (j == coding_category_max)
9400         emacs_abort ();
9401       priorities[i] = coding_priorities[j];
9402     }
9403
9404   memcpy (coding_priorities, priorities, sizeof priorities);
9405
9406   /* Update `coding-category-list'.  */
9407   Vcoding_category_list = Qnil;
9408   for (i = coding_category_max; i-- > 0; )
9409     Vcoding_category_list
9410       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9411                Vcoding_category_list);
9412
9413   return Qnil;
9414 }
9415
9416 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9417        Scoding_system_priority_list, 0, 1, 0,
9418        doc: /* Return a list of coding systems ordered by their priorities.
9419 The list contains a subset of coding systems; i.e. coding systems
9420 assigned to each coding category (see `coding-category-list').
9421
9422 HIGHESTP non-nil means just return the highest priority one.  */)
9423   (Lisp_Object highestp)
9424 {
9425   int i;
9426   Lisp_Object val;
9427
9428   for (i = 0, val = Qnil; i < coding_category_max; i++)
9429     {
9430       enum coding_category category = coding_priorities[i];
9431       int id = coding_categories[category].id;
9432       Lisp_Object attrs;
9433
9434       if (id < 0)
9435         continue;
9436       attrs = CODING_ID_ATTRS (id);
9437       if (! NILP (highestp))
9438         return CODING_ATTR_BASE_NAME (attrs);
9439       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9440     }
9441   return Fnreverse (val);
9442 }
9443
9444 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9445
9446 static Lisp_Object
9447 make_subsidiaries (Lisp_Object base)
9448 {
9449   Lisp_Object subsidiaries;
9450   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9451   char *buf = alloca (base_name_len + 6);
9452   int i;
9453
9454   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9455   subsidiaries = Fmake_vector (make_number (3), Qnil);
9456   for (i = 0; i < 3; i++)
9457     {
9458       strcpy (buf + base_name_len, suffixes[i]);
9459       ASET (subsidiaries, i, intern (buf));
9460     }
9461   return subsidiaries;
9462 }
9463
9464
9465 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9466        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9467        doc: /* For internal use only.
9468 usage: (define-coding-system-internal ...)  */)
9469   (ptrdiff_t nargs, Lisp_Object *args)
9470 {
9471   Lisp_Object name;
9472   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9473   Lisp_Object attrs;            /* Vector of attributes.  */
9474   Lisp_Object eol_type;
9475   Lisp_Object aliases;
9476   Lisp_Object coding_type, charset_list, safe_charsets;
9477   enum coding_category category;
9478   Lisp_Object tail, val;
9479   int max_charset_id = 0;
9480   int i;
9481
9482   if (nargs < coding_arg_max)
9483     goto short_args;
9484
9485   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9486
9487   name = args[coding_arg_name];
9488   CHECK_SYMBOL (name);
9489   ASET (attrs, coding_attr_base_name, name);
9490
9491   val = args[coding_arg_mnemonic];
9492   if (! STRINGP (val))
9493     CHECK_CHARACTER (val);
9494   ASET (attrs, coding_attr_mnemonic, val);
9495
9496   coding_type = args[coding_arg_coding_type];
9497   CHECK_SYMBOL (coding_type);
9498   ASET (attrs, coding_attr_type, coding_type);
9499
9500   charset_list = args[coding_arg_charset_list];
9501   if (SYMBOLP (charset_list))
9502     {
9503       if (EQ (charset_list, Qiso_2022))
9504         {
9505           if (! EQ (coding_type, Qiso_2022))
9506             error ("Invalid charset-list");
9507           charset_list = Viso_2022_charset_list;
9508         }
9509       else if (EQ (charset_list, Qemacs_mule))
9510         {
9511           if (! EQ (coding_type, Qemacs_mule))
9512             error ("Invalid charset-list");
9513           charset_list = Vemacs_mule_charset_list;
9514         }
9515       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9516         {
9517           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9518             error ("Invalid charset-list");
9519           if (max_charset_id < XFASTINT (XCAR (tail)))
9520             max_charset_id = XFASTINT (XCAR (tail));
9521         }
9522     }
9523   else
9524     {
9525       charset_list = Fcopy_sequence (charset_list);
9526       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9527         {
9528           struct charset *charset;
9529
9530           val = XCAR (tail);
9531           CHECK_CHARSET_GET_CHARSET (val, charset);
9532           if (EQ (coding_type, Qiso_2022)
9533               ? CHARSET_ISO_FINAL (charset) < 0
9534               : EQ (coding_type, Qemacs_mule)
9535               ? CHARSET_EMACS_MULE_ID (charset) < 0
9536               : 0)
9537             error ("Can't handle charset `%s'",
9538                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9539
9540           XSETCAR (tail, make_number (charset->id));
9541           if (max_charset_id < charset->id)
9542             max_charset_id = charset->id;
9543         }
9544     }
9545   ASET (attrs, coding_attr_charset_list, charset_list);
9546
9547   safe_charsets = make_uninit_string (max_charset_id + 1);
9548   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9549   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9550     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9551   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9552
9553   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9554
9555   val = args[coding_arg_decode_translation_table];
9556   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9557     CHECK_SYMBOL (val);
9558   ASET (attrs, coding_attr_decode_tbl, val);
9559
9560   val = args[coding_arg_encode_translation_table];
9561   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9562     CHECK_SYMBOL (val);
9563   ASET (attrs, coding_attr_encode_tbl, val);
9564
9565   val = args[coding_arg_post_read_conversion];
9566   CHECK_SYMBOL (val);
9567   ASET (attrs, coding_attr_post_read, val);
9568
9569   val = args[coding_arg_pre_write_conversion];
9570   CHECK_SYMBOL (val);
9571   ASET (attrs, coding_attr_pre_write, val);
9572
9573   val = args[coding_arg_default_char];
9574   if (NILP (val))
9575     ASET (attrs, coding_attr_default_char, make_number (' '));
9576   else
9577     {
9578       CHECK_CHARACTER (val);
9579       ASET (attrs, coding_attr_default_char, val);
9580     }
9581
9582   val = args[coding_arg_for_unibyte];
9583   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9584
9585   val = args[coding_arg_plist];
9586   CHECK_LIST (val);
9587   ASET (attrs, coding_attr_plist, val);
9588
9589   if (EQ (coding_type, Qcharset))
9590     {
9591       /* Generate a lisp vector of 256 elements.  Each element is nil,
9592          integer, or a list of charset IDs.
9593
9594          If Nth element is nil, the byte code N is invalid in this
9595          coding system.
9596
9597          If Nth element is a number NUM, N is the first byte of a
9598          charset whose ID is NUM.
9599
9600          If Nth element is a list of charset IDs, N is the first byte
9601          of one of them.  The list is sorted by dimensions of the
9602          charsets.  A charset of smaller dimension comes first. */
9603       val = Fmake_vector (make_number (256), Qnil);
9604
9605       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9606         {
9607           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9608           int dim = CHARSET_DIMENSION (charset);
9609           int idx = (dim - 1) * 4;
9610
9611           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9612             ASET (attrs, coding_attr_ascii_compat, Qt);
9613
9614           for (i = charset->code_space[idx];
9615                i <= charset->code_space[idx + 1]; i++)
9616             {
9617               Lisp_Object tmp, tmp2;
9618               int dim2;
9619
9620               tmp = AREF (val, i);
9621               if (NILP (tmp))
9622                 tmp = XCAR (tail);
9623               else if (NUMBERP (tmp))
9624                 {
9625                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9626                   if (dim < dim2)
9627                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9628                   else
9629                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9630                 }
9631               else
9632                 {
9633                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9634                     {
9635                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9636                       if (dim < dim2)
9637                         break;
9638                     }
9639                   if (NILP (tmp2))
9640                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9641                   else
9642                     {
9643                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9644                       XSETCAR (tmp2, XCAR (tail));
9645                     }
9646                 }
9647               ASET (val, i, tmp);
9648             }
9649         }
9650       ASET (attrs, coding_attr_charset_valids, val);
9651       category = coding_category_charset;
9652     }
9653   else if (EQ (coding_type, Qccl))
9654     {
9655       Lisp_Object valids;
9656
9657       if (nargs < coding_arg_ccl_max)
9658         goto short_args;
9659
9660       val = args[coding_arg_ccl_decoder];
9661       CHECK_CCL_PROGRAM (val);
9662       if (VECTORP (val))
9663         val = Fcopy_sequence (val);
9664       ASET (attrs, coding_attr_ccl_decoder, val);
9665
9666       val = args[coding_arg_ccl_encoder];
9667       CHECK_CCL_PROGRAM (val);
9668       if (VECTORP (val))
9669         val = Fcopy_sequence (val);
9670       ASET (attrs, coding_attr_ccl_encoder, val);
9671
9672       val = args[coding_arg_ccl_valids];
9673       valids = Fmake_string (make_number (256), make_number (0));
9674       for (tail = val; CONSP (tail); tail = XCDR (tail))
9675         {
9676           int from, to;
9677
9678           val = XCAR (tail);
9679           if (INTEGERP (val))
9680             {
9681               if (! (0 <= XINT (val) && XINT (val) <= 255))
9682                 args_out_of_range_3 (val, make_number (0), make_number (255));
9683               from = to = XINT (val);
9684             }
9685           else
9686             {
9687               CHECK_CONS (val);
9688               CHECK_NATNUM_CAR (val);
9689               CHECK_NUMBER_CDR (val);
9690               if (XINT (XCAR (val)) > 255)
9691                 args_out_of_range_3 (XCAR (val),
9692                                      make_number (0), make_number (255));
9693               from = XINT (XCAR (val));
9694               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9695                 args_out_of_range_3 (XCDR (val),
9696                                      XCAR (val), make_number (255));
9697               to = XINT (XCDR (val));
9698             }
9699           for (i = from; i <= to; i++)
9700             SSET (valids, i, 1);
9701         }
9702       ASET (attrs, coding_attr_ccl_valids, valids);
9703
9704       category = coding_category_ccl;
9705     }
9706   else if (EQ (coding_type, Qutf_16))
9707     {
9708       Lisp_Object bom, endian;
9709
9710       ASET (attrs, coding_attr_ascii_compat, Qnil);
9711
9712       if (nargs < coding_arg_utf16_max)
9713         goto short_args;
9714
9715       bom = args[coding_arg_utf16_bom];
9716       if (! NILP (bom) && ! EQ (bom, Qt))
9717         {
9718           CHECK_CONS (bom);
9719           val = XCAR (bom);
9720           CHECK_CODING_SYSTEM (val);
9721           val = XCDR (bom);
9722           CHECK_CODING_SYSTEM (val);
9723         }
9724       ASET (attrs, coding_attr_utf_bom, bom);
9725
9726       endian = args[coding_arg_utf16_endian];
9727       CHECK_SYMBOL (endian);
9728       if (NILP (endian))
9729         endian = Qbig;
9730       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9731         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9732       ASET (attrs, coding_attr_utf_16_endian, endian);
9733
9734       category = (CONSP (bom)
9735                   ? coding_category_utf_16_auto
9736                   : NILP (bom)
9737                   ? (EQ (endian, Qbig)
9738                      ? coding_category_utf_16_be_nosig
9739                      : coding_category_utf_16_le_nosig)
9740                   : (EQ (endian, Qbig)
9741                      ? coding_category_utf_16_be
9742                      : coding_category_utf_16_le));
9743     }
9744   else if (EQ (coding_type, Qiso_2022))
9745     {
9746       Lisp_Object initial, reg_usage, request, flags;
9747
9748       if (nargs < coding_arg_iso2022_max)
9749         goto short_args;
9750
9751       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9752       CHECK_VECTOR (initial);
9753       for (i = 0; i < 4; i++)
9754         {
9755           val = Faref (initial, make_number (i));
9756           if (! NILP (val))
9757             {
9758               struct charset *charset;
9759
9760               CHECK_CHARSET_GET_CHARSET (val, charset);
9761               ASET (initial, i, make_number (CHARSET_ID (charset)));
9762               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9763                 ASET (attrs, coding_attr_ascii_compat, Qt);
9764             }
9765           else
9766             ASET (initial, i, make_number (-1));
9767         }
9768
9769       reg_usage = args[coding_arg_iso2022_reg_usage];
9770       CHECK_CONS (reg_usage);
9771       CHECK_NUMBER_CAR (reg_usage);
9772       CHECK_NUMBER_CDR (reg_usage);
9773
9774       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9775       for (tail = request; CONSP (tail); tail = XCDR (tail))
9776         {
9777           int id;
9778           Lisp_Object tmp1;
9779
9780           val = XCAR (tail);
9781           CHECK_CONS (val);
9782           tmp1 = XCAR (val);
9783           CHECK_CHARSET_GET_ID (tmp1, id);
9784           CHECK_NATNUM_CDR (val);
9785           if (XINT (XCDR (val)) >= 4)
9786             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9787           XSETCAR (val, make_number (id));
9788         }
9789
9790       flags = args[coding_arg_iso2022_flags];
9791       CHECK_NATNUM (flags);
9792       i = XINT (flags) & INT_MAX;
9793       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9794         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9795       flags = make_number (i);
9796
9797       ASET (attrs, coding_attr_iso_initial, initial);
9798       ASET (attrs, coding_attr_iso_usage, reg_usage);
9799       ASET (attrs, coding_attr_iso_request, request);
9800       ASET (attrs, coding_attr_iso_flags, flags);
9801       setup_iso_safe_charsets (attrs);
9802
9803       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9804         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9805                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9806                     ? coding_category_iso_7_else
9807                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9808                     ? coding_category_iso_7
9809                     : coding_category_iso_7_tight);
9810       else
9811         {
9812           int id = XINT (AREF (initial, 1));
9813
9814           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9815                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9816                        || id < 0)
9817                       ? coding_category_iso_8_else
9818                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9819                       ? coding_category_iso_8_1
9820                       : coding_category_iso_8_2);
9821         }
9822       if (category != coding_category_iso_8_1
9823           && category != coding_category_iso_8_2)
9824         ASET (attrs, coding_attr_ascii_compat, Qnil);
9825     }
9826   else if (EQ (coding_type, Qemacs_mule))
9827     {
9828       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9829         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9830       ASET (attrs, coding_attr_ascii_compat, Qt);
9831       category = coding_category_emacs_mule;
9832     }
9833   else if (EQ (coding_type, Qshift_jis))
9834     {
9835
9836       struct charset *charset;
9837
9838       if (XINT (Flength (charset_list)) != 3
9839           && XINT (Flength (charset_list)) != 4)
9840         error ("There should be three or four charsets");
9841
9842       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9843       if (CHARSET_DIMENSION (charset) != 1)
9844         error ("Dimension of charset %s is not one",
9845                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9846       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9847         ASET (attrs, coding_attr_ascii_compat, Qt);
9848
9849       charset_list = XCDR (charset_list);
9850       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9851       if (CHARSET_DIMENSION (charset) != 1)
9852         error ("Dimension of charset %s is not one",
9853                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9854
9855       charset_list = XCDR (charset_list);
9856       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9857       if (CHARSET_DIMENSION (charset) != 2)
9858         error ("Dimension of charset %s is not two",
9859                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9860
9861       charset_list = XCDR (charset_list);
9862       if (! NILP (charset_list))
9863         {
9864           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9865           if (CHARSET_DIMENSION (charset) != 2)
9866             error ("Dimension of charset %s is not two",
9867                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9868         }
9869
9870       category = coding_category_sjis;
9871       Vsjis_coding_system = name;
9872     }
9873   else if (EQ (coding_type, Qbig5))
9874     {
9875       struct charset *charset;
9876
9877       if (XINT (Flength (charset_list)) != 2)
9878         error ("There should be just two charsets");
9879
9880       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9881       if (CHARSET_DIMENSION (charset) != 1)
9882         error ("Dimension of charset %s is not one",
9883                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9884       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9885         ASET (attrs, coding_attr_ascii_compat, Qt);
9886
9887       charset_list = XCDR (charset_list);
9888       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9889       if (CHARSET_DIMENSION (charset) != 2)
9890         error ("Dimension of charset %s is not two",
9891                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9892
9893       category = coding_category_big5;
9894       Vbig5_coding_system = name;
9895     }
9896   else if (EQ (coding_type, Qraw_text))
9897     {
9898       category = coding_category_raw_text;
9899       ASET (attrs, coding_attr_ascii_compat, Qt);
9900     }
9901   else if (EQ (coding_type, Qutf_8))
9902     {
9903       Lisp_Object bom;
9904
9905       if (nargs < coding_arg_utf8_max)
9906         goto short_args;
9907
9908       bom = args[coding_arg_utf8_bom];
9909       if (! NILP (bom) && ! EQ (bom, Qt))
9910         {
9911           CHECK_CONS (bom);
9912           val = XCAR (bom);
9913           CHECK_CODING_SYSTEM (val);
9914           val = XCDR (bom);
9915           CHECK_CODING_SYSTEM (val);
9916         }
9917       ASET (attrs, coding_attr_utf_bom, bom);
9918       if (NILP (bom))
9919         ASET (attrs, coding_attr_ascii_compat, Qt);
9920
9921       category = (CONSP (bom) ? coding_category_utf_8_auto
9922                   : NILP (bom) ? coding_category_utf_8_nosig
9923                   : coding_category_utf_8_sig);
9924     }
9925   else if (EQ (coding_type, Qundecided))
9926     category = coding_category_undecided;
9927   else
9928     error ("Invalid coding system type: %s",
9929            SDATA (SYMBOL_NAME (coding_type)));
9930
9931   ASET (attrs, coding_attr_category, make_number (category));
9932   ASET (attrs, coding_attr_plist,
9933         Fcons (QCcategory,
9934                Fcons (AREF (Vcoding_category_table, category),
9935                       CODING_ATTR_PLIST (attrs))));
9936   ASET (attrs, coding_attr_plist,
9937         Fcons (QCascii_compatible_p,
9938                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9939                       CODING_ATTR_PLIST (attrs))));
9940
9941   eol_type = args[coding_arg_eol_type];
9942   if (! NILP (eol_type)
9943       && ! EQ (eol_type, Qunix)
9944       && ! EQ (eol_type, Qdos)
9945       && ! EQ (eol_type, Qmac))
9946     error ("Invalid eol-type");
9947
9948   aliases = Fcons (name, Qnil);
9949
9950   if (NILP (eol_type))
9951     {
9952       eol_type = make_subsidiaries (name);
9953       for (i = 0; i < 3; i++)
9954         {
9955           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
9956
9957           this_name = AREF (eol_type, i);
9958           this_aliases = Fcons (this_name, Qnil);
9959           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
9960           this_spec = Fmake_vector (make_number (3), attrs);
9961           ASET (this_spec, 1, this_aliases);
9962           ASET (this_spec, 2, this_eol_type);
9963           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
9964           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
9965           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
9966           if (NILP (val))
9967             Vcoding_system_alist
9968               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
9969                        Vcoding_system_alist);
9970         }
9971     }
9972
9973   spec_vec = Fmake_vector (make_number (3), attrs);
9974   ASET (spec_vec, 1, aliases);
9975   ASET (spec_vec, 2, eol_type);
9976
9977   Fputhash (name, spec_vec, Vcoding_system_hash_table);
9978   Vcoding_system_list = Fcons (name, Vcoding_system_list);
9979   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
9980   if (NILP (val))
9981     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9982                                   Vcoding_system_alist);
9983
9984   {
9985     int id = coding_categories[category].id;
9986
9987     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9988       setup_coding_system (name, &coding_categories[category]);
9989   }
9990
9991   return Qnil;
9992
9993  short_args:
9994   return Fsignal (Qwrong_number_of_arguments,
9995                   Fcons (intern ("define-coding-system-internal"),
9996                          make_number (nargs)));
9997 }
9998
9999
10000 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10001        3, 3, 0,
10002        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10003   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10004 {
10005   Lisp_Object spec, attrs;
10006
10007   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10008   attrs = AREF (spec, 0);
10009   if (EQ (prop, QCmnemonic))
10010     {
10011       if (! STRINGP (val))
10012         CHECK_CHARACTER (val);
10013       ASET (attrs, coding_attr_mnemonic, val);
10014     }
10015   else if (EQ (prop, QCdefault_char))
10016     {
10017       if (NILP (val))
10018         val = make_number (' ');
10019       else
10020         CHECK_CHARACTER (val);
10021       ASET (attrs, coding_attr_default_char, val);
10022     }
10023   else if (EQ (prop, QCdecode_translation_table))
10024     {
10025       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10026         CHECK_SYMBOL (val);
10027       ASET (attrs, coding_attr_decode_tbl, val);
10028     }
10029   else if (EQ (prop, QCencode_translation_table))
10030     {
10031       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10032         CHECK_SYMBOL (val);
10033       ASET (attrs, coding_attr_encode_tbl, val);
10034     }
10035   else if (EQ (prop, QCpost_read_conversion))
10036     {
10037       CHECK_SYMBOL (val);
10038       ASET (attrs, coding_attr_post_read, val);
10039     }
10040   else if (EQ (prop, QCpre_write_conversion))
10041     {
10042       CHECK_SYMBOL (val);
10043       ASET (attrs, coding_attr_pre_write, val);
10044     }
10045   else if (EQ (prop, QCascii_compatible_p))
10046     {
10047       ASET (attrs, coding_attr_ascii_compat, val);
10048     }
10049
10050   ASET (attrs, coding_attr_plist,
10051         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10052   return val;
10053 }
10054
10055
10056 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10057        Sdefine_coding_system_alias, 2, 2, 0,
10058        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10059   (Lisp_Object alias, Lisp_Object coding_system)
10060 {
10061   Lisp_Object spec, aliases, eol_type, val;
10062
10063   CHECK_SYMBOL (alias);
10064   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10065   aliases = AREF (spec, 1);
10066   /* ALIASES should be a list of length more than zero, and the first
10067      element is a base coding system.  Append ALIAS at the tail of the
10068      list.  */
10069   while (!NILP (XCDR (aliases)))
10070     aliases = XCDR (aliases);
10071   XSETCDR (aliases, Fcons (alias, Qnil));
10072
10073   eol_type = AREF (spec, 2);
10074   if (VECTORP (eol_type))
10075     {
10076       Lisp_Object subsidiaries;
10077       int i;
10078
10079       subsidiaries = make_subsidiaries (alias);
10080       for (i = 0; i < 3; i++)
10081         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10082                                      AREF (eol_type, i));
10083     }
10084
10085   Fputhash (alias, spec, Vcoding_system_hash_table);
10086   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10087   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10088   if (NILP (val))
10089     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10090                                   Vcoding_system_alist);
10091
10092   return Qnil;
10093 }
10094
10095 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10096        1, 1, 0,
10097        doc: /* Return the base of CODING-SYSTEM.
10098 Any alias or subsidiary coding system is not a base coding system.  */)
10099   (Lisp_Object coding_system)
10100 {
10101   Lisp_Object spec, attrs;
10102
10103   if (NILP (coding_system))
10104     return (Qno_conversion);
10105   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10106   attrs = AREF (spec, 0);
10107   return CODING_ATTR_BASE_NAME (attrs);
10108 }
10109
10110 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10111        1, 1, 0,
10112        doc: "Return the property list of CODING-SYSTEM.")
10113   (Lisp_Object coding_system)
10114 {
10115   Lisp_Object spec, attrs;
10116
10117   if (NILP (coding_system))
10118     coding_system = Qno_conversion;
10119   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10120   attrs = AREF (spec, 0);
10121   return CODING_ATTR_PLIST (attrs);
10122 }
10123
10124
10125 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10126        1, 1, 0,
10127        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10128   (Lisp_Object coding_system)
10129 {
10130   Lisp_Object spec;
10131
10132   if (NILP (coding_system))
10133     coding_system = Qno_conversion;
10134   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10135   return AREF (spec, 1);
10136 }
10137
10138 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10139        Scoding_system_eol_type, 1, 1, 0,
10140        doc: /* Return eol-type of CODING-SYSTEM.
10141 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10142
10143 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10144 and CR respectively.
10145
10146 A vector value indicates that a format of end-of-line should be
10147 detected automatically.  Nth element of the vector is the subsidiary
10148 coding system whose eol-type is N.  */)
10149   (Lisp_Object coding_system)
10150 {
10151   Lisp_Object spec, eol_type;
10152   int n;
10153
10154   if (NILP (coding_system))
10155     coding_system = Qno_conversion;
10156   if (! CODING_SYSTEM_P (coding_system))
10157     return Qnil;
10158   spec = CODING_SYSTEM_SPEC (coding_system);
10159   eol_type = AREF (spec, 2);
10160   if (VECTORP (eol_type))
10161     return Fcopy_sequence (eol_type);
10162   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10163   return make_number (n);
10164 }
10165
10166 #endif /* emacs */
10167
10168 \f
10169 /*** 12. Postamble ***/
10170
10171 void
10172 init_coding_once (void)
10173 {
10174   int i;
10175
10176   for (i = 0; i < coding_category_max; i++)
10177     {
10178       coding_categories[i].id = -1;
10179       coding_priorities[i] = i;
10180     }
10181
10182   /* ISO2022 specific initialize routine.  */
10183   for (i = 0; i < 0x20; i++)
10184     iso_code_class[i] = ISO_control_0;
10185   for (i = 0x21; i < 0x7F; i++)
10186     iso_code_class[i] = ISO_graphic_plane_0;
10187   for (i = 0x80; i < 0xA0; i++)
10188     iso_code_class[i] = ISO_control_1;
10189   for (i = 0xA1; i < 0xFF; i++)
10190     iso_code_class[i] = ISO_graphic_plane_1;
10191   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10192   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10193   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10194   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10195   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10196   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10197   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10198   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10199   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10200
10201   for (i = 0; i < 256; i++)
10202     {
10203       emacs_mule_bytes[i] = 1;
10204     }
10205   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10206   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10207   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10208   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10209 }
10210
10211 #ifdef emacs
10212
10213 void
10214 syms_of_coding (void)
10215 {
10216   staticpro (&Vcoding_system_hash_table);
10217   {
10218     Lisp_Object args[2];
10219     args[0] = QCtest;
10220     args[1] = Qeq;
10221     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10222   }
10223
10224   staticpro (&Vsjis_coding_system);
10225   Vsjis_coding_system = Qnil;
10226
10227   staticpro (&Vbig5_coding_system);
10228   Vbig5_coding_system = Qnil;
10229
10230   staticpro (&Vcode_conversion_reused_workbuf);
10231   Vcode_conversion_reused_workbuf = Qnil;
10232
10233   staticpro (&Vcode_conversion_workbuf_name);
10234   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10235
10236   reused_workbuf_in_use = 0;
10237
10238   DEFSYM (Qcharset, "charset");
10239   DEFSYM (Qtarget_idx, "target-idx");
10240   DEFSYM (Qcoding_system_history, "coding-system-history");
10241   Fset (Qcoding_system_history, Qnil);
10242
10243   /* Target FILENAME is the first argument.  */
10244   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10245   /* Target FILENAME is the third argument.  */
10246   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10247
10248   DEFSYM (Qcall_process, "call-process");
10249   /* Target PROGRAM is the first argument.  */
10250   Fput (Qcall_process, Qtarget_idx, make_number (0));
10251
10252   DEFSYM (Qcall_process_region, "call-process-region");
10253   /* Target PROGRAM is the third argument.  */
10254   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10255
10256   DEFSYM (Qstart_process, "start-process");
10257   /* Target PROGRAM is the third argument.  */
10258   Fput (Qstart_process, Qtarget_idx, make_number (2));
10259
10260   DEFSYM (Qopen_network_stream, "open-network-stream");
10261   /* Target SERVICE is the fourth argument.  */
10262   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10263
10264   DEFSYM (Qcoding_system, "coding-system");
10265   DEFSYM (Qcoding_aliases, "coding-aliases");
10266
10267   DEFSYM (Qeol_type, "eol-type");
10268   DEFSYM (Qunix, "unix");
10269   DEFSYM (Qdos, "dos");
10270
10271   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10272   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10273   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10274   DEFSYM (Qdefault_char, "default-char");
10275   DEFSYM (Qundecided, "undecided");
10276   DEFSYM (Qno_conversion, "no-conversion");
10277   DEFSYM (Qraw_text, "raw-text");
10278
10279   DEFSYM (Qiso_2022, "iso-2022");
10280
10281   DEFSYM (Qutf_8, "utf-8");
10282   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10283
10284   DEFSYM (Qutf_16, "utf-16");
10285   DEFSYM (Qbig, "big");
10286   DEFSYM (Qlittle, "little");
10287
10288   DEFSYM (Qshift_jis, "shift-jis");
10289   DEFSYM (Qbig5, "big5");
10290
10291   DEFSYM (Qcoding_system_p, "coding-system-p");
10292
10293   DEFSYM (Qcoding_system_error, "coding-system-error");
10294   Fput (Qcoding_system_error, Qerror_conditions,
10295         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10296   Fput (Qcoding_system_error, Qerror_message,
10297         build_pure_c_string ("Invalid coding system"));
10298
10299   /* Intern this now in case it isn't already done.
10300      Setting this variable twice is harmless.
10301      But don't staticpro it here--that is done in alloc.c.  */
10302   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10303
10304   DEFSYM (Qtranslation_table, "translation-table");
10305   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10306   DEFSYM (Qtranslation_table_id, "translation-table-id");
10307   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10308   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10309
10310   DEFSYM (Qvalid_codes, "valid-codes");
10311
10312   DEFSYM (Qemacs_mule, "emacs-mule");
10313
10314   DEFSYM (QCcategory, ":category");
10315   DEFSYM (QCmnemonic, ":mnemonic");
10316   DEFSYM (QCdefault_char, ":default-char");
10317   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10318   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10319   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10320   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10321   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10322
10323   Vcoding_category_table
10324     = Fmake_vector (make_number (coding_category_max), Qnil);
10325   staticpro (&Vcoding_category_table);
10326   /* Followings are target of code detection.  */
10327   ASET (Vcoding_category_table, coding_category_iso_7,
10328         intern_c_string ("coding-category-iso-7"));
10329   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10330         intern_c_string ("coding-category-iso-7-tight"));
10331   ASET (Vcoding_category_table, coding_category_iso_8_1,
10332         intern_c_string ("coding-category-iso-8-1"));
10333   ASET (Vcoding_category_table, coding_category_iso_8_2,
10334         intern_c_string ("coding-category-iso-8-2"));
10335   ASET (Vcoding_category_table, coding_category_iso_7_else,
10336         intern_c_string ("coding-category-iso-7-else"));
10337   ASET (Vcoding_category_table, coding_category_iso_8_else,
10338         intern_c_string ("coding-category-iso-8-else"));
10339   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10340         intern_c_string ("coding-category-utf-8-auto"));
10341   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10342         intern_c_string ("coding-category-utf-8"));
10343   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10344         intern_c_string ("coding-category-utf-8-sig"));
10345   ASET (Vcoding_category_table, coding_category_utf_16_be,
10346         intern_c_string ("coding-category-utf-16-be"));
10347   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10348         intern_c_string ("coding-category-utf-16-auto"));
10349   ASET (Vcoding_category_table, coding_category_utf_16_le,
10350         intern_c_string ("coding-category-utf-16-le"));
10351   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10352         intern_c_string ("coding-category-utf-16-be-nosig"));
10353   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10354         intern_c_string ("coding-category-utf-16-le-nosig"));
10355   ASET (Vcoding_category_table, coding_category_charset,
10356         intern_c_string ("coding-category-charset"));
10357   ASET (Vcoding_category_table, coding_category_sjis,
10358         intern_c_string ("coding-category-sjis"));
10359   ASET (Vcoding_category_table, coding_category_big5,
10360         intern_c_string ("coding-category-big5"));
10361   ASET (Vcoding_category_table, coding_category_ccl,
10362         intern_c_string ("coding-category-ccl"));
10363   ASET (Vcoding_category_table, coding_category_emacs_mule,
10364         intern_c_string ("coding-category-emacs-mule"));
10365   /* Followings are NOT target of code detection.  */
10366   ASET (Vcoding_category_table, coding_category_raw_text,
10367         intern_c_string ("coding-category-raw-text"));
10368   ASET (Vcoding_category_table, coding_category_undecided,
10369         intern_c_string ("coding-category-undecided"));
10370
10371   DEFSYM (Qinsufficient_source, "insufficient-source");
10372   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10373   DEFSYM (Qinvalid_source, "invalid-source");
10374   DEFSYM (Qinterrupted, "interrupted");
10375   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10376   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10377
10378   defsubr (&Scoding_system_p);
10379   defsubr (&Sread_coding_system);
10380   defsubr (&Sread_non_nil_coding_system);
10381   defsubr (&Scheck_coding_system);
10382   defsubr (&Sdetect_coding_region);
10383   defsubr (&Sdetect_coding_string);
10384   defsubr (&Sfind_coding_systems_region_internal);
10385   defsubr (&Sunencodable_char_position);
10386   defsubr (&Scheck_coding_systems_region);
10387   defsubr (&Sdecode_coding_region);
10388   defsubr (&Sencode_coding_region);
10389   defsubr (&Sdecode_coding_string);
10390   defsubr (&Sencode_coding_string);
10391   defsubr (&Sdecode_sjis_char);
10392   defsubr (&Sencode_sjis_char);
10393   defsubr (&Sdecode_big5_char);
10394   defsubr (&Sencode_big5_char);
10395   defsubr (&Sset_terminal_coding_system_internal);
10396   defsubr (&Sset_safe_terminal_coding_system_internal);
10397   defsubr (&Sterminal_coding_system);
10398   defsubr (&Sset_keyboard_coding_system_internal);
10399   defsubr (&Skeyboard_coding_system);
10400   defsubr (&Sfind_operation_coding_system);
10401   defsubr (&Sset_coding_system_priority);
10402   defsubr (&Sdefine_coding_system_internal);
10403   defsubr (&Sdefine_coding_system_alias);
10404   defsubr (&Scoding_system_put);
10405   defsubr (&Scoding_system_base);
10406   defsubr (&Scoding_system_plist);
10407   defsubr (&Scoding_system_aliases);
10408   defsubr (&Scoding_system_eol_type);
10409   defsubr (&Scoding_system_priority_list);
10410
10411   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10412                doc: /* List of coding systems.
10413
10414 Do not alter the value of this variable manually.  This variable should be
10415 updated by the functions `define-coding-system' and
10416 `define-coding-system-alias'.  */);
10417   Vcoding_system_list = Qnil;
10418
10419   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10420                doc: /* Alist of coding system names.
10421 Each element is one element list of coding system name.
10422 This variable is given to `completing-read' as COLLECTION argument.
10423
10424 Do not alter the value of this variable manually.  This variable should be
10425 updated by the functions `make-coding-system' and
10426 `define-coding-system-alias'.  */);
10427   Vcoding_system_alist = Qnil;
10428
10429   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10430                doc: /* List of coding-categories (symbols) ordered by priority.
10431
10432 On detecting a coding system, Emacs tries code detection algorithms
10433 associated with each coding-category one by one in this order.  When
10434 one algorithm agrees with a byte sequence of source text, the coding
10435 system bound to the corresponding coding-category is selected.
10436
10437 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10438   {
10439     int i;
10440
10441     Vcoding_category_list = Qnil;
10442     for (i = coding_category_max - 1; i >= 0; i--)
10443       Vcoding_category_list
10444         = Fcons (AREF (Vcoding_category_table, i),
10445                  Vcoding_category_list);
10446   }
10447
10448   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10449                doc: /* Specify the coding system for read operations.
10450 It is useful to bind this variable with `let', but do not set it globally.
10451 If the value is a coding system, it is used for decoding on read operation.
10452 If not, an appropriate element is used from one of the coding system alists.
10453 There are three such tables: `file-coding-system-alist',
10454 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10455   Vcoding_system_for_read = Qnil;
10456
10457   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10458                doc: /* Specify the coding system for write operations.
10459 Programs bind this variable with `let', but you should not set it globally.
10460 If the value is a coding system, it is used for encoding of output,
10461 when writing it to a file and when sending it to a file or subprocess.
10462
10463 If this does not specify a coding system, an appropriate element
10464 is used from one of the coding system alists.
10465 There are three such tables: `file-coding-system-alist',
10466 `process-coding-system-alist', and `network-coding-system-alist'.
10467 For output to files, if the above procedure does not specify a coding system,
10468 the value of `buffer-file-coding-system' is used.  */);
10469   Vcoding_system_for_write = Qnil;
10470
10471   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10472                doc: /*
10473 Coding system used in the latest file or process I/O.  */);
10474   Vlast_coding_system_used = Qnil;
10475
10476   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10477                doc: /*
10478 Error status of the last code conversion.
10479
10480 When an error was detected in the last code conversion, this variable
10481 is set to one of the following symbols.
10482   `insufficient-source'
10483   `inconsistent-eol'
10484   `invalid-source'
10485   `interrupted'
10486   `insufficient-memory'
10487 When no error was detected, the value doesn't change.  So, to check
10488 the error status of a code conversion by this variable, you must
10489 explicitly set this variable to nil before performing code
10490 conversion.  */);
10491   Vlast_code_conversion_error = Qnil;
10492
10493   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10494                doc: /*
10495 *Non-nil means always inhibit code conversion of end-of-line format.
10496 See info node `Coding Systems' and info node `Text and Binary' concerning
10497 such conversion.  */);
10498   inhibit_eol_conversion = 0;
10499
10500   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10501                doc: /*
10502 Non-nil means process buffer inherits coding system of process output.
10503 Bind it to t if the process output is to be treated as if it were a file
10504 read from some filesystem.  */);
10505   inherit_process_coding_system = 0;
10506
10507   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10508                doc: /*
10509 Alist to decide a coding system to use for a file I/O operation.
10510 The format is ((PATTERN . VAL) ...),
10511 where PATTERN is a regular expression matching a file name,
10512 VAL is a coding system, a cons of coding systems, or a function symbol.
10513 If VAL is a coding system, it is used for both decoding and encoding
10514 the file contents.
10515 If VAL is a cons of coding systems, the car part is used for decoding,
10516 and the cdr part is used for encoding.
10517 If VAL is a function symbol, the function must return a coding system
10518 or a cons of coding systems which are used as above.  The function is
10519 called with an argument that is a list of the arguments with which
10520 `find-operation-coding-system' was called.  If the function can't decide
10521 a coding system, it can return `undecided' so that the normal
10522 code-detection is performed.
10523
10524 See also the function `find-operation-coding-system'
10525 and the variable `auto-coding-alist'.  */);
10526   Vfile_coding_system_alist = Qnil;
10527
10528   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10529                doc: /*
10530 Alist to decide a coding system to use for a process I/O operation.
10531 The format is ((PATTERN . VAL) ...),
10532 where PATTERN is a regular expression matching a program name,
10533 VAL is a coding system, a cons of coding systems, or a function symbol.
10534 If VAL is a coding system, it is used for both decoding what received
10535 from the program and encoding what sent to the program.
10536 If VAL is a cons of coding systems, the car part is used for decoding,
10537 and the cdr part is used for encoding.
10538 If VAL is a function symbol, the function must return a coding system
10539 or a cons of coding systems which are used as above.
10540
10541 See also the function `find-operation-coding-system'.  */);
10542   Vprocess_coding_system_alist = Qnil;
10543
10544   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10545                doc: /*
10546 Alist to decide a coding system to use for a network I/O operation.
10547 The format is ((PATTERN . VAL) ...),
10548 where PATTERN is a regular expression matching a network service name
10549 or is a port number to connect to,
10550 VAL is a coding system, a cons of coding systems, or a function symbol.
10551 If VAL is a coding system, it is used for both decoding what received
10552 from the network stream and encoding what sent to the network stream.
10553 If VAL is a cons of coding systems, the car part is used for decoding,
10554 and the cdr part is used for encoding.
10555 If VAL is a function symbol, the function must return a coding system
10556 or a cons of coding systems which are used as above.
10557
10558 See also the function `find-operation-coding-system'.  */);
10559   Vnetwork_coding_system_alist = Qnil;
10560
10561   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10562                doc: /* Coding system to use with system messages.
10563 Also used for decoding keyboard input on X Window system.  */);
10564   Vlocale_coding_system = Qnil;
10565
10566   /* The eol mnemonics are reset in startup.el system-dependently.  */
10567   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10568                doc: /*
10569 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10570   eol_mnemonic_unix = build_pure_c_string (":");
10571
10572   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10573                doc: /*
10574 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10575   eol_mnemonic_dos = build_pure_c_string ("\\");
10576
10577   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10578                doc: /*
10579 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10580   eol_mnemonic_mac = build_pure_c_string ("/");
10581
10582   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10583                doc: /*
10584 *String displayed in mode line when end-of-line format is not yet determined.  */);
10585   eol_mnemonic_undecided = build_pure_c_string (":");
10586
10587   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10588                doc: /*
10589 *Non-nil enables character translation while encoding and decoding.  */);
10590   Venable_character_translation = Qt;
10591
10592   DEFVAR_LISP ("standard-translation-table-for-decode",
10593                Vstandard_translation_table_for_decode,
10594                doc: /* Table for translating characters while decoding.  */);
10595   Vstandard_translation_table_for_decode = Qnil;
10596
10597   DEFVAR_LISP ("standard-translation-table-for-encode",
10598                Vstandard_translation_table_for_encode,
10599                doc: /* Table for translating characters while encoding.  */);
10600   Vstandard_translation_table_for_encode = Qnil;
10601
10602   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10603                doc: /* Alist of charsets vs revision numbers.
10604 While encoding, if a charset (car part of an element) is found,
10605 designate it with the escape sequence identifying revision (cdr part
10606 of the element).  */);
10607   Vcharset_revision_table = Qnil;
10608
10609   DEFVAR_LISP ("default-process-coding-system",
10610                Vdefault_process_coding_system,
10611                doc: /* Cons of coding systems used for process I/O by default.
10612 The car part is used for decoding a process output,
10613 the cdr part is used for encoding a text to be sent to a process.  */);
10614   Vdefault_process_coding_system = Qnil;
10615
10616   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10617                doc: /*
10618 Table of extra Latin codes in the range 128..159 (inclusive).
10619 This is a vector of length 256.
10620 If Nth element is non-nil, the existence of code N in a file
10621 \(or output of subprocess) doesn't prevent it to be detected as
10622 a coding system of ISO 2022 variant which has a flag
10623 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10624 or reading output of a subprocess.
10625 Only 128th through 159th elements have a meaning.  */);
10626   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10627
10628   DEFVAR_LISP ("select-safe-coding-system-function",
10629                Vselect_safe_coding_system_function,
10630                doc: /*
10631 Function to call to select safe coding system for encoding a text.
10632
10633 If set, this function is called to force a user to select a proper
10634 coding system which can encode the text in the case that a default
10635 coding system used in each operation can't encode the text.  The
10636 function should take care that the buffer is not modified while
10637 the coding system is being selected.
10638
10639 The default value is `select-safe-coding-system' (which see).  */);
10640   Vselect_safe_coding_system_function = Qnil;
10641
10642   DEFVAR_BOOL ("coding-system-require-warning",
10643                coding_system_require_warning,
10644                doc: /* Internal use only.
10645 If non-nil, on writing a file, `select-safe-coding-system-function' is
10646 called even if `coding-system-for-write' is non-nil.  The command
10647 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10648   coding_system_require_warning = 0;
10649
10650
10651   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10652                inhibit_iso_escape_detection,
10653                doc: /*
10654 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10655
10656 When Emacs reads text, it tries to detect how the text is encoded.
10657 This code detection is sensitive to escape sequences.  If Emacs sees
10658 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10659 of the ISO2022 encodings, and decodes text by the corresponding coding
10660 system (e.g. `iso-2022-7bit').
10661
10662 However, there may be a case that you want to read escape sequences in
10663 a file as is.  In such a case, you can set this variable to non-nil.
10664 Then the code detection will ignore any escape sequences, and no text is
10665 detected as encoded in some ISO-2022 encoding.  The result is that all
10666 escape sequences become visible in a buffer.
10667
10668 The default value is nil, and it is strongly recommended not to change
10669 it.  That is because many Emacs Lisp source files that contain
10670 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10671 in Emacs's distribution, and they won't be decoded correctly on
10672 reading if you suppress escape sequence detection.
10673
10674 The other way to read escape sequences in a file without decoding is
10675 to explicitly specify some coding system that doesn't use ISO-2022
10676 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10677   inhibit_iso_escape_detection = 0;
10678
10679   DEFVAR_BOOL ("inhibit-null-byte-detection",
10680                inhibit_null_byte_detection,
10681                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10682 By default, Emacs treats it as binary data, and does not attempt to
10683 decode it.  The effect is as if you specified `no-conversion' for
10684 reading that text.
10685
10686 Set this to non-nil when a regular text happens to include null bytes.
10687 Examples are Index nodes of Info files and null-byte delimited output
10688 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10689 decode text as usual.  */);
10690   inhibit_null_byte_detection = 0;
10691
10692   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10693                doc: /* Char table for translating self-inserting characters.
10694 This is applied to the result of input methods, not their input.
10695 See also `keyboard-translate-table'.
10696
10697 Use of this variable for character code unification was rendered
10698 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10699 internal character representation.  */);
10700     Vtranslation_table_for_input = Qnil;
10701
10702   {
10703     Lisp_Object args[coding_arg_max];
10704     Lisp_Object plist[16];
10705     int i;
10706
10707     for (i = 0; i < coding_arg_max; i++)
10708       args[i] = Qnil;
10709
10710     plist[0] = intern_c_string (":name");
10711     plist[1] = args[coding_arg_name] = Qno_conversion;
10712     plist[2] = intern_c_string (":mnemonic");
10713     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10714     plist[4] = intern_c_string (":coding-type");
10715     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10716     plist[6] = intern_c_string (":ascii-compatible-p");
10717     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10718     plist[8] = intern_c_string (":default-char");
10719     plist[9] = args[coding_arg_default_char] = make_number (0);
10720     plist[10] = intern_c_string (":for-unibyte");
10721     plist[11] = args[coding_arg_for_unibyte] = Qt;
10722     plist[12] = intern_c_string (":docstring");
10723     plist[13] = build_pure_c_string ("Do no conversion.\n\
10724 \n\
10725 When you visit a file with this coding, the file is read into a\n\
10726 unibyte buffer as is, thus each byte of a file is treated as a\n\
10727 character.");
10728     plist[14] = intern_c_string (":eol-type");
10729     plist[15] = args[coding_arg_eol_type] = Qunix;
10730     args[coding_arg_plist] = Flist (16, plist);
10731     Fdefine_coding_system_internal (coding_arg_max, args);
10732
10733     plist[1] = args[coding_arg_name] = Qundecided;
10734     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10735     plist[5] = args[coding_arg_coding_type] = Qundecided;
10736     /* This is already set.
10737        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10738     plist[8] = intern_c_string (":charset-list");
10739     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10740     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10741     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10742     plist[15] = args[coding_arg_eol_type] = Qnil;
10743     args[coding_arg_plist] = Flist (16, plist);
10744     Fdefine_coding_system_internal (coding_arg_max, args);
10745   }
10746
10747   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10748
10749   {
10750     int i;
10751
10752     for (i = 0; i < coding_category_max; i++)
10753       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10754   }
10755 #if defined (DOS_NT)
10756   system_eol_type = Qdos;
10757 #else
10758   system_eol_type = Qunix;
10759 #endif
10760   staticpro (&system_eol_type);
10761 }
10762
10763 char *
10764 emacs_strerror (int error_number)
10765 {
10766   char *str;
10767
10768   synchronize_system_messages_locale ();
10769   str = strerror (error_number);
10770
10771   if (! NILP (Vlocale_coding_system))
10772     {
10773       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10774                                                       Vlocale_coding_system,
10775                                                       0);
10776       str = SSDATA (dec);
10777     }
10778
10779   return str;
10780 }
10781
10782 #endif /* emacs */