src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2013 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;  /* Category bits reported only if the whole source is accepted.  */
 164   ...;
 165   /* `...' stands for further declarations, e.g. C and the SRC_BASE that ONE_MORE_BYTE reads.  */
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171       /* Stop on a byte that is invalid for XXX; note a byte that strongly suggests XXX.  */
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216   /* C and CONSUMED_CHARS, which ONE_MORE_BYTE uses, must also be declared here.  */
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270   /* ADJUSTED_DST_END keeps _MAX_BYTES_PRODUCED_IN_LOOP_ bytes in reserve below DST_END.  */
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #include "lisp.h"
 290 #include "character.h"
 291 #include "buffer.h"
 292 #include "charset.h"
 293 #include "ccl.h"
 294 #include "composite.h"
 295 #include "coding.h"
 296 #include "window.h"
 297 #include "frame.h"
 298 #include "termhooks.h"
 299
 300 Lisp_Object Vcoding_system_hash_table;  /* NOTE(review): presumably maps a coding system symbol to its attribute vector -- confirm.  */
 301 /* Lisp objects declared without initializers; the non-static ones have external linkage.  */
 302 static Lisp_Object Qcoding_system, Qeol_type;
 303 static Lisp_Object Qcoding_aliases;
 304 Lisp_Object Qunix, Qdos;
 305 static Lisp_Object Qmac;
 306 Lisp_Object Qbuffer_file_coding_system;
 307 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 static Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qutf_8;
 311 static Lisp_Object Qiso_2022;
 312 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 313 static Lisp_Object Qbig, Qlittle;
 314 static Lisp_Object Qcoding_system_history;
 315 static Lisp_Object Qvalid_codes;
 316 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 317 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 static Lisp_Object QCascii_compatible_p;
 320
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 static Lisp_Object Qtarget_idx;
 324 /* NOTE(review): presumably symbols for conversion results such as CODING_RESULT_INSUFFICIENT_SRC -- confirm.  */
 325 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 326
 327 /* If a symbol has this property, evaluate the value to define the
 328    symbol as a coding system.  */
 329 static Lisp_Object Qcoding_system_define_form;
 330
 331 /* Format of end-of-line decided by system.  This is Qunix on
 332    Unix and Mac, Qdos on DOS/Windows.
 333    This has an effect only for external encoding (i.e. for output to
 334    file and process), not for in-buffer or Lisp string encoding.  */
 335 static Lisp_Object system_eol_type;
 336
 337 #ifdef emacs
 338 /* Everything down to the matching #endif is compiled only when `emacs' is defined.  */
 339 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 340
 341 /* Coding system emacs-mule and raw-text are for converting only
 342    end-of-line format.  */
 343 Lisp_Object Qemacs_mule, Qraw_text;
 344 Lisp_Object Qutf_8_emacs;
 345 /* Qutf_16le is declared only for WINDOWSNT and CYGWIN builds.  */
 346 #if defined (WINDOWSNT) || defined (CYGWIN)
 347 static Lisp_Object Qutf_16le;
 348 #endif
 349
 350 /* Coding-systems are handed between Emacs Lisp programs and C internal
 351    routines by variables; only safe_terminal_coding is defined here.  */
 352 /* Coding system to be used to encode text for terminal display when
 353    terminal coding system is nil.  */
 354 struct coding_system safe_terminal_coding;
 355
 356 #endif /* emacs */
 357 /* Translation-table symbols; the two `_for_' variants are private to this file.  */
 358 Lisp_Object Qtranslation_table;
 359 Lisp_Object Qtranslation_table_id;
 360 static Lisp_Object Qtranslation_table_for_decode;
 361 static Lisp_Object Qtranslation_table_for_encode;
 362
 363 /* Two special coding systems (presumably for Shift-JIS and BIG5, going by the names -- confirm).  */
 364 static Lisp_Object Vsjis_coding_system;
 365 static Lisp_Object Vbig5_coding_system;
 366
 367 /* ISO2022 section */
 368 /* Integer stored at index REG of CODING's coding_attr_iso_initial attribute.  */
 369 #define CODING_ISO_INITIAL(coding, reg)                 \
 370   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 371                      coding_attr_iso_initial),          \
 372                reg)))
 373 /* Value of CODING->safe_charsets[CHARSET_ID], or -1 when CHARSET_ID is above  */
 374 /* CODING->max_charset_id or the stored value is 255.  CHARSET_ID is evaluated more than once.  */
 375 #define CODING_ISO_REQUEST(coding, charset_id)          \
 376   (((charset_id) <= (coding)->max_charset_id            \
 377     ? ((coding)->safe_charsets[charset_id] != 255       \
 378        ? (coding)->safe_charsets[charset_id]            \
 379        : -1)                                            \
 380     : -1))
 381 /* Accessors for the members of CODING->spec.iso_2022.  CODING_ISO_INVOKED_CHARSET  */
 382 /* yields current_designation at the index held in current_invocation[PLANE].  */
 383 #define CODING_ISO_FLAGS(coding)        \
 384   ((coding)->spec.iso_2022.flags)
 385 #define CODING_ISO_DESIGNATION(coding, reg)     \
 386   ((coding)->spec.iso_2022.current_designation[reg])
 387 #define CODING_ISO_INVOCATION(coding, plane)    \
 388   ((coding)->spec.iso_2022.current_invocation[plane])
 389 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 390   ((coding)->spec.iso_2022.single_shifting)
 391 #define CODING_ISO_BOL(coding)  \
 392   ((coding)->spec.iso_2022.bol)
 393 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 394   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 395 #define CODING_ISO_CMP_STATUS(coding)   \
 396   (&(coding)->spec.iso_2022.cmp_status)
 397 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 398   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 399 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 400   ((coding)->spec.iso_2022.embedded_utf_8)
 401
 402 /* Control characters of ISO2022.  */
 403                         /* code */      /* function */
 404 #define ISO_CODE_SO     0x0E            /* shift-out */
 405 #define ISO_CODE_SI     0x0F            /* shift-in */
 406 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 407 #define ISO_CODE_ESC    0x1B            /* escape */
 408 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 409 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 410 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 411
 412 /* Every 1-byte code of ISO2022 is classified into one of the
 413    following classes.  */
 414 enum iso_code_class_type
 415   {
 416     ISO_control_0,              /* Control codes in the range
 417                                    0x00..0x1F and 0x7F, except for the
 418                                    following 4 codes.  */
 419     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 420     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 421     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 422     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 423     ISO_control_1,              /* Control codes in the range
 424                                    0x80..0x9F, except for the
 425                                    following 3 codes.  */
 426     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 427     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 428     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 429     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  NOTE(review): 0x7F is also listed under ISO_control_0.  */
 430     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 431     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 432     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 433   };
 434
 435 /** Each CODING_ISO_FLAG_XXX macro defines one flag bit of the
 436     `iso-flags' attribute of an iso2022 coding system.  */
 437
 438 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 439    instead of the correct short-form sequence (e.g. ESC $ A).  */
 440 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 441
 442 /* If set, reset graphic planes and registers at end-of-line to the
 443    initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 445
 446 /* If set, reset graphic planes and registers before any control
 447    characters to the initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 449
 450 /* If set, encode by 7-bit environment.  */
 451 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 452
 453 /* If set, use locking-shift function.  */
 454 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 455
 456 /* If set, use single-shift function.  Overwrite
 457    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 458 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 459
 460 /* If set, use designation escape sequence.  */
 461 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 462
 463 /* If set, produce revision number sequence.  */
 464 #define CODING_ISO_FLAG_REVISION        0x0080
 465
 466 /* If set, produce ISO6429's direction specifying sequence.  */
 467 #define CODING_ISO_FLAG_DIRECTION       0x0100
 468
 469 /* If set, assume designation states are reset at beginning of line on
 470    output.  */
 471 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 472
 473 /* If set, designation sequence should be placed at beginning of line
 474    on output.  */
 475 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 476
 477 /* If set, do not encode unsafe characters on output.  */
 478 #define CODING_ISO_FLAG_SAFE            0x0800
 479
 480 /* If set, extra latin codes (128..159) are accepted as a valid code
 481    on input.  */
 482 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 483 /* NOTE(review): undocumented here; presumably enables composition sequences -- confirm.  */
 484 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 485
 486 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 487 /* NOTE(review): undocumented here; presumably selects a JIS Roman set in place of ASCII -- confirm.  */
 488 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 489 /* NOTE(review): undocumented here; presumably selects an older JIS revision -- confirm.  */
 490 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 491 /* NOTE(review): undocumented here; the value skips from 0x10000 to 0x100000 -- confirm intent.  */
 492 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 493
 494 /* A character to be produced on output if encoding of the original
 495    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 496 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 497
 498 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 499 #define CODING_UTF_8_BOM(coding)        \
 500   ((coding)->spec.utf_8_bom)
 501
 502 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 503 #define CODING_UTF_16_BOM(coding)       \
 504   ((coding)->spec.utf_16.bom)
 505
 506 #define CODING_UTF_16_ENDIAN(coding)    \
 507   ((coding)->spec.utf_16.endian)
 508
 509 #define CODING_UTF_16_SURROGATE(coding) \
 510   ((coding)->spec.utf_16.surrogate)
 511
 512 /* The macros below fetch attributes of CODING through CODING_ID_ATTRS; CODING_CCL_VALIDS also applies SDATA.  */
 513 /* CCL section */
 514 #define CODING_CCL_DECODER(coding)      \
 515   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 516 #define CODING_CCL_ENCODER(coding)      \
 517   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 518 #define CODING_CCL_VALIDS(coding)                                          \
 519   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 520
 521 /* Index for each coding category in `coding_categories'.  */
 522 /* Each CATEGORY_MASK_XXX is 1 shifted left by the matching enumerator, so the order fixes the bit positions.  */
 523 enum coding_category
 524   {
 525     coding_category_iso_7,
 526     coding_category_iso_7_tight,
 527     coding_category_iso_8_1,
 528     coding_category_iso_8_2,
 529     coding_category_iso_7_else,
 530     coding_category_iso_8_else,
 531     coding_category_utf_8_auto,
 532     coding_category_utf_8_nosig,
 533     coding_category_utf_8_sig,
 534     coding_category_utf_16_auto,
 535     coding_category_utf_16_be,
 536     coding_category_utf_16_le,
 537     coding_category_utf_16_be_nosig,
 538     coding_category_utf_16_le_nosig,
 539     coding_category_charset,
 540     coding_category_sjis,
 541     coding_category_big5,
 542     coding_category_ccl,
 543     coding_category_emacs_mule,
 544     /* All above are targets of code detection.  */
 545     coding_category_raw_text,
 546     coding_category_undecided,
 547     coding_category_max         /* Number of categories; used as the size of coding_priorities and coding_categories.  */
 548   };
 549
 550 /* Definitions of flag bits used in detect_coding_XXXX.  */
 551 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 552 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 553 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 554 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 555 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 556 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 557 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 558 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 559 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 560 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 561 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 562 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 563 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 564 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 565 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 566 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 567 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 568 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 569 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 570 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 571 /* CATEGORY_MASK_ANY is the union of the bits above except CATEGORY_MASK_RAW_TEXT.  */
 572 /* This value is returned if detect_coding_mask () finds nothing other
 573    than ASCII characters.  */
 574 #define CATEGORY_MASK_ANY               \
 575   (CATEGORY_MASK_ISO_7                  \
 576    | CATEGORY_MASK_ISO_7_TIGHT          \
 577    | CATEGORY_MASK_ISO_8_1              \
 578    | CATEGORY_MASK_ISO_8_2              \
 579    | CATEGORY_MASK_ISO_7_ELSE           \
 580    | CATEGORY_MASK_ISO_8_ELSE           \
 581    | CATEGORY_MASK_UTF_8_AUTO           \
 582    | CATEGORY_MASK_UTF_8_NOSIG          \
 583    | CATEGORY_MASK_UTF_8_SIG            \
 584    | CATEGORY_MASK_UTF_16_AUTO          \
 585    | CATEGORY_MASK_UTF_16_BE            \
 586    | CATEGORY_MASK_UTF_16_LE            \
 587    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 588    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 589    | CATEGORY_MASK_CHARSET              \
 590    | CATEGORY_MASK_SJIS                 \
 591    | CATEGORY_MASK_BIG5                 \
 592    | CATEGORY_MASK_CCL                  \
 593    | CATEGORY_MASK_EMACS_MULE)
 594
 595 /* Unions of related category bits, grouped by ISO2022 variant, UTF-16 and UTF-8.  */
 596 #define CATEGORY_MASK_ISO_7BIT \
 597   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 598
 599 #define CATEGORY_MASK_ISO_8BIT \
 600   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 601
 602 #define CATEGORY_MASK_ISO_ELSE \
 603   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 604
 605 #define CATEGORY_MASK_ISO_ESCAPE        \
 606   (CATEGORY_MASK_ISO_7                  \
 607    | CATEGORY_MASK_ISO_7_TIGHT          \
 608    | CATEGORY_MASK_ISO_7_ELSE           \
 609    | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO       \
 612   (  CATEGORY_MASK_ISO_7BIT     \
 613      | CATEGORY_MASK_ISO_8BIT   \
 614      | CATEGORY_MASK_ISO_ELSE)
 615
 616 #define CATEGORY_MASK_UTF_16            \
 617   (CATEGORY_MASK_UTF_16_AUTO            \
 618    | CATEGORY_MASK_UTF_16_BE            \
 619    | CATEGORY_MASK_UTF_16_LE            \
 620    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 621    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 622
 623 #define CATEGORY_MASK_UTF_8     \
 624   (CATEGORY_MASK_UTF_8_AUTO     \
 625    | CATEGORY_MASK_UTF_8_NOSIG  \
 626    | CATEGORY_MASK_UTF_8_SIG)
 627
 628 /* Table of coding categories (Lisp symbols).  This variable is for
 629    internal use only.  */
 630 static Lisp_Object Vcoding_category_table;
 631
 632 /* Table of coding categories ordered by priority.  */
 633 static enum coding_category coding_priorities[coding_category_max];
 634
 635 /* Nth element is a coding context for the coding system bound to the
 636    Nth coding category.  */
 637 static struct coding_system coding_categories[coding_category_max];
 638
 639 /*** Commonly used macros and functions ***/
 640 /* Fallback min/max, defined only if not already defined; an argument may be evaluated twice.  */
 641 #ifndef min
 642 #define min(a, b) ((a) < (b) ? (a) : (b))
 643 #endif
 644 #ifndef max
 645 #define max(a, b) ((a) > (b) ? (a) : (b))
 646 #endif
 647 /* Set ATTRS to CODING_ID_ATTRS of CODING's id, and CHARSET_LIST to CODING_ATTR_CHARSET_LIST of ATTRS.  */
 648 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 649   do {                                                  \
 650     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 651     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 652   } while (0)
 653
 654
 655 /* Safely get one byte from the source text pointed by SRC which ends
 656    at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
 657    the source is exhausted, jump to 'no_more_source', first recording
 658    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If MULTIBYTEP
 659    and the byte has bit 0x80 set, a byte 0xC0 or 0xC1 is combined with
 660    the next byte into one value; any other byte makes C the negative
 661    character code and records CODING_RESULT_INVALID_SRC.  */
 662 /* The caller must provide: src, src_base, src_end, multibytep, consumed_chars, coding.  */
 663 #define ONE_MORE_BYTE(c)                                \
 664   do {                                                  \
 665     if (src == src_end)                                 \
 666       {                                                 \
 667         if (src_base < src)                             \
 668           record_conversion_result                      \
 669             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 670         goto no_more_source;                            \
 671       }                                                 \
 672     c = *src++;                                         \
 673     if (multibytep && (c & 0x80))                       \
 674       {                                                 \
 675         if ((c & 0xFE) == 0xC0)                         \
 676           c = ((c & 1) << 6) | *src++;                  \
 677         else                                            \
 678           {                                             \
 679             src--;                                      \
 680             c = - string_char (src, &src, NULL);        \
 681             record_conversion_result                    \
 682               (coding, CODING_RESULT_INVALID_SRC);      \
 683           }                                             \
 684       }                                                 \
 685     consumed_chars++;                                   \
 686   } while (0)
 687
 688 /* Safely get two bytes from the source text pointed by SRC which ends
 689    at SRC_END, and set C1 and C2 to those bytes while skipping the
 690    heading multibyte characters.  If there are not enough bytes in the
 691    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 692    a multibyte character is found for C2, set C2 to -1.  C1 must be
 693    able to hold -1, which marks a skipped character.  The caller should
 694    declare and set these variables appropriately in advance:
 695         src, src_end, multibytep
 696    It is intended that this macro is used in detect_coding_utf_16.  */
 697
 698 #define TWO_MORE_BYTES(c1, c2)                          \
 699   do {                                                  \
 700     do {                                                \
 701       if (src == src_end)                               \
 702         goto no_more_source;                            \
 703       c1 = *src++;                                      \
 704       if (multibytep && (c1 & 0x80))                    \
 705         {                                               \
 706           if ((c1 & 0xFE) == 0xC0)                      \
 707             c1 = ((c1 & 1) << 6) | *src++;              \
 708           else                                          \
 709             {                                           \
 710               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 711               c1 = -1;                                  \
 712             }                                           \
 713         }                                               \
 714     } while (c1 < 0);                                   \
 715     if (src == src_end)                                 \
 716       goto no_more_source;                              \
 717     c2 = *src++;                                        \
 718     if (multibytep && (c2 & 0x80))                      \
 719       {                                                 \
 720         if ((c2 & 0xFE) == 0xC0)                        \
 721           c2 = ((c2 & 1) << 6) | *src++;                \
 722         else                                            \
 723           c2 = -1;                                      \
 724       }                                                 \
 725   } while (0)
 726
 727
 728 /* Store a byte C in the place pointed by DST and increment DST to the
 729    next free point, and increment PRODUCED_CHARS.  The caller should
 730    assure that C is 0..127, and declare and set the variables `dst'
 731    and `produced_chars' appropriately in advance.
 732 */
 733
 734
 735 #define EMIT_ONE_ASCII_BYTE(c)  \
 736   do {                          \
 737     produced_chars++;           \
 738     *dst++ = (c);               \
 739   } while (0)
 740
 741
 742 /* Like EMIT_ONE_ASCII_BYTE but store two bytes, C1 then C2, and add two to PRODUCED_CHARS.  */
 743
 744 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 745   do {                                  \
 746     produced_chars += 2;                \
 747     *dst++ = (c1), *dst++ = (c2);       \
 748   } while (0)
 749
 750
 751 /* Store a byte C in the place pointed by DST and increment DST to the
 752    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 753    store C in multibyte form, first mapping a value >= 0x80 through
 754    BYTE8_TO_CHAR.  The caller should declare and set the variables
 755    `dst', `multibytep' and `produced_chars' appropriately in advance.  */
 756
 757 #define EMIT_ONE_BYTE(c)                \
 758   do {                                  \
 759     produced_chars++;                   \
 760     if (multibytep)                     \
 761       {                                 \
 762         unsigned ch = (c);              \
 763         if (ch >= 0x80)                 \
 764           ch = BYTE8_TO_CHAR (ch);      \
 765         CHAR_STRING_ADVANCE (ch, dst);  \
 766       }                                 \
 767     else                                \
 768       *dst++ = (c);                     \
 769   } while (0)
 770
 771
 772 /* Like EMIT_ONE_BYTE, but emit two bytes, C1 then C2, and add two to PRODUCED_CHARS.  */
 773
 774 #define EMIT_TWO_BYTES(c1, c2)          \
 775   do {                                  \
 776     produced_chars += 2;                \
 777     if (multibytep)                     \
 778       {                                 \
 779         unsigned ch;                    \
 780                                         \
 781         ch = (c1);                      \
 782         if (ch >= 0x80)                 \
 783           ch = BYTE8_TO_CHAR (ch);      \
 784         CHAR_STRING_ADVANCE (ch, dst);  \
 785         ch = (c2);                      \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789       }                                 \
 790     else                                \
 791       {                                 \
 792         *dst++ = (c1);                  \
 793         *dst++ = (c2);                  \
 794       }                                 \
 795   } while (0)
 796
 797
 798 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 799   do {                                  \
 800     EMIT_ONE_BYTE (c1);                 \
 801     EMIT_TWO_BYTES (c2, c3);            \
 802   } while (0)
 803
 804
 805 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 806   do {                                          \
 807     EMIT_TWO_BYTES (c1, c2);                    \
 808     EMIT_TWO_BYTES (c3, c4);                    \
 809   } while (0)
 810
 811
 812 static void
 813 record_conversion_result (struct coding_system *coding,
 814                           enum coding_result_code result)
 815 {
 816   coding->result = result;
 817   switch (result)
 818     {
 819     case CODING_RESULT_INSUFFICIENT_SRC:
 820       Vlast_code_conversion_error = Qinsufficient_source;
 821       break;
 822     case CODING_RESULT_INVALID_SRC:
 823       Vlast_code_conversion_error = Qinvalid_source;
 824       break;
 825     case CODING_RESULT_INTERRUPT:
 826       Vlast_code_conversion_error = Qinterrupted;
 827       break;
 828     case CODING_RESULT_INSUFFICIENT_DST:
 829       /* Don't record this error in Vlast_code_conversion_error
 830          because it happens just temporarily and is resolved when the
 831          whole conversion is finished.  */
 832       break;
 833     case CODING_RESULT_SUCCESS:
 834       break;
 835     default:
 836       Vlast_code_conversion_error = intern ("Unknown error");
 837     }
 838 }
 839
 840 /* These wrapper macros are used to preserve validity of pointers into
 841    buffer text across calls to decode_char, encode_char, etc, which
 842    could cause relocation of buffers if it loads a charset map,
 843    because loading a charset map allocates large structures.  */
 844
 845 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 846   do {                                                                       \
 847     ptrdiff_t offset;                                                        \
 848                                                                              \
 849     charset_map_loaded = 0;                                                  \
 850     c = DECODE_CHAR (charset, code);                                         \
 851     if (charset_map_loaded                                                   \
 852         && (offset = coding_change_source (coding)))                         \
 853       {                                                                      \
 854         src += offset;                                                       \
 855         src_base += offset;                                                  \
 856         src_end += offset;                                                   \
 857       }                                                                      \
 858   } while (0)
 859
 860 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 861   do {                                                                  \
 862     ptrdiff_t offset;                                                   \
 863                                                                         \
 864     charset_map_loaded = 0;                                             \
 865     code = ENCODE_CHAR (charset, c);                                    \
 866     if (charset_map_loaded                                              \
 867         && (offset = coding_change_destination (coding)))               \
 868       {                                                                 \
 869         dst += offset;                                                  \
 870         dst_end += offset;                                              \
 871       }                                                                 \
 872   } while (0)
 873
 874 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 875   do {                                                                  \
 876     ptrdiff_t offset;                                                   \
 877                                                                         \
 878     charset_map_loaded = 0;                                             \
 879     charset = char_charset (c, charset_list, code_return);              \
 880     if (charset_map_loaded                                              \
 881         && (offset = coding_change_destination (coding)))               \
 882       {                                                                 \
 883         dst += offset;                                                  \
 884         dst_end += offset;                                              \
 885       }                                                                 \
 886   } while (0)
 887
 888 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 889   do {                                                                  \
 890     ptrdiff_t offset;                                                   \
 891                                                                         \
 892     charset_map_loaded = 0;                                             \
 893     result = CHAR_CHARSET_P (c, charset);                               \
 894     if (charset_map_loaded                                              \
 895         && (offset = coding_change_destination (coding)))               \
 896       {                                                                 \
 897         dst += offset;                                                  \
 898         dst_end += offset;                                              \
 899       }                                                                 \
 900   } while (0)
 901
 902
 903 /* If there are at least BYTES length of room at dst, allocate memory
 904    for coding->destination and update dst and dst_end.  We don't have
 905    to take care of coding->source which will be relocated.  It is
 906    handled by calling coding_set_source in encode_coding.  */
 907
 908 #define ASSURE_DESTINATION(bytes)                               \
 909   do {                                                          \
 910     if (dst + (bytes) >= dst_end)                               \
 911       {                                                         \
 912         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 913                                                                 \
 914         dst = alloc_destination (coding, more_bytes, dst);      \
 915         dst_end = coding->destination + coding->dst_bytes;      \
 916       }                                                         \
 917   } while (0)
 918
 919
 920 /* Store multibyte form of the character C in P, and advance P to the
 921    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 922    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 923    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 924
 925 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 926
 927 /* Return the character code of character whose multibyte form is at
 928    P, and advance P to the end of the multibyte form.  This used to be
 929    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 930    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 931
 932 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 933
 934 /* Set coding->source from coding->src_object.  */
 935
 936 static void
 937 coding_set_source (struct coding_system *coding)
 938 {
 939   if (BUFFERP (coding->src_object))
 940     {
 941       struct buffer *buf = XBUFFER (coding->src_object);
 942
 943       if (coding->src_pos < 0)
 944         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 945       else
 946         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 947     }
 948   else if (STRINGP (coding->src_object))
 949     {
 950       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 951     }
 952   else
 953     {
 954       /* Otherwise, the source is C string and is never relocated
 955          automatically.  Thus we don't have to update anything.  */
 956     }
 957 }
 958
 959
 960 /* Set coding->source from coding->src_object, and return how many
 961    bytes coding->source was changed.  */
 962
 963 static ptrdiff_t
 964 coding_change_source (struct coding_system *coding)
 965 {
 966   const unsigned char *orig = coding->source;
 967   coding_set_source (coding);
 968   return coding->source - orig;
 969 }
 970
 971
 972 /* Set coding->destination from coding->dst_object.  */
 973
 974 static void
 975 coding_set_destination (struct coding_system *coding)
 976 {
 977   if (BUFFERP (coding->dst_object))
 978     {
 979       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
 980         {
 981           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
 982           coding->dst_bytes = (GAP_END_ADDR
 983                                - (coding->src_bytes - coding->consumed)
 984                                - coding->destination);
 985         }
 986       else
 987         {
 988           /* We are sure that coding->dst_pos_byte is before the gap
 989              of the buffer. */
 990           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 991                                  + coding->dst_pos_byte - BEG_BYTE);
 992           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 993                                - coding->destination);
 994         }
 995     }
 996   else
 997     {
 998       /* Otherwise, the destination is C string and is never relocated
 999          automatically.  Thus we don't have to update anything.  */
1000     }
1001 }
1002
1003
1004 /* Set coding->destination from coding->dst_object, and return how
1005    many bytes coding->destination was changed.  */
1006
1007 static ptrdiff_t
1008 coding_change_destination (struct coding_system *coding)
1009 {
1010   const unsigned char *orig = coding->destination;
1011   coding_set_destination (coding);
1012   return coding->destination - orig;
1013 }
1014
1015
1016 static void
1017 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1018 {
1019   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1020     string_overflow ();
1021   coding->destination = xrealloc (coding->destination,
1022                                   coding->dst_bytes + bytes);
1023   coding->dst_bytes += bytes;
1024 }
1025
1026 static void
1027 coding_alloc_by_making_gap (struct coding_system *coding,
1028                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1029 {
1030   if (EQ (coding->src_object, coding->dst_object))
1031     {
1032       /* The gap may contain the produced data at the head and not-yet
1033          consumed data at the tail.  To preserve those data, we at
1034          first make the gap size to zero, then increase the gap
1035          size.  */
1036       ptrdiff_t add = GAP_SIZE;
1037
1038       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1039       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1040       make_gap (bytes);
1041       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1042       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1043     }
1044   else
1045     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1046 }
1047
1048
1049 static unsigned char *
1050 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1051                    unsigned char *dst)
1052 {
1053   ptrdiff_t offset = dst - coding->destination;
1054
1055   if (BUFFERP (coding->dst_object))
1056     {
1057       struct buffer *buf = XBUFFER (coding->dst_object);
1058
1059       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1060     }
1061   else
1062     coding_alloc_by_realloc (coding, nbytes);
1063   coding_set_destination (coding);
1064   dst = coding->destination + offset;
1065   return dst;
1066 }
1067
1068 /** Macros for annotations.  */
1069
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the annotated text.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]

   NBYTES is the number of bytes specified in the header part of
   old-style emacs-mule encoding, or 0 for the other kinds of
   composition.

   METHOD is one of enum composition_method.

   The optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element,
   CHARSET-ID, follows.

   If ANNOTATION_MASK is 0, this annotation is just a placeholder used
   to recover from an invalid annotation, and should be skipped by
   produce_annotation.  */
1098
/* Maximum length, in charbuf elements, of the header of annotation
   data.  ADD_COMPOSITION_DATA stores five header elements
   (-LENGTH, ANNOTATION_MASK, NCHARS, NBYTES and METHOD), while
   ADD_CHARSET_DATA stores four.  */
#define MAX_ANNOTATION_LENGTH 5
1101
/* Store the three common header elements of an annotation at BUF,
   advancing BUF past them: -LEN, then MASK, then NCHARS.  Also set
   coding->annotated, so the caller must have a variable `coding' in
   scope.

   The expansion deliberately does not end with a semicolon, so that
   the macro behaves like a single statement and can be used as the
   unbraced body of an if/else.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1109
/* Store a composition annotation header at BUF, advancing BUF past
   it: the common header (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  Every argument is
   parenthesized in the expansion, so BUF may be any lvalue expression
   of pointer type, such as `*pp'.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1116
1117
/* Store a charset annotation at BUF, advancing BUF past it: the
   common header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  Every argument is parenthesized in the
   expansion, so BUF may be any lvalue expression of pointer type,
   such as `*pp'.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1123
1124 \f
1125 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1126
1127
1128
1129 \f
1130 /*** 3. UTF-8 ***/
1131
1132 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1133    Return true if a text is encoded in UTF-8.  */
1134
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits.  Each of them evaluates C exactly once.  */

/* C is less than 0x80, i.e. a one-octet sequence.  Note that any
   negative C satisfies this test too.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx: a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx: leading octet of a 2-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx: leading octet of a 3-octet sequence.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx: leading octet of a 4-octet sequence.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx: leading octet of a 5-octet sequence.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The three octets EF BB BF, i.e. U+FEFF encoded in UTF-8, which the
   code below treats as a byte order mark (signature).  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1145
/* Check whether the source of CODING looks like UTF-8 and record the
   verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  If an
   invalid sequence is seen, or if the last block ends in the middle
   of a sequence, CATEGORY_MASK_UTF_8 is added to
   DETECT_INFO->rejected and 0 is returned.  Otherwise the value is 1,
   and DETECT_INFO->found / ->rejected tell the with-signature and
   without-signature categories apart.

   The scan relies on ONE_MORE_BYTE (defined earlier in this file),
   which is what transfers control to the `no_more_source' label when
   the source is exhausted.  */

static bool
detect_coding_utf_8 (struct coding_system *coding,
                     struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  /* True if the very first bytes of the source are EF BB BF.  */
  bool bom_found = 0;
  /* True once at least one valid multi-octet sequence was seen.  */
  bool found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* SRC_BASE marks the start of the sequence being examined, so
         that a sequence cut short by the end of the source can be
         recognized at `no_more_source'.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* From here on, every further octet must be a trailing octet;
         leaving the loop by `break' means rejection.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = 1;
          /* Only a sequence at the very beginning of the source can
             be a signature.  */
          if (src_base == coding->source
              && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
            bom_found = 1;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = 1;
          continue;
        }
      /* C is not a valid leading octet of any supported length.  */
      break;
    }
  /* An invalid sequence was found.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* The source ended.  If it ended inside a sequence and no further
     block will follow, the text is not valid UTF-8.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  if (bom_found)
    {
      /* A leading EF BB BF (U+FEFF) doesn't necessarily mean a BOM,
         so the category without signature stays a candidate too.  */
      detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
    }
  else
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
      if (found)
        detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
    }
  return 1;
}
1228
1229
/* Decode the UTF-8 bytes of CODING->source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf
   at index CODING->charbuf_used.

   Decoding stops when the charbuf is full or when the source is
   exhausted; the latter is signalled by ONE_MORE_BYTE (defined
   earlier in this file), which is what transfers control to the
   `no_more_source' label.  On return, CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used reflect the last
   completely decoded sequence.

   A sequence that is malformed, overlong, a surrogate, or beyond
   MAX_CHAR is not decoded as a character: its first byte is stored
   alone (as a raw-byte character if it is not ASCII), CODING->errors
   is incremented, and decoding resumes at the following byte.  */

static void
decode_coding_utf_8 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the sequence currently being decoded.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
  bool multibytep = coding->src_multibyte;
  enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* With EOL_DOS, the byte prefetched after a CR, or -1 if none.  */
  int byte_after_cr = -1;

  if (bom != utf_without_bom)
    {
      int c1, c2, c3;

      /* Skip the three bytes EF BB BF if the source starts with them;
         in every other case rewind SRC so that the bytes read here
         are decoded as ordinary text by the loop below.  */
      src_base = src;
      ONE_MORE_BYTE (c1);
      if (! UTF_8_3_OCTET_LEADING_P (c1))
        src = src_base;
      else
        {
          ONE_MORE_BYTE (c2);
          if (! UTF_8_EXTRA_OCTET_P (c2))
            src = src_base;
          else
            {
              ONE_MORE_BYTE (c3);
              if (! UTF_8_EXTRA_OCTET_P (c3))
                src = src_base;
              else
                {
                  if ((c1 != UTF_8_BOM_1)
                      || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
                    src = src_base;
                  else
                    CODING_UTF_8_BOM (coding) = utf_without_bom;
                }
            }
        }
    }
  /* Whatever the outcome above, never look for a BOM again in later
     calls with this CODING.  */
  CODING_UTF_8_BOM (coding) = utf_without_bom;

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Remember where this sequence starts, both for the rewind at
         `invalid_code' and for the bookkeeping at `no_more_source'.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The charbuf is full.  A byte prefetched after a CR has
             not been decoded yet, so it must not count as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c1 = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value from ONE_MORE_BYTE is stored negated.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P (c1))
        {
          /* For DOS-style end-of-line, fetch the byte that follows a
             CR right away; it becomes C1 of the next iteration.  */
          if (eol_dos && c1 == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Besides overlong forms, reject codes
                             beyond the largest character code.  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Go back to the start of the sequence and emit just its first
         byte; the bytes after it are then decoded afresh.
         NOTE(review): when C1 was taken from byte_after_cr, SRC_BASE
         already points past that byte, so this rewind appears to
         resume one byte too late -- confirm.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Account only for what was completely decoded: everything before
     SRC_BASE.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1379
1380
1381 static bool
1382 encode_coding_utf_8 (struct coding_system *coding)
1383 {
1384   bool multibytep = coding->dst_multibyte;
1385   int *charbuf = coding->charbuf;
1386   int *charbuf_end = charbuf + coding->charbuf_used;
1387   unsigned char *dst = coding->destination + coding->produced;
1388   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1389   ptrdiff_t produced_chars = 0;
1390   int c;
1391
1392   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1393     {
1394       ASSURE_DESTINATION (3);
1395       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1396       CODING_UTF_8_BOM (coding) = utf_without_bom;
1397     }
1398
1399   if (multibytep)
1400     {
1401       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1402
1403       while (charbuf < charbuf_end)
1404         {
1405           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1406
1407           ASSURE_DESTINATION (safe_room);
1408           c = *charbuf++;
1409           if (CHAR_BYTE8_P (c))
1410             {
1411               c = CHAR_TO_BYTE8 (c);
1412               EMIT_ONE_BYTE (c);
1413             }
1414           else
1415             {
1416               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1417               for (p = str; p < pend; p++)
1418                 EMIT_ONE_BYTE (*p);
1419             }
1420         }
1421     }
1422   else
1423     {
1424       int safe_room = MAX_MULTIBYTE_LENGTH;
1425
1426       while (charbuf < charbuf_end)
1427         {
1428           ASSURE_DESTINATION (safe_room);
1429           c = *charbuf++;
1430           if (CHAR_BYTE8_P (c))
1431             *dst++ = CHAR_TO_BYTE8 (c);
1432           else
1433             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1434           produced_chars++;
1435         }
1436     }
1437   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1438   coding->produced_char += produced_chars;
1439   coding->produced = dst - coding->destination;
1440   return 0;
1441 }
1442
1443
1444 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1445    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1446
/* True if the top six of the low 16 bits of VAL are 110110, i.e. if
   those 16 bits lie in 0xD800..0xDBFF (a high surrogate).  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the top six of the low 16 bits of VAL are 110111, i.e. if
   those 16 bits lie in 0xDC00..0xDFFF (a low surrogate).  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1452
1453
/* Check whether the source of CODING looks like one of the UTF-16
   based coding systems and record the verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.  The
   first two bytes decide most of the outcome: FF FE selects the
   little-endian and auto categories, FE FF the big-endian and auto
   categories.  Without such a signature, the remaining bytes are
   scanned and a no-signature category is rejected when the bytes at
   the corresponding positions are too varied.

   The value is 1 when a signature was found or when the source ran
   out during the scan (TWO_MORE_BYTES transfers control to
   `no_more_source'), and 0 otherwise.  */

static bool
detect_coding_utf_16 (struct coding_system *coding,
                      struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  /* A final block of odd length is rejected outright.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  TWO_MORE_BYTES (c1, c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c2 < 0)
    {
      /* TWO_MORE_BYTES yields a negative C2 for something that is not
         a plain byte.  */
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }
  else
    {
      /* We check the dispersion of Eth and Oth bytes where E is even and
         O is odd.  If both are high, we assume binary data.*/
      /* E[B] (resp. O[B]) is nonzero once byte value B was seen as
         the first (resp. second) byte of a pair; E_NUM and O_NUM
         count the distinct values seen so far.  */
      unsigned char e[256], o[256];
      unsigned e_num = 1, o_num = 1;

      memset (e, 0, 256);
      memset (o, 0, 256);
      e[c1] = 1;
      o[c2] = 1;

      /* There is no signature, so the categories that require one are
         out.  */
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
                                |CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_LE);

      /* Scan until every UTF-16 category is rejected.  */
      while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
             != CATEGORY_MASK_UTF_16)
        {
          TWO_MORE_BYTES (c1, c2);
          if (c2 < 0)
            break;
          if (! e[c1])
            {
              e[c1] = 1;
              e_num++;
              /* 128 or more distinct first bytes: reject the
                 big-endian category without signature.  */
              if (e_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
            }
          if (! o[c2])
            {
              o[c2] = 1;
              o_num++;
              /* 128 or more distinct second bytes: reject the
                 little-endian category without signature.  */
              if (o_num >= 128)
                detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
            }
        }
      return 0;
    }

 no_more_source:
  return 1;
}
1536
1537 static void
1538 decode_coding_utf_16 (struct coding_system *coding)
1539 {
1540   const unsigned char *src = coding->source + coding->consumed;
1541   const unsigned char *src_end = coding->source + coding->src_bytes;
1542   const unsigned char *src_base;
1543   int *charbuf = coding->charbuf + coding->charbuf_used;
1544   /* We may produces at most 3 chars in one loop.  */
1545   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1546   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1547   bool multibytep = coding->src_multibyte;
1548   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1549   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1550   int surrogate = CODING_UTF_16_SURROGATE (coding);
1551   bool eol_dos
1552     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1553   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1554
1555   if (bom == utf_with_bom)
1556     {
1557       int c, c1, c2;
1558
1559       src_base = src;
1560       ONE_MORE_BYTE (c1);
1561       ONE_MORE_BYTE (c2);
1562       c = (c1 << 8) | c2;
1563
1564       if (endian == utf_16_big_endian
1565           ? c != 0xFEFF : c != 0xFFFE)
1566         {
1567           /* The first two bytes are not BOM.  Treat them as bytes
1568              for a normal character.  */
1569           src = src_base;
1570           coding->errors++;
1571         }
1572       CODING_UTF_16_BOM (coding) = utf_without_bom;
1573     }
1574   else if (bom == utf_detect_bom)
1575     {
1576       /* We have already tried to detect BOM and failed in
1577          detect_coding.  */
1578       CODING_UTF_16_BOM (coding) = utf_without_bom;
1579     }
1580
1581   while (1)
1582     {
1583       int c, c1, c2;
1584
1585       src_base = src;
1586       consumed_chars_base = consumed_chars;
1587
1588       if (charbuf >= charbuf_end)
1589         {
1590           if (byte_after_cr1 >= 0)
1591             src_base -= 2;
1592           break;
1593         }
1594
1595       if (byte_after_cr1 >= 0)
1596         c1 = byte_after_cr1, byte_after_cr1 = -1;
1597       else
1598         ONE_MORE_BYTE (c1);
1599       if (c1 < 0)
1600         {
1601           *charbuf++ = -c1;
1602           continue;
1603         }
1604       if (byte_after_cr2 >= 0)
1605         c2 = byte_after_cr2, byte_after_cr2 = -1;
1606       else
1607         ONE_MORE_BYTE (c2);
1608       if (c2 < 0)
1609         {
1610           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1611           *charbuf++ = -c2;
1612           continue;
1613         }
1614       c = (endian == utf_16_big_endian
1615            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1616
1617       if (surrogate)
1618         {
1619           if (! UTF_16_LOW_SURROGATE_P (c))
1620             {
1621               if (endian == utf_16_big_endian)
1622                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1623               else
1624                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1625               *charbuf++ = c1;
1626               *charbuf++ = c2;
1627               coding->errors++;
1628               if (UTF_16_HIGH_SURROGATE_P (c))
1629                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1630               else
1631                 *charbuf++ = c;
1632             }
1633           else
1634             {
1635               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1636               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1637               *charbuf++ = 0x10000 + c;
1638             }
1639         }
1640       else
1641         {
1642           if (UTF_16_HIGH_SURROGATE_P (c))
1643             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1644           else
1645             {
1646               if (eol_dos && c == '\r')
1647                 {
1648                   ONE_MORE_BYTE (byte_after_cr1);
1649                   ONE_MORE_BYTE (byte_after_cr2);
1650                 }
1651               *charbuf++ = c;
1652             }
1653         }
1654     }
1655
1656  no_more_source:
1657   coding->consumed_char += consumed_chars_base;
1658   coding->consumed = src_base - coding->source;
1659   coding->charbuf_used = charbuf - coding->charbuf;
1660 }
1661
1662 static bool
1663 encode_coding_utf_16 (struct coding_system *coding)
1664 {
1665   bool multibytep = coding->dst_multibyte;
1666   int *charbuf = coding->charbuf;
1667   int *charbuf_end = charbuf + coding->charbuf_used;
1668   unsigned char *dst = coding->destination + coding->produced;
1669   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1670   int safe_room = 8;
1671   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1672   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1673   ptrdiff_t produced_chars = 0;
1674   int c;
1675
1676   if (bom != utf_without_bom)
1677     {
1678       ASSURE_DESTINATION (safe_room);
1679       if (big_endian)
1680         EMIT_TWO_BYTES (0xFE, 0xFF);
1681       else
1682         EMIT_TWO_BYTES (0xFF, 0xFE);
1683       CODING_UTF_16_BOM (coding) = utf_without_bom;
1684     }
1685
1686   while (charbuf < charbuf_end)
1687     {
1688       ASSURE_DESTINATION (safe_room);
1689       c = *charbuf++;
1690       if (c > MAX_UNICODE_CHAR)
1691         c = coding->default_char;
1692
1693       if (c < 0x10000)
1694         {
1695           if (big_endian)
1696             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1697           else
1698             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1699         }
1700       else
1701         {
1702           int c1, c2;
1703
1704           c -= 0x10000;
1705           c1 = (c >> 10) + 0xD800;
1706           c2 = (c & 0x3FF) + 0xDC00;
1707           if (big_endian)
1708             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1709           else
1710             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1711         }
1712     }
1713   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1714   coding->produced = dst - coding->destination;
1715   coding->produced_char += produced_chars;
1716   return 0;
1717 }
1718
1719 \f
1720 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1721
1722 /* Emacs' internal format for representation of multiple character
1723    sets is a kind of multi-byte encoding, i.e. characters are
1724    represented by variable-length sequences of one-byte codes.
1725
1726    ASCII characters and control characters (e.g. `tab', `newline') are
1727    represented by one-byte sequences which are their ASCII codes, in
1728    the range 0x00 through 0x7F.
1729
1730    8-bit characters of the range 0x80..0x9F are represented by
1731    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1732    code + 0x20).
1733
1734    8-bit characters of the range 0xA0..0xFF are represented by
1735    one-byte sequences which are their 8-bit code.
1736
1737    The other characters are represented by a sequence of `base
1738    leading-code', optional `extended leading-code', and one or two
1739    `position-code's.  The length of the sequence is determined by the
1740    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1741    whereas extended leading-code and position-code take the range 0xA0
1742    through 0xFF.  See `charset.h' for more details about leading-code
1743    and position-code.
1744
1745    --- CODE RANGE of Emacs' internal format ---
1746    character set        range
1747    -------------        -----
1748    ascii                0x00..0x7F
1749    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
   eight-bit-graphic    0xA0..0xFF
1751    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1752    ---------------------------------------------
1753
1754    As this is the internal character representation, the format is
1755    usually not used externally (i.e. in a file or in a data sent to a
1756    process).  But, it is possible to have a text externally in this
1757    format (i.e. by encoding by the coding system `emacs-mule').
1758
1759    In that case, a sequence of one-byte codes has a slightly different
1760    form.
1761
1762    At first, all characters in eight-bit-control are represented by
1763    one-byte sequences which are their 8-bit code.
1764
1765    Next, character composition data are represented by the byte
1766    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1767    where,
        METHOD is 0xF2 plus one of the composition methods (enum
        composition_method),
1770
1771         BYTES is 0xA0 plus a byte length of this composition data,
1772
1773         CHARS is 0xA0 plus a number of characters composed by this
1774         data,
1775
        COMPONENTs are characters of multibyte form or composition
        rules encoded by two bytes of ASCII codes.
1778
1779    In addition, for backward compatibility, the following formats are
1780    also recognized as composition data on decoding.
1781
1782    0x80 MSEQ ...
1783    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1784
1785    Here,
        MSEQ is a multibyte form but in one of these special formats:
1787           ASCII: 0xA0 ASCII_CODE+0x80,
1788           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1789         RULE is a one byte code of the range 0xA0..0xF0 that
1790         represents a composition rule.
1791   */
1792
/* Table indexed by a byte value.  The code below uses the entry for
   the first byte of an emacs-mule sequence as the total number of
   bytes in that sequence: detect_coding_emacs_mule expects
   `emacs_mule_bytes[c] - 1' following bytes, and emacs_mule_char
   handles the values 1 through 4 and aborts on any other value.  The
   code that fills the table is not in this part of the file.  */
char emacs_mule_bytes[256];
1794
1795
1796 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1797    Return true if a text is encoded in 'emacs-mule'.  */
1798
1799 static bool
1800 detect_coding_emacs_mule (struct coding_system *coding,
1801                           struct coding_detection_info *detect_info)
1802 {
1803   const unsigned char *src = coding->source, *src_base;
1804   const unsigned char *src_end = coding->source + coding->src_bytes;
1805   bool multibytep = coding->src_multibyte;
1806   ptrdiff_t consumed_chars = 0;
1807   int c;
1808   int found = 0;
1809
1810   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1811   /* A coding system of this category is always ASCII compatible.  */
1812   src += coding->head_ascii;
1813
1814   while (1)
1815     {
1816       src_base = src;
1817       ONE_MORE_BYTE (c);
1818       if (c < 0)
1819         continue;
1820       if (c == 0x80)
1821         {
1822           /* Perhaps the start of composite character.  We simply skip
1823              it because analyzing it is too heavy for detecting.  But,
1824              at least, we check that the composite character
1825              constitutes of more than 4 bytes.  */
1826           const unsigned char *src_start;
1827
1828         repeat:
1829           src_start = src;
1830           do
1831             {
1832               ONE_MORE_BYTE (c);
1833             }
1834           while (c >= 0xA0);
1835
1836           if (src - src_start <= 4)
1837             break;
1838           found = CATEGORY_MASK_EMACS_MULE;
1839           if (c == 0x80)
1840             goto repeat;
1841         }
1842
1843       if (c < 0x80)
1844         {
1845           if (c < 0x20
1846               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1847             break;
1848         }
1849       else
1850         {
1851           int more_bytes = emacs_mule_bytes[c] - 1;
1852
1853           while (more_bytes > 0)
1854             {
1855               ONE_MORE_BYTE (c);
1856               if (c < 0xA0)
1857                 {
1858                   src--;        /* Unread the last byte.  */
1859                   break;
1860                 }
1861               more_bytes--;
1862             }
1863           if (more_bytes != 0)
1864             break;
1865           found = CATEGORY_MASK_EMACS_MULE;
1866         }
1867     }
1868   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1869   return 0;
1870
1871  no_more_source:
1872   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1873     {
1874       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1875       return 0;
1876     }
1877   detect_info->found |= found;
1878   return 1;
1879 }
1880
1881
1882 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1883    character.  If CMP_STATUS indicates that we must expect MSEQ or
1884    RULE described above, decode it and return the negative value of
1885    the decoded character or rule.  If an invalid byte is found, return
1886    -1.  If SRC is too short, return -2.  */
1887
1888 static int
1889 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1890                  int *nbytes, int *nchars, int *id,
1891                  struct composition_status *cmp_status)
1892 {
1893   const unsigned char *src_end = coding->source + coding->src_bytes;
1894   const unsigned char *src_base = src;
1895   bool multibytep = coding->src_multibyte;
1896   int charset_ID;
1897   unsigned code;
1898   int c;
1899   int consumed_chars = 0;
1900   bool mseq_found = 0;
1901
1902   ONE_MORE_BYTE (c);
1903   if (c < 0)
1904     {
1905       c = -c;
1906       charset_ID = emacs_mule_charset[0];
1907     }
1908   else
1909     {
1910       if (c >= 0xA0)
1911         {
1912           if (cmp_status->state != COMPOSING_NO
1913               && cmp_status->old_form)
1914             {
1915               if (cmp_status->state == COMPOSING_CHAR)
1916                 {
1917                   if (c == 0xA0)
1918                     {
1919                       ONE_MORE_BYTE (c);
1920                       c -= 0x80;
1921                       if (c < 0)
1922                         goto invalid_code;
1923                     }
1924                   else
1925                     c -= 0x20;
1926                   mseq_found = 1;
1927                 }
1928               else
1929                 {
1930                   *nbytes = src - src_base;
1931                   *nchars = consumed_chars;
1932                   return -c;
1933                 }
1934             }
1935           else
1936             goto invalid_code;
1937         }
1938
1939       switch (emacs_mule_bytes[c])
1940         {
1941         case 2:
1942           if ((charset_ID = emacs_mule_charset[c]) < 0)
1943             goto invalid_code;
1944           ONE_MORE_BYTE (c);
1945           if (c < 0xA0)
1946             goto invalid_code;
1947           code = c & 0x7F;
1948           break;
1949
1950         case 3:
1951           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1952               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1953             {
1954               ONE_MORE_BYTE (c);
1955               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
1956                 goto invalid_code;
1957               ONE_MORE_BYTE (c);
1958               if (c < 0xA0)
1959                 goto invalid_code;
1960               code = c & 0x7F;
1961             }
1962           else
1963             {
1964               if ((charset_ID = emacs_mule_charset[c]) < 0)
1965                 goto invalid_code;
1966               ONE_MORE_BYTE (c);
1967               if (c < 0xA0)
1968                 goto invalid_code;
1969               code = (c & 0x7F) << 8;
1970               ONE_MORE_BYTE (c);
1971               if (c < 0xA0)
1972                 goto invalid_code;
1973               code |= c & 0x7F;
1974             }
1975           break;
1976
1977         case 4:
1978           ONE_MORE_BYTE (c);
1979           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
1980             goto invalid_code;
1981           ONE_MORE_BYTE (c);
1982           if (c < 0xA0)
1983             goto invalid_code;
1984           code = (c & 0x7F) << 8;
1985           ONE_MORE_BYTE (c);
1986           if (c < 0xA0)
1987             goto invalid_code;
1988           code |= c & 0x7F;
1989           break;
1990
1991         case 1:
1992           code = c;
1993           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
1994           break;
1995
1996         default:
1997           emacs_abort ();
1998         }
1999       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2000                           CHARSET_FROM_ID (charset_ID), code, c);
2001       if (c < 0)
2002         goto invalid_code;
2003     }
2004   *nbytes = src - src_base;
2005   *nchars = consumed_chars;
2006   if (id)
2007     *id = charset_ID;
2008   return (mseq_found ? -c : c);
2009
2010  no_more_source:
2011   return -2;
2012
2013  invalid_code:
2014   return -1;
2015 }
2016
2017
2018 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2019
/* Handle these composition sequences ('|': the end of header elements,
   BYTES and CHARS >= 0xA0):
2022
2023    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2024    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2025    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2026
   and these old forms:
2028
2029    (4) relative composition: 0x80 | MSEQ ... MSEQ
2030    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2031
2032    When the starter 0x80 and the following header elements are found,
2033    this annotation header is produced.
2034
2035         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2036
2037    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2038    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2039
2040    Then, upon reading the following elements, these codes are produced
2041    until the composition end is found:
2042
2043    (1) CHAR ... CHAR
2044    (2) ALT ... ALT CHAR ... CHAR
2045    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2046    (4) CHAR ... CHAR
2047    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2048
   When the composition end is found, LENGTH and NCHARS in the
   annotation header are updated as below:
2051
2052    (1) LENGTH: unchanged, NCHARS: unchanged
2053    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2054    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2055    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2056    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2057
2058    If an error is found while composing, the annotation header is
2059    changed to the original composition header (plus filler -1s) as
2060    below:
2061
2062    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
   (5)          [ 0x80 0xFF -1 -1 -1 ]
2064
2065    and the sequence [ -2 DECODED-RULE ] is changed to the original
2066    byte sequence as below:
2067         o the original byte sequence is B: [ B -1 ]
2068         o the original byte sequence is B1 B2: [ B1 B2 ]
2069
2070    Most of the routines are implemented by macros because many
2071    variables and labels in the caller decode_coding_emacs_mule must be
2072    accessible, and they are usually called just once (thus doesn't
2073    increase the size of compiled object).  */
2074
/* Decode the composition rule that the byte C stands for in a
   composition sequence of Emacs 20 style, and set RULE to the result.
   C is modified in place.  If C is not in the range 0xA0..0xF0, jump
   to the label `invalid_code' of the enclosing function.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)          \
  do {                                                          \
    int ref_g, ref_n;                                           \
                                                                \
    c -= 0xA0;                                                  \
    if (! (0 <= c && c < 81))                                   \
      goto invalid_code;                                        \
    /* Two base-9 digits; the digit 4 stands for 10.  */        \
    ref_g = (c / 9 == 4 ? 10 : c / 9);                          \
    ref_n = (c % 9 == 4 ? 10 : c % 9);                          \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);              \
  } while (0)
2091
2092
/* Decode the composition rule given by C and by the byte that follows
   at SRC in a composition sequence of Emacs 21 style, and set RULE to
   the result.  The following byte is read into C.  Each byte minus
   0x20 must be in the range 0..80; otherwise jump to the label
   `invalid_code' of the enclosing function.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int ref_g = c - 0x20;                               \
    int ref_n;                                          \
                                                        \
    if (! (0 <= ref_g && ref_g < 81))                   \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    ref_n = c - 0x20;                                   \
    if (! (0 <= ref_n && ref_n < 81))                   \
      goto invalid_code;                                \
    rule = COMPOSITION_ENCODE_RULE (ref_g, ref_n);      \
  } while (0)
2110
2111
/* Start of Emacs 21 style format.  On entry C holds the method byte,
   i.e. 0xF2 plus the composition method.  The next two bytes at SRC
   are read: BYTES, which is 0xA0 plus the byte length of this
   composition information, and CHARS, which is 0xA0 plus the number
   of characters composed.  After validating them, initialize
   *CMP_STATUS and add the composition annotation header to CHARBUF.
   Jump to `invalid_code' if a value is out of range.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method cmp_method = c - 0xF2;                      \
    int byte_len, char_count;                                           \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    byte_len = c - 0xA0;                                                \
    if (byte_len < 3)                                                   \
      goto invalid_code;                                                \
    if (cmp_method == COMPOSITION_RELATIVE && byte_len != 4)            \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    char_count = c - 0xA0;                                              \
    if (char_count <= 0 || char_count >= MAX_COMPOSITION_COMPONENTS)    \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = cmp_method;                                    \
    cmp_status->state = (cmp_method == COMPOSITION_RELATIVE             \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = char_count;                                    \
    cmp_status->ncomps = byte_len - 4;                                  \
    ADD_COMPOSITION_DATA (charbuf, char_count, byte_len, cmp_method);   \
  } while (0)
2143
2144
/* Start of Emacs 20 style format for relative composition.  Reset
   *CMP_STATUS to the beginning of an old-form relative composition
   and add an annotation header with zero NCHARS and NBYTES to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2156
2157
/* Start of Emacs 20 style format for rule-base composition.  Reset
   *CMP_STATUS to the beginning of an old-form composition with rules
   and add an annotation header with zero NCHARS and NBYTES to
   CHARBUF.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->old_form = 1;                                   \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->ncomps = 0;                                     \
    cmp_status->nchars = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2169
2170
/* Handle the byte that follows the composition starter 0x80.  Read it
   into C and begin the kind of composition it announces:
     0xF2 + a composition method: Emacs 21 style;
     0xFF: Emacs 20 style rule-base composition;
     0xA0..0xBF: Emacs 20 style relative composition, in which case
       the byte is itself the beginning of the first component and
       SRC is moved back so that it is read again.
   For any other byte, jump to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()           \
  do {                                                  \
    const unsigned char *after_starter = src;           \
                                                        \
    ONE_MORE_BYTE (c);                                  \
    if (c < 0)                                          \
      goto invalid_code;                                \
    if (COMPOSITION_RELATIVE <= c - 0xF2                \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
      DECODE_EMACS_MULE_21_COMPOSITION ();              \
    else if (c == 0xFF)                                 \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
    else if (0xA0 <= c && c < 0xC0)                     \
      {                                                 \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
        /* Re-read C as a composition component.  */    \
        src = after_starter;                            \
      }                                                 \
    else                                                \
      goto invalid_code;                                \
  } while (0)
2194
/* Finish a composition that ended normally.  The annotation header
   starts CMP_STATUS->length elements before CHARBUF.  For the old
   form, store the number of characters in its NCHARS slot; for a new
   form method other than relative composition, set its LENGTH slot to
   NCHARS minus CMP_STATUS->length.  Then leave composing state.  */

#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int hdr = - cmp_status->length;                                     \
                                                                        \
    if (cmp_status->old_form)                                           \
      charbuf[hdr + 2] = cmp_status->nchars;                            \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      charbuf[hdr] = charbuf[hdr + 2] - cmp_status->length;             \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
2205
2206
2207 static int
2208 emacs_mule_finish_composition (int *charbuf,
2209                                struct composition_status *cmp_status)
2210 {
2211   int idx = - cmp_status->length;
2212   int new_chars;
2213
2214   if (cmp_status->old_form && cmp_status->nchars > 0)
2215     {
2216       charbuf[idx + 2] = cmp_status->nchars;
2217       new_chars = 0;
2218       if (cmp_status->method == COMPOSITION_WITH_RULE
2219           && cmp_status->state == COMPOSING_CHAR)
2220         {
2221           /* The last rule was invalid.  */
2222           int rule = charbuf[-1] + 0xA0;
2223
2224           charbuf[-2] = BYTE8_TO_CHAR (rule);
2225           charbuf[-1] = -1;
2226           new_chars = 1;
2227         }
2228     }
2229   else
2230     {
2231       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2232
2233       if (cmp_status->method == COMPOSITION_WITH_RULE)
2234         {
2235           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2236           charbuf[idx++] = -3;
2237           charbuf[idx++] = 0;
2238           new_chars = 1;
2239         }
2240       else
2241         {
2242           int nchars = charbuf[idx + 1] + 0xA0;
2243           int nbytes = charbuf[idx + 2] + 0xA0;
2244
2245           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2246           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2247           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2248           charbuf[idx++] = -1;
2249           new_chars = 4;
2250         }
2251     }
2252   cmp_status->state = COMPOSING_NO;
2253   return new_chars;
2254 }
2255
/* If a composition is in progress, close it with
   emacs_mule_finish_composition and advance CHAR_OFFSET by the number
   of characters that call reports.  */
#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                           \
  do {                                                                  \
    if (cmp_status->state == COMPOSING_NO)                              \
      break;                                                            \
    char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
  } while (0)
2261
2262
2263 static void
2264 decode_coding_emacs_mule (struct coding_system *coding)
2265 {
2266   const unsigned char *src = coding->source + coding->consumed;
2267   const unsigned char *src_end = coding->source + coding->src_bytes;
2268   const unsigned char *src_base;
2269   int *charbuf = coding->charbuf + coding->charbuf_used;
2270   /* We may produce two annotations (charset and composition) in one
2271      loop and one more charset annotation at the end.  */
2272   int *charbuf_end
2273     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2274       /* We can produce up to 2 characters in a loop.  */
2275       - 1;
2276   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2277   bool multibytep = coding->src_multibyte;
2278   ptrdiff_t char_offset = coding->produced_char;
2279   ptrdiff_t last_offset = char_offset;
2280   int last_id = charset_ascii;
2281   bool eol_dos
2282     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2283   int byte_after_cr = -1;
2284   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2285
2286   if (cmp_status->state != COMPOSING_NO)
2287     {
2288       int i;
2289
2290       if (charbuf_end - charbuf < cmp_status->length)
2291         emacs_abort ();
2292       for (i = 0; i < cmp_status->length; i++)
2293         *charbuf++ = cmp_status->carryover[i];
2294       coding->annotated = 1;
2295     }
2296
2297   while (1)
2298     {
2299       int c, id IF_LINT (= 0);
2300
2301       src_base = src;
2302       consumed_chars_base = consumed_chars;
2303
2304       if (charbuf >= charbuf_end)
2305         {
2306           if (byte_after_cr >= 0)
2307             src_base--;
2308           break;
2309         }
2310
2311       if (byte_after_cr >= 0)
2312         c = byte_after_cr, byte_after_cr = -1;
2313       else
2314         ONE_MORE_BYTE (c);
2315
2316       if (c < 0 || c == 0x80)
2317         {
2318           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2319           if (c < 0)
2320             {
2321               *charbuf++ = -c;
2322               char_offset++;
2323             }
2324           else
2325             DECODE_EMACS_MULE_COMPOSITION_START ();
2326           continue;
2327         }
2328
2329       if (c < 0x80)
2330         {
2331           if (eol_dos && c == '\r')
2332             ONE_MORE_BYTE (byte_after_cr);
2333           id = charset_ascii;
2334           if (cmp_status->state != COMPOSING_NO)
2335             {
2336               if (cmp_status->old_form)
2337                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2338               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2339                 cmp_status->ncomps--;
2340             }
2341         }
2342       else
2343         {
2344           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2345           /* emacs_mule_char can load a charset map from a file, which
2346              allocates a large structure and might cause buffer text
2347              to be relocated as result.  Thus, we need to remember the
2348              original pointer to buffer text, and fix up all related
2349              pointers after the call.  */
2350           const unsigned char *orig = coding->source;
2351           ptrdiff_t offset;
2352
2353           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2354                                cmp_status);
2355           offset = coding->source - orig;
2356           if (offset)
2357             {
2358               src += offset;
2359               src_base += offset;
2360               src_end += offset;
2361             }
2362           if (c < 0)
2363             {
2364               if (c == -1)
2365                 goto invalid_code;
2366               if (c == -2)
2367                 break;
2368             }
2369           src = src_base + nbytes;
2370           consumed_chars = consumed_chars_base + nchars;
2371           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2372             cmp_status->ncomps -= nchars;
2373         }
2374
2375       /* Now if C >= 0, we found a normally encoded character, if C <
2376          0, we found an old-style composition component character or
2377          rule.  */
2378
2379       if (cmp_status->state == COMPOSING_NO)
2380         {
2381           if (last_id != id)
2382             {
2383               if (last_id != charset_ascii)
2384                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2385                                   last_id);
2386               last_id = id;
2387               last_offset = char_offset;
2388             }
2389           *charbuf++ = c;
2390           char_offset++;
2391         }
2392       else if (cmp_status->state == COMPOSING_CHAR)
2393         {
2394           if (cmp_status->old_form)
2395             {
2396               if (c >= 0)
2397                 {
2398                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2399                   *charbuf++ = c;
2400                   char_offset++;
2401                 }
2402               else
2403                 {
2404                   *charbuf++ = -c;
2405                   cmp_status->nchars++;
2406                   cmp_status->length++;
2407                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2408                     EMACS_MULE_COMPOSITION_END ();
2409                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2410                     cmp_status->state = COMPOSING_RULE;
2411                 }
2412             }
2413           else
2414             {
2415               *charbuf++ = c;
2416               cmp_status->length++;
2417               cmp_status->nchars--;
2418               if (cmp_status->nchars == 0)
2419                 EMACS_MULE_COMPOSITION_END ();
2420             }
2421         }
2422       else if (cmp_status->state == COMPOSING_RULE)
2423         {
2424           int rule;
2425
2426           if (c >= 0)
2427             {
2428               EMACS_MULE_COMPOSITION_END ();
2429               *charbuf++ = c;
2430               char_offset++;
2431             }
2432           else
2433             {
2434               c = -c;
2435               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2436               if (rule < 0)
2437                 goto invalid_code;
2438               *charbuf++ = -2;
2439               *charbuf++ = rule;
2440               cmp_status->length += 2;
2441               cmp_status->state = COMPOSING_CHAR;
2442             }
2443         }
2444       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2445         {
2446           *charbuf++ = c;
2447           cmp_status->length++;
2448           if (cmp_status->ncomps == 0)
2449             cmp_status->state = COMPOSING_CHAR;
2450           else if (cmp_status->ncomps > 0)
2451             {
2452               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2453                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2454             }
2455           else
2456             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2457         }
2458       else                      /* COMPOSING_COMPONENT_RULE */
2459         {
2460           int rule;
2461
2462           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2463           if (rule < 0)
2464             goto invalid_code;
2465           *charbuf++ = -2;
2466           *charbuf++ = rule;
2467           cmp_status->length += 2;
2468           cmp_status->ncomps--;
2469           if (cmp_status->ncomps > 0)
2470             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2471           else
2472             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2473         }
2474       continue;
2475
2476     invalid_code:
2477       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2478       src = src_base;
2479       consumed_chars = consumed_chars_base;
2480       ONE_MORE_BYTE (c);
2481       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2482       char_offset++;
2483       coding->errors++;
2484     }
2485
2486  no_more_source:
2487   if (cmp_status->state != COMPOSING_NO)
2488     {
2489       if (coding->mode & CODING_MODE_LAST_BLOCK)
2490         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2491       else
2492         {
2493           int i;
2494
2495           charbuf -= cmp_status->length;
2496           for (i = 0; i < cmp_status->length; i++)
2497             cmp_status->carryover[i] = charbuf[i];
2498         }
2499     }
2500   if (last_id != charset_ascii)
2501     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2502   coding->consumed_char += consumed_chars_base;
2503   coding->consumed = src_base - coding->source;
2504   coding->charbuf_used = charbuf - coding->charbuf;
2505 }
2506
2507
     /* Store in CODES[0] and CODES[1] the leading code(s) selected by
        the emacs-mule charset ID:

          ID < 0xA0          CODES[0] = ID,   CODES[1] = 0
          0xA0 <= ID < 0xE0  CODES[0] = 0x9A, CODES[1] = ID
          0xE0 <= ID < 0xF0  CODES[0] = 0x9B, CODES[1] = ID
          0xF0 <= ID < 0xF5  CODES[0] = 0x9C, CODES[1] = ID
          otherwise          CODES[0] = 0x9D, CODES[1] = ID

        CODES must therefore have room for two elements.  ID and CODES
        are not parenthesized in the expansion and are evaluated more
        than once, so pass simple expressions without side effects.

        NOTE(review): the definition ends in `while (0);', so an
        invocation that is itself followed by `;' expands to two
        statements.  That is harmless at the call in
        encode_coding_emacs_mule, but it would break a use as the
        unbraced body of an `if' that has an `else'.  */
2508 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2509   do {                                          \
2510     if (id < 0xA0)                              \
2511       codes[0] = id, codes[1] = 0;              \
2512     else if (id < 0xE0)                         \
2513       codes[0] = 0x9A, codes[1] = id;           \
2514     else if (id < 0xF0)                         \
2515       codes[0] = 0x9B, codes[1] = id;           \
2516     else if (id < 0xF5)                         \
2517       codes[0] = 0x9C, codes[1] = id;           \
2518     else                                        \
2519       codes[0] = 0x9D, codes[1] = id;           \
2520   } while (0);
2521
2522
     /* Encode the elements of CODING->charbuf (characters interleaved
        with annotations) in emacs-mule format.  Output starts at
        CODING->destination + CODING->produced.

        The charset list of the coding system is forced to
        Vemacs_mule_charset_list.  When the loop runs to completion the
        result is recorded as CODING_RESULT_SUCCESS, CODING->produced_char
        and CODING->produced are updated, and 0 is returned.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
        elsewhere; presumably they use DST, DST_END, MULTIBYTEP and
        PRODUCED_CHARS from this scope and may leave the loop early when
        the destination is full -- confirm against their definitions.  */
2523 static bool
2524 encode_coding_emacs_mule (struct coding_system *coding)
2525 {
     /* Not referenced by name below; presumably consumed by the
        EMIT_* macros -- TODO confirm.  */
2526   bool multibytep = coding->dst_multibyte;
2527   int *charbuf = coding->charbuf;
2528   int *charbuf_end = charbuf + coding->charbuf_used;
2529   unsigned char *dst = coding->destination + coding->produced;
2530   unsigned char *dst_end = coding->destination + coding->dst_bytes;
     /* Room requested from ASSURE_DESTINATION before each element.  */
2531   int safe_room = 8;
2532   ptrdiff_t produced_chars = 0;
2533   Lisp_Object attrs, charset_list;
2534   int c;
     /* Charset ID taken from the most recent charset annotation, or -1
        when there is none or it is not in CHARSET_LIST.  */
2535   int preferred_charset_id = -1;
2536
2537   CODING_GET_INFO (coding, attrs, charset_list);
     /* Always encode against the emacs-mule charset list, and store
        that list back into ATTRS.  */
2538   if (! EQ (charset_list, Vemacs_mule_charset_list))
2539     {
2540       charset_list = Vemacs_mule_charset_list;
2541       ASET (attrs, coding_attr_charset_list, charset_list);
2542     }
2543
2544   while (charbuf < charbuf_end)
2545     {
2546       ASSURE_DESTINATION (safe_room);
2547       c = *charbuf++;
2548
2549       if (c < 0)
2550         {
2551           /* Handle an annotation.  C is the negated annotation length
                  and CHARBUF now points at the element that follows it,
                  which is dispatched on as the annotation type.  */
2552           switch (*charbuf)
2553             {
2554             case CODING_ANNOTATE_COMPOSITION_MASK:
2555               /* Not yet implemented.  */
2556               break;
2557             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): index 3 is relative to the type
                      element.  The annotation layout is defined where
                      annotations are produced, outside this function;
                      confirm that the charset ID really lives at this
                      offset.  */
2558               preferred_charset_id = charbuf[3];
2559               if (preferred_charset_id >= 0
2560                   && NILP (Fmemq (make_number (preferred_charset_id),
2561                                   charset_list)))
2562                 preferred_charset_id = -1;
2563               break;
2564             default:
2565               emacs_abort ();
2566             }
               /* Skip the rest of the annotation: its length is -C and
                  one element (the length itself) was already consumed.  */
2567           charbuf += -c - 1;
2568           continue;
2569         }
2570
2571       if (ASCII_CHAR_P (c))
2572         EMIT_ONE_ASCII_BYTE (c);
2573       else if (CHAR_BYTE8_P (c))
2574         {
               /* A raw 8-bit byte is emitted as that byte.  */
2575           c = CHAR_TO_BYTE8 (c);
2576           EMIT_ONE_BYTE (c);
2577         }
2578       else
2579         {
2580           struct charset *charset;
2581           unsigned code;
2582           int dimension;
2583           int emacs_mule_id;
2584           unsigned char leading_codes[2];
2585
               /* Try the preferred charset first; if it cannot encode C,
                  fall back to searching CHARSET_LIST.  */
2586           if (preferred_charset_id >= 0)
2587             {
2588               bool result;
2589
2590               charset = CHARSET_FROM_ID (preferred_charset_id);
2591               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2592               if (result)
2593                 code = ENCODE_CHAR (charset, c);
2594               else
2595                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2596                                      &code, charset);
2597             }
2598           else
2599             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2600                                  &code, charset);
2601           if (! charset)
2602             {
                   /* No charset found for C: encode CODING->default_char
                      in its place.  */
2603               c = coding->default_char;
2604               if (ASCII_CHAR_P (c))
2605                 {
2606                   EMIT_ONE_ASCII_BYTE (c);
2607                   continue;
2608                 }
2609               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2610                                    &code, charset);
                   /* NOTE(review): CHARSET is used below without being
                      checked again; confirm that a non-ASCII default_char
                      is always encodable with CHARSET_LIST.  */
2611             }
               /* Emit the leading code(s) for the charset, then the code
                  point as one or two bytes with the high bit of each byte
                  set.  */
2612           dimension = CHARSET_DIMENSION (charset);
2613           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2614           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2615           EMIT_ONE_BYTE (leading_codes[0]);
2616           if (leading_codes[1])
2617             EMIT_ONE_BYTE (leading_codes[1]);
2618           if (dimension == 1)
2619             EMIT_ONE_BYTE (code | 0x80);
2620           else
2621             {
2622               code |= 0x8080;
2623               EMIT_ONE_BYTE (code >> 8);
2624               EMIT_ONE_BYTE (code & 0xFF);
2625             }
2626         }
2627     }
2628   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2629   coding->produced_char += produced_chars;
2630   coding->produced = dst - coding->destination;
2631   return 0;
2632 }
2633
2634 \f
2635 /*** 7. ISO2022 handlers ***/
2636
2637 /* The following note describes the coding system ISO2022 briefly.
2638    Since the intention of this note is to help understand the
2639    functions in this file, some parts are NOT ACCURATE or are OVERLY
2640    SIMPLIFIED.  For thorough understanding, please refer to the
2641    original document of ISO2022.  This is equivalent to the standard
2642    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2643
2644    ISO2022 provides many mechanisms to encode several character sets
2645    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2646    is encoded using bytes less than 128.  This may make the encoded
2647    text a little bit longer, but the text passes more easily through
2648    several types of gateway, some of which strip off the MSB (Most
2649    Significant Bit).
2650
2651    There are two kinds of character sets: control character sets and
2652    graphic character sets.  The former contain control characters such
2653    as `newline' and `escape' to provide control functions (control
2654    functions are also provided by escape sequences).  The latter
2655    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2656    two control character sets and many graphic character sets.
2657
2658    Graphic character sets are classified into one of the following
2659    four classes, according to the number of bytes (DIMENSION) and
2660    number of characters in one dimension (CHARS) of the set:
2661    - DIMENSION1_CHARS94
2662    - DIMENSION1_CHARS96
2663    - DIMENSION2_CHARS94
2664    - DIMENSION2_CHARS96
2665
2666    In addition, each character set is assigned an identification tag,
2667    unique for each set, called the "final character" (denoted as <F>
2668    hereafter).  The <F> of each character set is decided by ECMA(*)
2669    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2670    (0x30..0x3F are for private use only).
2671
2672    Note (*): ECMA = European Computer Manufacturers Association
2673
2674    Here are examples of graphic character sets [NAME(<F>)]:
2675         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2676         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2677         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2678         o DIMENSION2_CHARS96 -- none for the moment
2679
2680    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2681         C0 [0x00..0x1F] -- control character plane 0
2682         GL [0x20..0x7F] -- graphic character plane 0
2683         C1 [0x80..0x9F] -- control character plane 1
2684         GR [0xA0..0xFF] -- graphic character plane 1
2685
2686    A control character set is directly designated and invoked to C0 or
2687    C1 by an escape sequence.  The most common case is that:
2688    - ISO646's  control character set is designated/invoked to C0, and
2689    - ISO6429's control character set is designated/invoked to C1,
2690    and usually these designations/invocations are omitted in encoded
2691    text.  In a 7-bit environment, only C0 can be used, and a control
2692    character for C1 is encoded by an appropriate escape sequence to
2693    fit into the environment.  All control characters for C1 are
2694    defined to have corresponding escape sequences.
2695
2696    A graphic character set is at first designated to one of four
2697    graphic registers (G0 through G3), then these graphic registers are
2698    invoked to GL or GR.  These designations and invocations can be
2699    done independently.  The most common case is that G0 is invoked to
2700    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2701    these invocations and designations are omitted in encoded text.
2702    In a 7-bit environment, only GL can be used.
2703
2704    When a graphic character set of CHARS94 is invoked to GL, codes
2705    0x20 and 0x7F of the GL area work as control characters SPACE and
2706    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2707    be used.
2708
2709    There are two ways of invocation: locking-shift and single-shift.
2710    With locking-shift, the invocation lasts until the next different
2711    invocation, whereas with single-shift, the invocation affects the
2712    following character only and doesn't affect the locking-shift
2713    state.  Invocations are done by the following control characters or
2714    escape sequences:
2715
2716    ----------------------------------------------------------------------
2717    abbrev  function                  cntrl escape seq   description
2718    ----------------------------------------------------------------------
2719    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2720    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2721    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2722    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2723    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2724    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2725    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2726    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2727    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2728    ----------------------------------------------------------------------
2729    (*) These are not used by any known coding system.
2730
2731    Control characters for these functions are defined by macros
2732    ISO_CODE_XXX in `coding.h'.
2733
2734    Designations are done by the following escape sequences:
2735    ----------------------------------------------------------------------
2736    escape sequence      description
2737    ----------------------------------------------------------------------
2738    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2739    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2740    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2741    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2742    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2743    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2744    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2745    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2746    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2747    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2748    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2749    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2750    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2751    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2752    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2753    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2754    ----------------------------------------------------------------------
2755
2756    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2757    of dimension 1, chars 94, and final character <F>, etc...
2758
2759    Note (*): Although these designations are not allowed in ISO2022,
2760    Emacs accepts them on decoding, and produces them on encoding
2761    CHARS96 character sets in a coding system which is characterized as
2762    7-bit environment, non-locking-shift, and non-single-shift.
2763
2764    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2765    '(' must be omitted.  We refer to this as "short-form" hereafter.
2766
2767    Now you may notice that there are a lot of ways of encoding the
2768    same multilingual text in ISO2022.  Actually, there exist many
2769    coding systems such as Compound Text (used in X11's inter client
2770    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2771    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2772    localized platforms), and all of these are variants of ISO2022.
2773
2774    In addition to the above, Emacs handles two more kinds of escape
2775    sequences: ISO6429's direction specification and Emacs' private
2776    sequence for specifying character composition.
2777
2778    ISO6429's direction specification takes the following form:
2779         o CSI ']'      -- end of the current direction
2780         o CSI '0' ']'  -- end of the current direction
2781         o CSI '1' ']'  -- start of left-to-right text
2782         o CSI '2' ']'  -- start of right-to-left text
2783    The control character CSI (0x9B: control sequence introducer) is
2784    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2785
2786    Character composition specification takes the following form:
2787         o ESC '0' -- start relative composition
2788         o ESC '1' -- end composition
2789         o ESC '2' -- start rule-base composition (*)
2790         o ESC '3' -- start relative composition with alternate chars  (**)
2791         o ESC '4' -- start rule-base composition with alternate chars  (**)
2792   Since these are not standard escape sequences of any ISO standard,
2793   the use of them with these meanings is restricted to Emacs only.
2794
2795   (*) This form is used only in Emacs 20.7 and older versions,
2796   but newer versions can safely decode it.
2797   (**) This form is used only in Emacs 21.1 and newer versions,
2798   and older versions can't decode it.
2799
2800   Here's a list of example usages of these composition escape
2801   sequences (categorized by `enum composition_method').
2802
2803   COMPOSITION_RELATIVE:
2804         ESC 0 CHAR [ CHAR ] ESC 1
2805   COMPOSITION_WITH_RULE:
2806         ESC 2 CHAR [ RULE CHAR ] ESC 1
2807   COMPOSITION_WITH_ALTCHARS:
2808         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2809   COMPOSITION_WITH_RULE_ALTCHARS:
2810         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2811
     /* Table of 256 `enum iso_code_class_type' values.
        NOTE(review): presumably indexed by byte value and filled in
        elsewhere in this file; it is not referenced in the functions
        that immediately follow -- confirm at its users.  */
2812 static enum iso_code_class_type iso_code_class[256];
2813
     /* Nonzero if the charset ID is at most CODING->max_charset_id and
        its byte in CODING->safe_charsets is not 255.  255 is the value
        setup_iso_safe_charsets fills the table with before storing
        register numbers, so it marks a charset with no entry.

        ID is evaluated twice.
        NOTE(review): there is no lower bound check, so a negative ID
        would index before the start of the table; callers are expected
        to pass ID >= 0.  */
2814 #define SAFE_CHARSET_P(coding, id)      \
2815   ((id) <= (coding)->max_charset_id     \
2816    && (coding)->safe_charsets[id] != 255)
2817
     /* Make sure the coding_attr_safe_charsets slot of ATTRS holds a
        string, building it if necessary.

        If the ISO flags in ATTRS include CODING_ISO_FLAG_FULL_SUPPORT and
        the charset list is not already Viso_2022_charset_list, the
        charset list slot is replaced by Viso_2022_charset_list and the
        safe charsets slot is reset to nil first.  If the safe charsets
        slot then already holds a string, nothing more is done.

        The string built has one byte per charset ID from 0 up to the
        largest ID in the charset list.  Every byte starts as 255; the
        byte of each listed charset is then set to a register number
        (presumably the graphic register G0..G3 -- TODO confirm) when one
        is available, see the loop below.  */
2818 static void
2819 setup_iso_safe_charsets (Lisp_Object attrs)
2820 {
2821   Lisp_Object charset_list, safe_charsets;
2822   Lisp_Object request;
2823   Lisp_Object reg_usage;
2824   Lisp_Object tail;
2825   EMACS_INT reg94, reg96;
2826   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2827   int max_charset_id;
2828
2829   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2830   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2831       && ! EQ (charset_list, Viso_2022_charset_list))
2832     {
2833       charset_list = Viso_2022_charset_list;
2834       ASET (attrs, coding_attr_charset_list, charset_list);
           /* Force the table to be rebuilt for the new list.  */
2835       ASET (attrs, coding_attr_safe_charsets, Qnil);
2836     }
2837
       /* A string here means the table was already built.  */
2838   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2839     return;
2840
       /* The table is indexed by charset ID, so size it by the largest
          ID in the list.  */
2841   max_charset_id = 0;
2842   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2843     {
2844       int id = XINT (XCAR (tail));
2845       if (max_charset_id < id)
2846         max_charset_id = id;
2847     }
2848
2849   safe_charsets = make_uninit_string (max_charset_id + 1);
2850   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2851   request = AREF (attrs, coding_attr_iso_request);
2852   reg_usage = AREF (attrs, coding_attr_iso_usage);
       /* The car and cdr of REG_USAGE are the fallback registers for
          charsets without and with `iso_chars_96' respectively.  */
2853   reg94 = XINT (XCAR (reg_usage));
2854   reg96 = XINT (XCDR (reg_usage));
2855
2856   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2857     {
2858       Lisp_Object id;
2859       Lisp_Object reg;
2860       struct charset *charset;
2861
2862       id = XCAR (tail);
2863       charset = CHARSET_FROM_ID (XINT (id));
           /* A register associated with ID in REQUEST takes precedence;
              otherwise use the fallback for the charset's kind, but only
              when that fallback is below 4.  With neither, the byte
              stays 255.  */
2864       reg = Fcdr (Fassq (id, request));
2865       if (! NILP (reg))
2866         SSET (safe_charsets, XINT (id), XINT (reg));
2867       else if (charset->iso_chars_96)
2868         {
2869           if (reg96 < 4)
2870             SSET (safe_charsets, XINT (id), reg96);
2871         }
2872       else
2873         {
2874           if (reg94 < 4)
2875             SSET (safe_charsets, XINT (id), reg94);
2876         }
2877     }
2878   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2879 }
2880
2881
2882 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2883    Return true if a text is encoded in one of ISO-2022 based coding
2884    systems.
        The bytes of CODING->source are scanned, starting after the
        first CODING->head_ascii bytes, while two masks of ISO
        categories are accumulated: FOUND (positive evidence) and
        REJECTED (categories ruled out).  CATEGORY_MASK_ISO is always
        added to DETECT_INFO->checked.

        If every ISO category gets rejected, the loop stops,
        CATEGORY_MASK_ISO is added to DETECT_INFO->rejected and 0 is
        returned.  Otherwise control reaches the `no_more_source'
        label (presumably via ONE_MORE_BYTE when the source is
        exhausted -- TODO confirm), where REJECTED and the part of
        FOUND that was not rejected are merged into DETECT_INFO and 1
        is returned.  */
2885
2886 static bool
2887 detect_coding_iso_2022 (struct coding_system *coding,
2888                         struct coding_detection_info *detect_info)
2889 {
2890   const unsigned char *src = coding->source, *src_base = src;
2891   const unsigned char *src_end = coding->source + coding->src_bytes;
2892   bool multibytep = coding->src_multibyte;
2893   bool single_shifting = 0;
2894   int id;
2895   int c, c1;
2896   ptrdiff_t consumed_chars = 0;
2897   int i;
2898   int rejected = 0;
2899   int found = 0;
       /* -1 while outside a composition sequence; otherwise a count
          maintained since the last composition start sequence.  */
2900   int composition_count = -1;
2901
2902   detect_info->checked |= CATEGORY_MASK_ISO;
2903
       /* Cache each ISO category's safe-charsets table in its coding
          system so that SAFE_CHARSET_P can be used below.  */
2904   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2905     {
2906       struct coding_system *this = &(coding_categories[i]);
2907       Lisp_Object attrs, val;
2908
2909       if (this->id < 0)
2910         continue;
2911       attrs = CODING_ID_ATTRS (this->id);
2912       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2913           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
2914         setup_iso_safe_charsets (attrs);
2915       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2916       this->max_charset_id = SCHARS (val) - 1;
2917       this->safe_charsets = SDATA (val);
2918     }
2919
2920   /* A coding system of this category is always ASCII compatible.  */
2921   src += coding->head_ascii;
2922
2923   while (rejected != CATEGORY_MASK_ISO)
2924     {
2925       src_base = src;
2926       ONE_MORE_BYTE (c);
2927       switch (c)
2928         {
2929         case ISO_CODE_ESC:
2930           if (inhibit_iso_escape_detection)
2931             break;
2932           single_shifting = 0;
2933           ONE_MORE_BYTE (c);
2934           if (c == 'N' || c == 'O')
2935             {
2936               /* ESC <Fe> for SS2 or SS3.  */
2937               single_shifting = 1;
2938               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2939             }
2940           else if (c == '1')
2941             {
2942               /* End of composition.  */
2943               if (composition_count < 0
2944                   || composition_count > MAX_COMPOSITION_COMPONENTS)
2945                 /* Invalid */
2946                 break;
2947               composition_count = -1;
2948               found |= CATEGORY_MASK_ISO;
2949             }
2950           else if (c >= '0' && c <= '4')
2951             {
2952               /* ESC <Fp> for start/end composition.  */
2953               composition_count = 0;
2954             }
2955           else
2956             {
2957               if (c >= '(' && c <= '/')
2958                 {
2959                   /* Designation sequence for a charset of dimension 1.  */
2960                   ONE_MORE_BYTE (c1);
2961                   if (c1 < ' ' || c1 >= 0x80
2962                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2963                     /* Invalid designation sequence.  Just ignore.  */
2964                     break;
2965                 }
2966               else if (c == '$')
2967                 {
2968                   /* Designation sequence for a charset of dimension 2.  */
2969                   ONE_MORE_BYTE (c);
2970                   if (c >= '@' && c <= 'B')
2971                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other designation
                            branches, ID is not checked for < 0 here
                            before it reaches SAFE_CHARSET_P below;
                            confirm these three table entries are always
                            registered.  */
2972                     id = iso_charset_table[1][0][c];
2973                   else if (c >= '(' && c <= '/')
2974                     {
2975                       ONE_MORE_BYTE (c1);
2976                       if (c1 < ' ' || c1 >= 0x80
2977                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2978                         /* Invalid designation sequence.  Just ignore.  */
2979                         break;
2980                     }
2981                   else
2982                     /* Invalid designation sequence.  Just ignore it.  */
2983                     break;
2984                 }
2985               else
2986                 {
2987                   /* Invalid escape sequence.  Just ignore it.  */
2988                   break;
2989                 }
2990
2991               /* We found a valid designation sequence for CHARSET.  */
                   /* Each of the four categories tested below is marked
                      found if the charset is safe for its coding system
                      and rejected otherwise.  */
2992               rejected |= CATEGORY_MASK_ISO_8BIT;
2993               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2994                                   id))
2995                 found |= CATEGORY_MASK_ISO_7;
2996               else
2997                 rejected |= CATEGORY_MASK_ISO_7;
2998               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2999                                   id))
3000                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3001               else
3002                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3003               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3004                                   id))
3005                 found |= CATEGORY_MASK_ISO_7_ELSE;
3006               else
3007                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3008               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3009                                   id))
3010                 found |= CATEGORY_MASK_ISO_8_ELSE;
3011               else
3012                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3013             }
3014           break;
3015
3016         case ISO_CODE_SO:
3017         case ISO_CODE_SI:
3018           /* Locking shift out/in.  */
3019           if (inhibit_iso_escape_detection)
3020             break;
3021           single_shifting = 0;
3022           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3023           break;
3024
3025         case ISO_CODE_CSI:
3026           /* Control sequence introducer.  */
3027           single_shifting = 0;
3028           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3029           found |= CATEGORY_MASK_ISO_8_ELSE;
3030           goto check_extra_latin;
3031
3032         case ISO_CODE_SS2:
3033         case ISO_CODE_SS3:
3034           /* Single shift.   */
3035           if (inhibit_iso_escape_detection)
3036             break;
3037           single_shifting = 0;
3038           rejected |= CATEGORY_MASK_ISO_7BIT;
               /* The byte counts as a single shift only if at least one
                  of the iso-8-1 / iso-8-2 coding systems has the
                  single-shift flag; otherwise it is tested as an extra
                  Latin code.  */
3039           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3040               & CODING_ISO_FLAG_SINGLE_SHIFT)
3041             {
3042               found |= CATEGORY_MASK_ISO_8_1;
3043               single_shifting = 1;
3044             }
3045           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3046               & CODING_ISO_FLAG_SINGLE_SHIFT)
3047             {
3048               found |= CATEGORY_MASK_ISO_8_2;
3049               single_shifting = 1;
3050             }
3051           if (single_shifting)
3052             break;
3053           goto check_extra_latin;
3054
3055         default:
3056           if (c < 0)
3057             continue;
3058           if (c < 0x80)
3059             {
3060               if (composition_count >= 0)
3061                 composition_count++;
3062               single_shifting = 0;
3063               break;
3064             }
3065           if (c >= 0xA0)
3066             {
3067               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3068               found |= CATEGORY_MASK_ISO_8_1;
3069               /* Check the length of succeeding codes of the range
3070                  0xA0..0xFF.  If the byte length is even, we include
3071                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3072                  only when we are not single shifting.  */
3073               if (! single_shifting
3074                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3075                 {
3076                   int len = 1;
3077                   while (src < src_end)
3078                     {
3079                       src_base = src;
3080                       ONE_MORE_BYTE (c);
3081                       if (c < 0xA0)
3082                         {
                               /* Put back the byte that ended the run.  */
3083                           src = src_base;
3084                           break;
3085                         }
3086                       len++;
3087                     }
3088
                       /* An odd-length run rejects iso-8-2 only when the
                          run ended before the end of the source.  */
3089                   if (len & 1 && src < src_end)
3090                     {
3091                       rejected |= CATEGORY_MASK_ISO_8_2;
3092                       if (composition_count >= 0)
3093                         composition_count += len;
3094                     }
3095                   else
3096                     {
3097                       found |= CATEGORY_MASK_ISO_8_2;
3098                       if (composition_count >= 0)
3099                         composition_count += len / 2;
3100                     }
3101                 }
3102               break;
3103             }
               /* Reached for bytes 0x80..0x9F not handled above, and by
                  the `goto's in the CSI and SS2/SS3 cases.  Unless
                  Vlatin_extra_code_table is a vector with a non-nil
                  entry for C, all ISO categories are rejected.  */
3104         check_extra_latin:
3105           if (! VECTORP (Vlatin_extra_code_table)
3106               || NILP (AREF (Vlatin_extra_code_table, c)))
3107             {
3108               rejected = CATEGORY_MASK_ISO;
3109               break;
3110             }
3111           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3112               & CODING_ISO_FLAG_LATIN_EXTRA)
3113             found |= CATEGORY_MASK_ISO_8_1;
3114           else
3115             rejected |= CATEGORY_MASK_ISO_8_1;
3116           rejected |= CATEGORY_MASK_ISO_8_2;
3117           break;
3118         }
3119     }
       /* Every ISO category was rejected.  */
3120   detect_info->rejected |= CATEGORY_MASK_ISO;
3121   return 0;
3122
3123  no_more_source:
3124   detect_info->rejected |= rejected;
3125   detect_info->found |= (found & ~rejected);
3126   return 1;
3127 }
3128
3129
3130 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3131    escape sequence should be kept.
        The charset is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is below '0' or at least 128, if no charset
        is found, or if the charset is not SAFE_CHARSET_P for the
        variable `coding', the designation of register REG is set to
        -2, CHARS_96 is set to -1, and the rest of the macro is
        skipped.

        Otherwise the charset ID is stored as the designation of REG,
        after replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        The macro refers to a variable named `coding' in the invoking
        scope, declares its own locals `id' and `prev', and assigns to
        CHARS_96, which must therefore be a modifiable lvalue.  REG,
        DIM, CHARS_96 and FINAL may be evaluated more than once.  */
3132 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3133   do {                                                                  \
3134     int id, prev;                                                       \
3135                                                                         \
3136     if (final < '0' || final >= 128                                     \
3137         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3138         || !SAFE_CHARSET_P (coding, id))                                \
3139       {                                                                 \
3140         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3141         chars_96 = -1;                                                  \
3142         break;                                                          \
3143       }                                                                 \
3144     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3145     if (id == charset_jisx0201_roman)                                   \
3146       {                                                                 \
3147         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3148           id = charset_ascii;                                           \
3149       }                                                                 \
3150     else if (id == charset_jisx0208_1978)                               \
3151       {                                                                 \
3152         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3153           id = charset_jisx0208;                                        \
3154       }                                                                 \
3155     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3156     /* If there was an invalid designation to REG previously, and this  \
3157        designation is ASCII to REG, we should keep this designation     \
3158        sequence.  */                                                    \
3159     if (prev == -2 && id == charset_ascii)                              \
3160       chars_96 = -1;                                                    \
3161   } while (0)
3162
3163
3164 /* Handle these composition sequences (ALT: alternate char):
3165
3166    (1) relative composition: ESC 0 CHAR ... ESC 1
3167    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3168    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3169    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3170
3171    When the start sequence (ESC 0/2/3/4) is found, this annotation
3172    header is produced.
3173
3174         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3175
3176    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3177    produced until the end sequence (ESC 1) is found:
3178
3179    (1) CHAR ... CHAR
3180    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3181    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3182    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3183
3184    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3185    annotation header are updated as below:
3186
3187    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3188    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3189    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3190    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3191
3192    If an error is found while composing, the annotation header is
3193    changed to:
3194
3195         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3196
3197    and the sequence [ -2 DECODED-RULE ] is changed to the original
3198    byte sequence as below:
3199         o the original byte sequence is B: [ B -1 ]
3200         o the original byte sequence is B1 B2: [ B1 B2 ]
3201    and the sequence [ -1 -1 ] is changed to the original byte
3202    sequence:
3203         [ ESC '0' ]
3204 */
3205
/* Read a composition rule whose first byte is C1 (a variable of the
   expansion site) and store its encoded form in RULE, an lvalue.

   If C1 - 32 is below 81 the rule is in the old one-byte format
   (before ver.21): the value is split into quotient and remainder by
   9, a part equal to 4 is replaced by 10, and the pair is packed with
   COMPOSITION_ENCODE_RULE.  Otherwise it is in the new format (after
   ver.21): one more byte is fetched with ONE_MORE_BYTE, the pair
   (C1 - 32 - 81, byte - 32) is checked with
   COMPOSITION_ENCODE_RULE_VALID and packed, and 0x100 is added to
   tell the result apart from the old format.

   If the rule is invalid, goto invalid_code.  */

#define DECODE_COMPOSITION_RULE(rule)                                   \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule < 0)                                                       \
      goto invalid_code;                                                \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int rule_byte2;                                                 \
                                                                        \
        ONE_MORE_BYTE (rule_byte2);                                     \
        if (COMPOSITION_ENCODE_RULE_VALID (rule - 81, rule_byte2 - 32)) \
          {                                                             \
            rule = COMPOSITION_ENCODE_RULE (rule - 81, rule_byte2 - 32); \
            /* Distinguish it from the old format.  */                  \
            rule += 0x100;                                              \
          }                                                             \
        else                                                            \
          goto invalid_code;                                            \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int old_gref = (rule) / 9;                                      \
        int old_nref = (rule) % 9;                                      \
                                                                        \
        if (old_gref == 4)                                              \
          old_gref = 10;                                                \
        if (old_nref == 4)                                              \
          old_nref = 10;                                                \
        rule = COMPOSITION_ENCODE_RULE (old_gref, old_nref);            \
      }                                                                 \
  } while (0)
3234
/* Turn the annotation pair [ -2 DECODED-RULE ] stored at charbuf[idx]
   and charbuf[idx + 1] back into the bytes the rule was decoded from.
   RULE is the DECODED-RULE value.

   A value below 0x100 is in the old one-byte format: charbuf[idx]
   receives the byte, charbuf[idx + 1] receives -1, and new_chars
   grows by one.  Otherwise the value is in the new two-byte format:
   both elements receive a byte and new_chars grows by two.

   RULE is evaluated exactly once, before charbuf is written, so it
   may be any int expression (including one that reads
   charbuf[idx + 1]).  The macro uses the variables `charbuf', `idx'
   and `new_chars' of the expansion site.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    const int ecr_rule = (rule);                                \
    int gref = (ecr_rule % 0x100) / 12;                         \
    int nref = (ecr_rule % 0x100) % 12;                         \
                                                                \
    if (ecr_rule < 0x100)       /* old format */                \
      {                                                         \
        /* Undo the 4 -> 10 mapping applied when decoding.  */  \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3254
3255 /* Finish the current composition as invalid.  */
3256
3257 static int
3258 finish_composition (int *charbuf, struct composition_status *cmp_status)
3259 {
3260   int idx = - cmp_status->length;
3261   int new_chars;
3262
3263   /* Recover the original ESC sequence */
3264   charbuf[idx++] = ISO_CODE_ESC;
3265   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3266                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3267                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3268                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3269                     : '4');
3270   charbuf[idx++] = -2;
3271   charbuf[idx++] = 0;
3272   charbuf[idx++] = -1;
3273   new_chars = cmp_status->nchars;
3274   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3275     for (; idx < 0; idx++)
3276       {
3277         int elt = charbuf[idx];
3278
3279         if (elt == -2)
3280           {
3281             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3282             idx++;
3283           }
3284         else if (elt == -1)
3285           {
3286             charbuf[idx++] = ISO_CODE_ESC;
3287             charbuf[idx] = '0';
3288             new_chars += 2;
3289           }
3290       }
3291   cmp_status->state = COMPOSING_NO;
3292   return new_chars;
3293 }
3294
/* If characters are under composition (cmp_status->state is not
   COMPOSING_NO), finish the composition with finish_composition and
   add its return value to char_offset.  The macro uses the variables
   `cmp_status', `charbuf' and `char_offset' of the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3301
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the character that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   If C1 is '0' while an altchar composition is in state
   COMPOSING_COMPONENT_CHAR, or an alt&rule composition is in state
   COMPOSING_COMPONENT_RULE, the ESC 0 only ends the component list:
   [ -1 -1 ] is appended to charbuf, LENGTH grows by 2 and the state
   becomes COMPOSING_CHAR.

   Otherwise any composition in progress is finished as invalid and
   this annotation sequence is produced now:

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   The macro uses the variables `cmp_status', `charbuf', `char_offset'
   and `coding' of the expansion site.  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if ((c1) == '0'                                                        \
        && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
             && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
            || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
                && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
      {                                                                    \
        /* End of the ALT components; CHARs follow.  */                    \
        *charbuf++ = -1;                                                   \
        *charbuf++ = -1;                                                   \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        cmp_status->method = ((c1) == '0' ? COMPOSITION_RELATIVE           \
                              : (c1) == '2' ? COMPOSITION_WITH_RULE        \
                              : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS    \
                              : COMPOSITION_WITH_RULE_ALTCHARS);           \
        /* ESC 0 and ESC 2 start with CHARs, ESC 3 and ESC 4 with ALTs.  */ \
        cmp_status->state                                                  \
          = ((c1) <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);     \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->nchars = cmp_status->ncomps = 0;                       \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3342
3343
/* Handle composition end sequence ESC 1.

   The sequence is rejected if no CHAR has been stored yet, or if the
   state does not fit the method: a rulebase composition
   (COMPOSITION_WITH_RULE) must not end in state COMPOSING_CHAR, and
   every other method must end in state COMPOSING_CHAR.  On rejection
   the composition is finished as invalid and control goes to
   invalid_code.

   Otherwise the annotation header is completed: for the altchar
   methods the stored -LENGTH is decreased by NCOMPS + 2 (altchar) or
   NCOMPS * 3 (alt&rule), NCHARS is stored at header index 2,
   char_offset advances by NCHARS and the state becomes COMPOSING_NO.

   The macro uses the variables `cmp_status', `charbuf' and
   `char_offset' of the expansion site.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int cmp_head;                                                       \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || ((cmp_status->state != COMPOSING_CHAR)                       \
            != (cmp_status->method == COMPOSITION_WITH_RULE)))          \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    /* Index of the header's -LENGTH element relative to charbuf.  */   \
    cmp_head = - cmp_status->length;                                    \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      charbuf[cmp_head] -= cmp_status->ncomps + 2;                      \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      charbuf[cmp_head] -= cmp_status->ncomps * 3;                      \
    charbuf[cmp_head + 2] = cmp_status->nchars;                         \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3363
/* Append the pair [ -2 RULE ] to charbuf for the composition rule
   RULE, add 2 to cmp_status->length and step cmp_status->state back
   by one.  The macro uses the variables `charbuf' and `cmp_status'
   of the expansion site.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state -= 1;             \
  } while (0)
3373
/* Append C, a composed char or a component char, to charbuf and
   update cmp_status: LENGTH grows by one, and NCHARS is incremented
   in state COMPOSING_CHAR while NCOMPS is incremented in any other
   state.  The state is then advanced by one when a rule is expected
   next, that is for a rulebase composition, or for an alt&rule
   composition in state COMPOSING_COMPONENT_CHAR.  The macro uses the
   variables `charbuf' and `cmp_status' of the expansion site.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    charbuf[0] = (c);                                                   \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps++;                                             \
    else                                                                \
      cmp_status->nchars++;                                             \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3390
3391
3392 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3393
3394 static void
3395 decode_coding_iso_2022 (struct coding_system *coding)
3396 {
3397   const unsigned char *src = coding->source + coding->consumed;
3398   const unsigned char *src_end = coding->source + coding->src_bytes;
3399   const unsigned char *src_base;
3400   int *charbuf = coding->charbuf + coding->charbuf_used;
3401   /* We may produce two annotations (charset and composition) in one
3402      loop and one more charset annotation at the end.  */
3403   int *charbuf_end
3404     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3405   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3406   bool multibytep = coding->src_multibyte;
3407   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3408   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3409   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3410   int charset_id_2, charset_id_3;
3411   struct charset *charset;
3412   int c;
3413   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3414   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3415   ptrdiff_t char_offset = coding->produced_char;
3416   ptrdiff_t last_offset = char_offset;
3417   int last_id = charset_ascii;
3418   bool eol_dos
3419     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3420   int byte_after_cr = -1;
3421   int i;
3422
3423   setup_iso_safe_charsets (attrs);
3424   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3425
3426   if (cmp_status->state != COMPOSING_NO)
3427     {
3428       if (charbuf_end - charbuf < cmp_status->length)
3429         emacs_abort ();
3430       for (i = 0; i < cmp_status->length; i++)
3431         *charbuf++ = cmp_status->carryover[i];
3432       coding->annotated = 1;
3433     }
3434
3435   while (1)
3436     {
3437       int c1, c2, c3;
3438
3439       src_base = src;
3440       consumed_chars_base = consumed_chars;
3441
3442       if (charbuf >= charbuf_end)
3443         {
3444           if (byte_after_cr >= 0)
3445             src_base--;
3446           break;
3447         }
3448
3449       if (byte_after_cr >= 0)
3450         c1 = byte_after_cr, byte_after_cr = -1;
3451       else
3452         ONE_MORE_BYTE (c1);
3453       if (c1 < 0)
3454         goto invalid_code;
3455
3456       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3457         {
3458           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3459           char_offset++;
3460           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3461           continue;
3462         }
3463
3464       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3465         {
3466           if (c1 == ISO_CODE_ESC)
3467             {
3468               if (src + 1 >= src_end)
3469                 goto no_more_source;
3470               *charbuf++ = ISO_CODE_ESC;
3471               char_offset++;
3472               if (src[0] == '%' && src[1] == '@')
3473                 {
3474                   src += 2;
3475                   consumed_chars += 2;
3476                   char_offset += 2;
3477                   /* We are sure charbuf can contain two more chars. */
3478                   *charbuf++ = '%';
3479                   *charbuf++ = '@';
3480                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3481                 }
3482             }
3483           else
3484             {
3485               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3486               char_offset++;
3487             }
3488           continue;
3489         }
3490
3491       if ((cmp_status->state == COMPOSING_RULE
3492            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3493           && c1 != ISO_CODE_ESC)
3494         {
3495           int rule;
3496
3497           DECODE_COMPOSITION_RULE (rule);
3498           STORE_COMPOSITION_RULE (rule);
3499           continue;
3500         }
3501
3502       /* We produce at most one character.  */
3503       switch (iso_code_class [c1])
3504         {
3505         case ISO_0x20_or_0x7F:
3506           if (charset_id_0 < 0
3507               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3508             /* This is SPACE or DEL.  */
3509             charset = CHARSET_FROM_ID (charset_ascii);
3510           else
3511             charset = CHARSET_FROM_ID (charset_id_0);
3512           break;
3513
3514         case ISO_graphic_plane_0:
3515           if (charset_id_0 < 0)
3516             charset = CHARSET_FROM_ID (charset_ascii);
3517           else
3518             charset = CHARSET_FROM_ID (charset_id_0);
3519           break;
3520
3521         case ISO_0xA0_or_0xFF:
3522           if (charset_id_1 < 0
3523               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3524               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3525             goto invalid_code;
3526           /* This is a graphic character, we fall down ... */
3527
3528         case ISO_graphic_plane_1:
3529           if (charset_id_1 < 0)
3530             goto invalid_code;
3531           charset = CHARSET_FROM_ID (charset_id_1);
3532           break;
3533
3534         case ISO_control_0:
3535           if (eol_dos && c1 == '\r')
3536             ONE_MORE_BYTE (byte_after_cr);
3537           MAYBE_FINISH_COMPOSITION ();
3538           charset = CHARSET_FROM_ID (charset_ascii);
3539           break;
3540
3541         case ISO_control_1:
3542           goto invalid_code;
3543
3544         case ISO_shift_out:
3545           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3546               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3547             goto invalid_code;
3548           CODING_ISO_INVOCATION (coding, 0) = 1;
3549           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3550           continue;
3551
3552         case ISO_shift_in:
3553           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3554             goto invalid_code;
3555           CODING_ISO_INVOCATION (coding, 0) = 0;
3556           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3557           continue;
3558
3559         case ISO_single_shift_2_7:
3560           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3561             goto invalid_code;
3562         case ISO_single_shift_2:
3563           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3564             goto invalid_code;
3565           /* SS2 is handled as an escape sequence of ESC 'N' */
3566           c1 = 'N';
3567           goto label_escape_sequence;
3568
3569         case ISO_single_shift_3:
3570           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3571             goto invalid_code;
3572           /* SS3 is handled as an escape sequence of ESC 'O' */
3573           c1 = 'O';
3574           goto label_escape_sequence;
3575
3576         case ISO_control_sequence_introducer:
3577           /* CSI is handled as an escape sequence of ESC '[' ...  */
3578           c1 = '[';
3579           goto label_escape_sequence;
3580
3581         case ISO_escape:
3582           ONE_MORE_BYTE (c1);
3583         label_escape_sequence:
3584           /* Escape sequences handled here are invocation,
3585              designation, direction specification, and character
3586              composition specification.  */
3587           switch (c1)
3588             {
3589             case '&':           /* revision of following character set */
3590               ONE_MORE_BYTE (c1);
3591               if (!(c1 >= '@' && c1 <= '~'))
3592                 goto invalid_code;
3593               ONE_MORE_BYTE (c1);
3594               if (c1 != ISO_CODE_ESC)
3595                 goto invalid_code;
3596               ONE_MORE_BYTE (c1);
3597               goto label_escape_sequence;
3598
3599             case '$':           /* designation of 2-byte character set */
3600               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3601                 goto invalid_code;
3602               {
3603                 int reg, chars96;
3604
3605                 ONE_MORE_BYTE (c1);
3606                 if (c1 >= '@' && c1 <= 'B')
3607                   {     /* designation of JISX0208.1978, GB2312.1980,
3608                            or JISX0208.1980 */
3609                     reg = 0, chars96 = 0;
3610                   }
3611                 else if (c1 >= 0x28 && c1 <= 0x2B)
3612                   { /* designation of DIMENSION2_CHARS94 character set */
3613                     reg = c1 - 0x28, chars96 = 0;
3614                     ONE_MORE_BYTE (c1);
3615                   }
3616                 else if (c1 >= 0x2C && c1 <= 0x2F)
3617                   { /* designation of DIMENSION2_CHARS96 character set */
3618                     reg = c1 - 0x2C, chars96 = 1;
3619                     ONE_MORE_BYTE (c1);
3620                   }
3621                 else
3622                   goto invalid_code;
3623                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3624                 /* We must update these variables now.  */
3625                 if (reg == 0)
3626                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3627                 else if (reg == 1)
3628                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3629                 if (chars96 < 0)
3630                   goto invalid_code;
3631               }
3632               continue;
3633
3634             case 'n':           /* invocation of locking-shift-2 */
3635               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3636                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3637                 goto invalid_code;
3638               CODING_ISO_INVOCATION (coding, 0) = 2;
3639               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3640               continue;
3641
3642             case 'o':           /* invocation of locking-shift-3 */
3643               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3644                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3645                 goto invalid_code;
3646               CODING_ISO_INVOCATION (coding, 0) = 3;
3647               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3648               continue;
3649
3650             case 'N':           /* invocation of single-shift-2 */
3651               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3652                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3653                 goto invalid_code;
3654               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3655               if (charset_id_2 < 0)
3656                 charset = CHARSET_FROM_ID (charset_ascii);
3657               else
3658                 charset = CHARSET_FROM_ID (charset_id_2);
3659               ONE_MORE_BYTE (c1);
3660               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3661                 goto invalid_code;
3662               break;
3663
3664             case 'O':           /* invocation of single-shift-3 */
3665               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3666                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3667                 goto invalid_code;
3668               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3669               if (charset_id_3 < 0)
3670                 charset = CHARSET_FROM_ID (charset_ascii);
3671               else
3672                 charset = CHARSET_FROM_ID (charset_id_3);
3673               ONE_MORE_BYTE (c1);
3674               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3675                 goto invalid_code;
3676               break;
3677
3678             case '0': case '2': case '3': case '4': /* start composition */
3679               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3680                 goto invalid_code;
3681               if (last_id != charset_ascii)
3682                 {
3683                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3684                   last_id = charset_ascii;
3685                   last_offset = char_offset;
3686                 }
3687               DECODE_COMPOSITION_START (c1);
3688               continue;
3689
3690             case '1':           /* end composition */
3691               if (cmp_status->state == COMPOSING_NO)
3692                 goto invalid_code;
3693               DECODE_COMPOSITION_END ();
3694               continue;
3695
3696             case '[':           /* specification of direction */
3697               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3698                 goto invalid_code;
3699               /* For the moment, nested direction is not supported.
3700                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3701                  left-to-right, and nonzero means right-to-left.  */
3702               ONE_MORE_BYTE (c1);
3703               switch (c1)
3704                 {
3705                 case ']':       /* end of the current direction */
3706                   coding->mode &= ~CODING_MODE_DIRECTION;
3707
3708                 case '0':       /* end of the current direction */
3709                 case '1':       /* start of left-to-right direction */
3710                   ONE_MORE_BYTE (c1);
3711                   if (c1 == ']')
3712                     coding->mode &= ~CODING_MODE_DIRECTION;
3713                   else
3714                     goto invalid_code;
3715                   break;
3716
3717                 case '2':       /* start of right-to-left direction */
3718                   ONE_MORE_BYTE (c1);
3719                   if (c1 == ']')
3720                     coding->mode |= CODING_MODE_DIRECTION;
3721                   else
3722                     goto invalid_code;
3723                   break;
3724
3725                 default:
3726                   goto invalid_code;
3727                 }
3728               continue;
3729
3730             case '%':
3731               ONE_MORE_BYTE (c1);
3732               if (c1 == '/')
3733                 {
3734                   /* CTEXT extended segment:
3735                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3736                      We keep these bytes as is for the moment.
3737                      They may be decoded by post-read-conversion.  */
3738                   int dim, M, L;
3739                   int size;
3740
3741                   ONE_MORE_BYTE (dim);
3742                   if (dim < '0' || dim > '4')
3743                     goto invalid_code;
3744                   ONE_MORE_BYTE (M);
3745                   if (M < 128)
3746                     goto invalid_code;
3747                   ONE_MORE_BYTE (L);
3748                   if (L < 128)
3749                     goto invalid_code;
3750                   size = ((M - 128) * 128) + (L - 128);
3751                   if (charbuf + 6 > charbuf_end)
3752                     goto break_loop;
3753                   *charbuf++ = ISO_CODE_ESC;
3754                   *charbuf++ = '%';
3755                   *charbuf++ = '/';
3756                   *charbuf++ = dim;
3757                   *charbuf++ = BYTE8_TO_CHAR (M);
3758                   *charbuf++ = BYTE8_TO_CHAR (L);
3759                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3760                 }
3761               else if (c1 == 'G')
3762                 {
3763                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3764                      ESC % G --UTF-8-BYTES-- ESC % @
3765                      We keep these bytes as is for the moment.
3766                      They may be decoded by post-read-conversion.  */
3767                   if (charbuf + 3 > charbuf_end)
3768                     goto break_loop;
3769                   *charbuf++ = ISO_CODE_ESC;
3770                   *charbuf++ = '%';
3771                   *charbuf++ = 'G';
3772                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3773                 }
3774               else
3775                 goto invalid_code;
3776               continue;
3777               break;
3778
3779             default:
3780               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3781                 goto invalid_code;
3782               {
3783                 int reg, chars96;
3784
3785                 if (c1 >= 0x28 && c1 <= 0x2B)
3786                   { /* designation of DIMENSION1_CHARS94 character set */
3787                     reg = c1 - 0x28, chars96 = 0;
3788                     ONE_MORE_BYTE (c1);
3789                   }
3790                 else if (c1 >= 0x2C && c1 <= 0x2F)
3791                   { /* designation of DIMENSION1_CHARS96 character set */
3792                     reg = c1 - 0x2C, chars96 = 1;
3793                     ONE_MORE_BYTE (c1);
3794                   }
3795                 else
3796                   goto invalid_code;
3797                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3798                 /* We must update these variables now.  */
3799                 if (reg == 0)
3800                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3801                 else if (reg == 1)
3802                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3803                 if (chars96 < 0)
3804                   goto invalid_code;
3805               }
3806               continue;
3807             }
3808           break;
3809
3810         default:
3811           emacs_abort ();
3812         }
3813
3814       if (cmp_status->state == COMPOSING_NO
3815           && charset->id != charset_ascii
3816           && last_id != charset->id)
3817         {
3818           if (last_id != charset_ascii)
3819             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3820           last_id = charset->id;
3821           last_offset = char_offset;
3822         }
3823
3824       /* Now we know CHARSET and 1st position code C1 of a character.
3825          Produce a decoded character while getting 2nd and 3rd
3826          position codes C2, C3 if necessary.  */
3827       if (CHARSET_DIMENSION (charset) > 1)
3828         {
3829           ONE_MORE_BYTE (c2);
3830           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3831               || ((c1 & 0x80) != (c2 & 0x80)))
3832             /* C2 is not in a valid range.  */
3833             goto invalid_code;
3834           if (CHARSET_DIMENSION (charset) == 2)
3835             c1 = (c1 << 8) | c2;
3836           else
3837             {
3838               ONE_MORE_BYTE (c3);
3839               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3840                   || ((c1 & 0x80) != (c3 & 0x80)))
3841                 /* C3 is not in a valid range.  */
3842                 goto invalid_code;
3843               c1 = (c1 << 16) | (c2 << 8) | c2;
3844             }
3845         }
3846       c1 &= 0x7F7F7F;
3847       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3848       if (c < 0)
3849         {
3850           MAYBE_FINISH_COMPOSITION ();
3851           for (; src_base < src; src_base++, char_offset++)
3852             {
3853               if (ASCII_BYTE_P (*src_base))
3854                 *charbuf++ = *src_base;
3855               else
3856                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3857             }
3858         }
3859       else if (cmp_status->state == COMPOSING_NO)
3860         {
3861           *charbuf++ = c;
3862           char_offset++;
3863         }
3864       else if ((cmp_status->state == COMPOSING_CHAR
3865                 ? cmp_status->nchars
3866                 : cmp_status->ncomps)
3867                >= MAX_COMPOSITION_COMPONENTS)
3868         {
3869           /* Too long composition.  */
3870           MAYBE_FINISH_COMPOSITION ();
3871           *charbuf++ = c;
3872           char_offset++;
3873         }
3874       else
3875         STORE_COMPOSITION_CHAR (c);
3876       continue;
3877
3878     invalid_code:
3879       MAYBE_FINISH_COMPOSITION ();
3880       src = src_base;
3881       consumed_chars = consumed_chars_base;
3882       ONE_MORE_BYTE (c);
3883       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3884       char_offset++;
3885       coding->errors++;
3886       continue;
3887
3888     break_loop:
3889       break;
3890     }
3891
3892  no_more_source:
3893   if (cmp_status->state != COMPOSING_NO)
3894     {
3895       if (coding->mode & CODING_MODE_LAST_BLOCK)
3896         MAYBE_FINISH_COMPOSITION ();
3897       else
3898         {
3899           charbuf -= cmp_status->length;
3900           for (i = 0; i < cmp_status->length; i++)
3901             cmp_status->carryover[i] = charbuf[i];
3902         }
3903     }
3904   else if (last_id != charset_ascii)
3905     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3906   coding->consumed_char += consumed_chars_base;
3907   coding->consumed = src_base - coding->source;
3908   coding->charbuf_used = charbuf - coding->charbuf;
3909 }
3910
3911
3912 /* ISO2022 encoding stuff.  */
3913
3914 /*
3915    Saying just "ISO2022" is not enough when encoding; we have to
3916    specify more details.  In Emacs, each coding system of the ISO2022
3917    variant has the following specifications:
3918         1. Initial designation to G0 thru G3.
3919         2. Allows short-form designation?
3920         3. ASCII should be designated to G0 before control characters?
3921         4. ASCII should be designated to G0 at end of line?
3922         5. 7-bit environment or 8-bit environment?
3923         6. Use locking-shift?
3924         7. Use Single-shift?
3925    And the following two are only for Japanese:
3926         8. Use ASCII in place of JIS0201-1976-Roman?
3927         9. Use JISX0208-1983 in place of JISX0208-1978?
3928    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3929    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3930    details.
3931 */
3932
3933 /* Produce codes (an escape sequence) at DST for designating CHARSET to
3934    graphic register REG, advance DST, and record the designation in
3935    CODING.  A 94-char charset of dimension > 1 whose <final-char> is '@',
3936    'A', or 'B' gets the short form for REG 0 unless LONG_FORM is set.  */
3937 /* Layout: [ESC '&' <'@' + revision>] ESC ['$'] [<intermediate>] <final>.  */
3938 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3939   do {                                                                  \
3940     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3941     const char *intermediate_char_94 = "()*+";                          \
3942     const char *intermediate_char_96 = ",-./";                          \
3943     int revision = -1;                                                  \
3944     /* A revision is announced only under CODING_ISO_FLAG_REVISION.  */ \
3945     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3946       revision = CHARSET_ISO_REVISION (charset);                        \
3947                                                                         \
3948     if (revision >= 0)                                                  \
3949       {                                                                 \
3950         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3951         EMIT_ONE_BYTE ('@' + revision);                                 \
3952       }                                                                 \
3953     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3954     if (CHARSET_DIMENSION (charset) == 1)                               \
3955       {                                                                 \
3956         int b;                                                          \
3957         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3958           b = intermediate_char_94[reg];                                \
3959         else                                                            \
3960           b = intermediate_char_96[reg];                                \
3961         EMIT_ONE_ASCII_BYTE (b);                                        \
3962       }                                                                 \
3963     else                                                                \
3964       {                                                                 \
3965         EMIT_ONE_ASCII_BYTE ('$');                                      \
3966         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3967           {                                                             \
3968             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3969                 || reg != 0                                             \
3970                 || final_char < '@' || final_char > 'B')                \
3971               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3972           }                                                             \
3973         else                                                            \
3974           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3975       }                                                                 \
3976     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3977     /* Remember which charset REG now holds.  */                        \
3978     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3979   } while (0)
3980
3981
3982 /* The following two macros produce codes (control character or escape
3983    sequence) for the ISO2022 single-shift functions single-shift-2 and
3984    single-shift-3, and mark a single shift as pending in `coding'.  */
3985 /* Single-shift-2: ESC 'N' under SEVEN_BITS, else the byte ISO_CODE_SS2.  */
3986 #define ENCODE_SINGLE_SHIFT_2                                           \
3987   do {                                                                  \
3988     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3989       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3990     else                                                                \
3991       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3992     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3993   } while (0)
3994
3995 /* Single-shift-3: ESC 'O' under SEVEN_BITS, else the byte ISO_CODE_SS3.  */
3996 #define ENCODE_SINGLE_SHIFT_3                                           \
3997   do {                                                                  \
3998     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3999       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4000     else                                                                \
4001       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4002     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4003   } while (0)
4004
4005
4006 /* The following four macros produce codes (control character or
4007    escape sequence) for the ISO2022 locking shifts shift-in, shift-out,
4008    locking-shift-2, and locking-shift-3, and record the invocation.  */
4009 /* Shift-in: emit ISO_CODE_SI; graphic plane 0 now invokes register 0.  */
4010 #define ENCODE_SHIFT_IN                                 \
4011   do {                                                  \
4012     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4013     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4014   } while (0)
4015
4016 /* Shift-out: emit ISO_CODE_SO; graphic plane 0 now invokes register 1.  */
4017 #define ENCODE_SHIFT_OUT                                \
4018   do {                                                  \
4019     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4020     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4021   } while (0)
4022
4023 /* Locking-shift-2: emit ESC 'n'; graphic plane 0 now invokes register 2.  */
4024 #define ENCODE_LOCKING_SHIFT_2                          \
4025   do {                                                  \
4026     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4027     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4028   } while (0)
4029
4030 /* Locking-shift-3 is ESC 'o' (ESC 'n' is LS2); plane 0 now invokes register 3.  */
4031 #define ENCODE_LOCKING_SHIFT_3                          \
4032   do {                                                  \
4033     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4034     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4035   } while (0)
4036
4037
4038 /* Produce codes for a DIMENSION1 character whose character set is
4039    CHARSET and whose position-code is C1, first producing designation
4040    and invocation sequences (possibly several rounds) if necessary.  */
4041 /* CHARSET must be an lvalue: under USE_ROMAN, ASCII becomes JISX0201-Roman.  */
4042 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4043   do {                                                                  \
4044     int id = CHARSET_ID (charset);                                      \
4045     /* C1 is parenthesized at each use, so any expression is safe.  */  \
4046     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4047         && id == charset_ascii)                                         \
4048       {                                                                 \
4049         id = charset_jisx0201_roman;                                    \
4050         charset = CHARSET_FROM_ID (id);                                 \
4051       }                                                                 \
4052     /* Each `break' below leaves this macro's own loop.  */             \
4053     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4054       {                                                                 \
4055         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4056           EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
4057         else                                                            \
4058           EMIT_ONE_BYTE ((c1) | 0x80);                                  \
4059         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4060         break;                                                          \
4061       }                                                                 \
4062     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4063       {                                                                 \
4064         EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
4065         break;                                                          \
4066       }                                                                 \
4067     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4068       {                                                                 \
4069         EMIT_ONE_BYTE ((c1) | 0x80);                                    \
4070         break;                                                          \
4071       }                                                                 \
4072     else                                                                \
4073       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4074          must invoke it, or, at first, designate it to some graphic     \
4075          register.  Then repeat the loop to actually produce the        \
4076          character.  */                                                 \
4077       dst = encode_invocation_designation (charset, coding, dst,        \
4078                                            &produced_chars);            \
4079   } while (1)
4080
4081
4082 /* Produce codes for a DIMENSION2 character whose character set is
4083    CHARSET and whose position-codes are C1 and C2, first producing
4084    designation and invocation codes (possibly several rounds) if needed.  */
4085 /* CHARSET must be an lvalue: under USE_OLDJIS, JISX0208 becomes JISX0208-1978.  */
4086 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4087   do {                                                                  \
4088     int id = CHARSET_ID (charset);                                      \
4089                                                                         \
4090     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4091         && id == charset_jisx0208)                                      \
4092       {                                                                 \
4093         id = charset_jisx0208_1978;                                     \
4094         charset = CHARSET_FROM_ID (id);                                 \
4095       }                                                                 \
4096     /* Each `break' below leaves this macro's own loop.  */             \
4097     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4098       {                                                                 \
4099         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4100           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4101         else                                                            \
4102           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4103         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4104         break;                                                          \
4105       }                                                                 \
4106     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4107       {                                                                 \
4108         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4109         break;                                                          \
4110       }                                                                 \
4111     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4112       {                                                                 \
4113         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4114         break;                                                          \
4115       }                                                                 \
4116     else                                                                \
4117       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4118          must invoke it, or, at first, designate it to some graphic     \
4119          register.  Then repeat the loop to actually produce the        \
4120          character.  */                                                 \
4121       dst = encode_invocation_designation (charset, coding, dst,        \
4122                                            &produced_chars);            \
4123   } while (1)
4124
4125 /* Emit character C of CHARSET; CODING_ENCODE_CHAR presumably yields its code.  */
4126 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4127   do {                                                                     \
4128     unsigned code;                                                         \
4129     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4130     /* Dimension 1 takes CODE whole; else CODE is split at bit 8.  */      \
4131     if (CHARSET_DIMENSION (charset) == 1)                                  \
4132       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4133     else                                                                   \
4134       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4135   } while (0)
4136
4137
4138 /* Emit at DST the designation and/or invocation codes still needed
4139    before a character of CHARSET can be output, and update the
4140    ISO-2022 state kept in *CODING to match.  *P_NCHARS supplies and
4141    receives the local PRODUCED_CHARS count.  Return the new DST.  */
4142
4143 static unsigned char *
4144 encode_invocation_designation (struct charset *charset,
4145                                struct coding_system *coding,
4146                                unsigned char *dst, ptrdiff_t *p_nchars)
4147 {
4148   /* MULTIBYTEP and PRODUCED_CHARS are not named in the statements
4149      below except for the store into *P_NCHARS; presumably the EMIT_*
4150      macros, defined elsewhere, refer to them.  */
4151   bool multibytep = coding->dst_multibyte;
4152   ptrdiff_t produced_chars = *p_nchars;
4153   int id = CHARSET_ID (charset);
4154   int reg = 0;                  /* graphic register number */
4155
4156   /* Look for a graphic register that already holds CHARSET.  */
4157   while (reg < 4 && id != CODING_ISO_DESIGNATION (coding, reg))
4158     reg++;
4159
4160   if (reg == 4)
4161     {
4162       /* No register holds it.  Designate CHARSET to the register it
4163          requests, or to graphic register 0 if it requests none.  */
4164       int requested = CODING_ISO_REQUEST (coding, id);
4165
4166       reg = requested < 0 ? 0 : requested;
4167       ENCODE_DESIGNATION (charset, reg, coding);
4168     }
4169
4170   if (CODING_ISO_INVOCATION (coding, 0) == reg
4171       || CODING_ISO_INVOCATION (coding, 1) == reg)
4172     {
4173       /* REG is already invoked, so no shift is needed.  */
4174       *p_nchars = produced_chars;
4175       return dst;
4176     }
4177
4178   /* REG is not invoked to any graphic plane, so invoke it to graphic
4179      plane 0.  Registers 2 and 3 take a single shift when
4180      CODING_ISO_FLAG_SINGLE_SHIFT is set, a locking shift otherwise.  */
4181   if (reg == 0)
4182     ENCODE_SHIFT_IN;
4183   else if (reg == 1)
4184     ENCODE_SHIFT_OUT;
4185   else if (reg == 2)
4186     {
4187       if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4188         ENCODE_SINGLE_SHIFT_2;
4189       else
4190         ENCODE_LOCKING_SHIFT_2;
4191     }
4192   else if (reg == 3)
4193     {
4194       if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4195         ENCODE_SINGLE_SHIFT_3;
4196       else
4197         ENCODE_LOCKING_SHIFT_3;
4198     }
4199
4200   /* Store the local count back for the caller.  */
4201   *p_nchars = produced_chars;
4202   return dst;
4203 }
4204
4205
4206 /* Produce codes that shift graphic plane 0 back to register 0 and
4207    re-designate every register whose initial charset was replaced.  */
4208 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4209   do {                                                                  \
4210     int reg;                                                            \
4211     struct charset *charset;                                            \
4212     /* REG and CHARSET here are locals of the macro's own block.  */    \
4213     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4214       ENCODE_SHIFT_IN;                                                  \
4215     for (reg = 0; reg < 4; reg++)                                       \
4216       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4217           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4218               != CODING_ISO_INITIAL (coding, reg)))                     \
4219         {                                                               \
4220           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4221           ENCODE_DESIGNATION (charset, reg, coding);                    \
4222         }                                                               \
4223   } while (0)
4224
4225
4226 /* Scan the line starting at CHARBUF (up to a newline or CHARBUF_END)
4227    and write to DST a designation sequence for each graphic register
4228    that a charset on the line requests but that does not hold that
4229    charset yet.  Return the number of bytes written.  DST must not
4230    point directly into buffer text, which the char_charset call may
4231    relocate.  If the block ends before the end of line, some
4232    necessary designations may be missed.  */
4233
4234 static ptrdiff_t
4235 encode_designation_at_bol (struct coding_system *coding,
4236                            int *charbuf, int *charbuf_end,
4237                            unsigned char *dst)
4238 {
4239   unsigned char *const dst_start = dst;
4240   /* Charset id wanted in each graphic register; -1 means none.  */
4241   int wanted[4] = { -1, -1, -1, -1 };
4242   int n_wanted = 0;
4243   int reg;
4244   struct charset *charset;
4245   /* Not named below; presumably the EMIT_* macros refer to these.  */
4246   ptrdiff_t produced_chars = 0;
4247   bool multibytep = coding->dst_multibyte;
4248   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
4249   Lisp_Object charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4250
4251   if (EQ (charset_list, Qiso_2022))
4252     charset_list = Viso_2022_charset_list;
4253
4254   /* Note the first charset that requests each register.  */
4255   for (; charbuf < charbuf_end && n_wanted < 4; charbuf++)
4256     {
4257       int elt = *charbuf;
4258       int id;
4259
4260       if (elt == '\n')
4261         break;
4262       charset = char_charset (elt, charset_list, NULL);
4263       id = CHARSET_ID (charset);
4264       reg = CODING_ISO_REQUEST (coding, id);
4265       if (reg < 0 || wanted[reg] >= 0)
4266         continue;
4267       wanted[reg] = id;
4268       n_wanted++;
4269     }
4270
4271   /* Designate them, skipping registers that are already right.  */
4272   for (reg = 0; n_wanted > 0 && reg < 4; reg++)
4273     {
4274       if (wanted[reg] < 0
4275           || CODING_ISO_DESIGNATION (coding, reg) == wanted[reg])
4276         continue;
4277       charset = CHARSET_FROM_ID (wanted[reg]);
4278       ENCODE_DESIGNATION (charset, reg, coding);
4279     }
4280
4281   /* The local PRODUCED_CHARS is discarded; only bytes are counted.  */
4282   return dst - dst_start;
4283 }
4284
4285 /* Encode the characters and annotations in CODING->charbuf as ISO-2022.  */
4286 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4287 static bool
4288 encode_coding_iso_2022 (struct coding_system *coding)
4289 {
4290   bool multibytep = coding->dst_multibyte;
4291   int *charbuf = coding->charbuf;
4292   int *charbuf_end = charbuf + coding->charbuf_used;
4293   unsigned char *dst = coding->destination + coding->produced;
4294   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4295   int safe_room = 16;           /* amount passed to ASSURE_DESTINATION */
4296   bool bol_designation
4297     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4298        && CODING_ISO_BOL (coding));
4299   ptrdiff_t produced_chars = 0;
4300   Lisp_Object attrs, eol_type, charset_list;
4301   bool ascii_compatible;
4302   int c;
4303   int preferred_charset_id = -1;
4304   /* CODING_GET_INFO presumably fills in ATTRS and CHARSET_LIST.  */
4305   CODING_GET_INFO (coding, attrs, charset_list);
4306   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4307   if (VECTORP (eol_type))
4308     eol_type = Qunix;
4309
4310   setup_iso_safe_charsets (attrs);
4311   /* Charset list may have been changed.  */
4312   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4313   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4314   /* ASCII is emitted directly only without designation or locking shift.  */
4315   ascii_compatible
4316     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4317        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4318                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4319   /* Each iteration consumes one character or one annotation.  */
4320   while (charbuf < charbuf_end)
4321     {
4322       ASSURE_DESTINATION (safe_room);
4323
4324       if (bol_designation)
4325         {
4326           /* We have to produce designation sequences if any now.  */
4327           unsigned char desig_buf[16];
4328           int nbytes;
4329           ptrdiff_t offset;
4330           /* Presumably a charset map load can move the destination.  */
4331           charset_map_loaded = 0;
4332           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4333                                               desig_buf);
4334           if (charset_map_loaded
4335               && (offset = coding_change_destination (coding)))
4336             {
4337               dst += offset;
4338               dst_end += offset;
4339             }
4340           memcpy (dst, desig_buf, nbytes);
4341           dst += nbytes;
4342           /* We are sure that designation sequences are all ASCII bytes.  */
4343           produced_chars += nbytes;
4344           bol_designation = 0;
4345           ASSURE_DESTINATION (safe_room);
4346         }
4347
4348       c = *charbuf++;
4349
4350       if (c < 0)
4351         {
4352           /* Handle an annotation.  */
4353           switch (*charbuf)
4354             {
4355             case CODING_ANNOTATE_COMPOSITION_MASK:
4356               /* Not yet implemented.  */
4357               break;
4358             case CODING_ANNOTATE_CHARSET_MASK:
4359               preferred_charset_id = charbuf[2];
4360               if (preferred_charset_id >= 0
4361                   && NILP (Fmemq (make_number (preferred_charset_id),
4362                                   charset_list)))
4363                 preferred_charset_id = -1;
4364               break;
4365             default:
4366               emacs_abort ();
4367             }
4368           charbuf += -c - 1;    /* skip the rest; -C is presumably its length */
4369           continue;
4370         }
4371
4372       /* Now encode the character C.  */
4373       if (c < 0x20 || c == 0x7F)
4374         {
4375           if (c == '\n'
4376               || (c == '\r' && EQ (eol_type, Qmac)))
4377             {
4378               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4379                 ENCODE_RESET_PLANE_AND_REGISTER ();
4380               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4381                 {
4382                   int i;
4383
4384                   for (i = 0; i < 4; i++)
4385                     CODING_ISO_DESIGNATION (coding, i)
4386                       = CODING_ISO_INITIAL (coding, i);
4387                 }
4388               bol_designation = ((CODING_ISO_FLAGS (coding)
4389                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4390                                  != 0);
4391             }
4392           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4393             ENCODE_RESET_PLANE_AND_REGISTER ();
4394           EMIT_ONE_ASCII_BYTE (c);
4395         }
4396       else if (ASCII_CHAR_P (c))
4397         {
4398           if (ascii_compatible)
4399             EMIT_ONE_ASCII_BYTE (c);
4400           else
4401             {
4402               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4403               ENCODE_ISO_CHARACTER (charset, c);
4404             }
4405         }
4406       else if (CHAR_BYTE8_P (c))
4407         {
4408           c = CHAR_TO_BYTE8 (c);
4409           EMIT_ONE_BYTE (c);
4410         }
4411       else
4412         {
4413           struct charset *charset;
4414           /* Try the charset named by the last charset annotation first.  */
4415           if (preferred_charset_id >= 0)
4416             {
4417               bool result;
4418
4419               charset = CHARSET_FROM_ID (preferred_charset_id);
4420               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4421               if (! result)
4422                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4423                                      NULL, charset);
4424             }
4425           else
4426             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4427                                  NULL, charset);
4428           if (!charset)
4429             {
4430               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4431                 {
4432                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4433                   charset = CHARSET_FROM_ID (charset_ascii);
4434                 }
4435               else
4436                 {
4437                   c = coding->default_char;
4438                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4439                                        charset_list, NULL, charset);
4440                 }
4441             }
4442           ENCODE_ISO_CHARACTER (charset, c);
4443         }
4444     }
4445   /* Final block: reset planes and registers if RESET_AT_EOL is set.  */
4446   if (coding->mode & CODING_MODE_LAST_BLOCK
4447       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4448     {
4449       ASSURE_DESTINATION (safe_room);
4450       ENCODE_RESET_PLANE_AND_REGISTER ();
4451     }
4452   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4453   CODING_ISO_BOL (coding) = bol_designation;
4454   coding->produced_char += produced_chars;
4455   coding->produced = dst - coding->destination;
4456   return 0;
4457 }
4458
4459 \f
4460 /*** 8. SJIS and BIG5 handlers ***/
4461
4462 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4463    quite widely.  So, for the moment, Emacs supports them in the bare
4464    C code.  But, in the future, they may be supported only by CCL.  */
4465
4466 /* SJIS is a coding system encoding three character sets: ASCII, the
4467    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
4468    encoded as is.  A character of charset katakana-jisx0201 is encoded
4469    as "position-code + 0x80".  A character of charset japanese-jisx0208
4470    is encoded in two bytes, with its two position-codes divided and
4471    shifted so that they fit in the ranges below.
4472
4473    --- CODE RANGE of SJIS ---
4474    (character set)      (range)
4475    ASCII                0x00 .. 0x7F
4476    KATAKANA-JISX0201    0xA0 .. 0xDF
4477    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4478             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4479    -------------------------------
4480
4481 */
4482
4483 /* BIG5 is a coding system encoding two character sets: ASCII and
4484    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4485    character set and is encoded in two bytes.
4486
4487    --- CODE RANGE of BIG5 ---
4488    (character set)      (range)
4489    ASCII                0x00 .. 0x7F
4490    Big5 (1st byte)      0xA1 .. 0xFE
4491         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4492    --------------------------
4493
4494   */
4495
4496 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4497    Return true if a text is encoded in SJIS.  */
4498 /* A pure-ASCII source also returns 1, but leaves DETECT_INFO->found alone.  */
4499 static bool
4500 detect_coding_sjis (struct coding_system *coding,
4501                     struct coding_detection_info *detect_info)
4502 {
4503   const unsigned char *src = coding->source, *src_base;
4504   const unsigned char *src_end = coding->source + coding->src_bytes;
4505   bool multibytep = coding->src_multibyte;
4506   ptrdiff_t consumed_chars = 0;
4507   int found = 0;
4508   int c;
4509   Lisp_Object attrs, charset_list;
4510   int max_first_byte_of_2_byte_code;
4511   /* With more than 3 charsets in the list, lead bytes up to 0xFC are OK.  */
4512   CODING_GET_INFO (coding, attrs, charset_list);
4513   max_first_byte_of_2_byte_code
4514     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4515
4516   detect_info->checked |= CATEGORY_MASK_SJIS;
4517   /* A coding system of this category is always ASCII compatible.  */
4518   src += coding->head_ascii;
4519   /* The loop ends by `break' on an invalid byte, which rejects SJIS.  */
4520   while (1)
4521     {
4522       src_base = src;
4523       ONE_MORE_BYTE (c);
4524       if (c < 0x80)
4525         continue;
4526       if ((c >= 0x81 && c <= 0x9F)
4527           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4528         {
4529           ONE_MORE_BYTE (c);
4530           if (c < 0x40 || c == 0x7F || c > 0xFC)
4531             break;
4532           found = CATEGORY_MASK_SJIS;
4533         }
4534       else if (c >= 0xA0 && c < 0xE0)
4535         found = CATEGORY_MASK_SJIS;
4536       else
4537         break;
4538     }
4539   detect_info->rejected |= CATEGORY_MASK_SJIS;
4540   return 0;
4541   /* Nothing above names this label; presumably ONE_MORE_BYTE jumps here.  */
4542  no_more_source:
4543   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4544     {   /* presumably the last block ends inside a sequence */
4545       detect_info->rejected |= CATEGORY_MASK_SJIS;
4546       return 0;
4547     }
4548   detect_info->found |= found;
4549   return 1;
4550 }
4551
4552 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4553    Return true if a text is encoded in BIG5.  */
4554
4555 static bool
4556 detect_coding_big5 (struct coding_system *coding,
4557                     struct coding_detection_info *detect_info)
4558 {
4559   const unsigned char *src = coding->source, *src_base;
4560   const unsigned char *src_end = coding->source + coding->src_bytes;
4561   bool multibytep = coding->src_multibyte;
4562   ptrdiff_t consumed_chars = 0;
4563   int found = 0;
4564   int c;
4565
4566   detect_info->checked |= CATEGORY_MASK_BIG5;
4567   /* A coding system of this category is always ASCII compatible.  */
4568   src += coding->head_ascii;
4569
4570   while (1)
4571     {
4572       src_base = src;
4573       ONE_MORE_BYTE (c);
4574       if (c < 0x80)
4575         continue;
4576       if (c >= 0xA1)
4577         {
4578           ONE_MORE_BYTE (c);
4579           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4580             return 0;
4581           found = CATEGORY_MASK_BIG5;
4582         }
4583       else
4584         break;
4585     }
4586   detect_info->rejected |= CATEGORY_MASK_BIG5;
4587   return 0;
4588
4589  no_more_source:
4590   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4591     {
4592       detect_info->rejected |= CATEGORY_MASK_BIG5;
4593       return 0;
4594     }
4595   detect_info->found |= found;
4596   return 1;
4597 }
4598
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode the SJIS bytes of CODING's source, starting at
   CODING->consumed, and append the resulting characters to
   CODING->charbuf.  The charsets used for the individual byte classes
   are the first three (optionally four) elements of the coding
   system's charset list.  A byte that cannot be decoded is stored
   through the `invalid_code' path and counted in CODING->errors.  On
   return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  */

static void
decode_coding_sjis (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS is set, or -1 if there
     is none pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The charset list supplies, in this order, the charsets for bytes
     below 0x80, for one-byte kana, for two-byte kanji and, if there is
     a fourth element, for the second two-byte kanji set.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember where this iteration started so that it can be undone
         if the code turns out to be invalid or incomplete.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left in charbuf.  A byte already read ahead after
             a CR must not be reported as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* NOTE(review): when C is taken from byte_after_cr, src_base
         already points past that byte; confirm that the
         `invalid_code' and `no_more_source' paths treat this case as
         intended.  */
      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      /* A negative C is not an SJIS byte.  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With DOS EOL conversion, read the byte that follows a CR
             right away; it is processed by the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* On a change to a different non-ASCII charset, record the run
         of the previous one (if it was non-ASCII) and start a new
         run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of this iteration and store just one
         element: the negated value if it is negative, otherwise the
         byte converted by BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Record the run that is still open, then report only the part of
     the source that was processed completely.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4718
/* Decode the BIG5 bytes of CODING's source, starting at
   CODING->consumed, and append the resulting characters to
   CODING->charbuf.  The first element of the coding system's charset
   list is used for bytes below 0x80 and the second one for two-byte
   codes.  A byte that cannot be decoded is stored through the
   `invalid_code' path and counted in CODING->errors.  On return
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  */

static void
decode_coding_big5 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS is set, or -1 if there
     is none pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      /* Remember where this iteration started so that it can be undone
         if the code turns out to be invalid or incomplete.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left in charbuf.  A byte already read ahead after
             a CR must not be reported as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* NOTE(review): when C is taken from byte_after_cr, src_base
         already points past that byte; confirm that the
         `invalid_code' and `no_more_source' paths treat this case as
         intended.  */
      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      /* A negative C is not a BIG5 byte.  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* With DOS EOL conversion, read the byte that follows a CR
             right away; it is processed by the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          /* First byte: 0xA1..0xFE.  */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          /* Second byte: 0x40..0x7E or 0xA1..0xFE.  */
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* On a change to a different non-ASCII charset, record the run
         of the previous one (if it was non-ASCII) and start a new
         run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of this iteration and store just one
         element: the negated value if it is negative, otherwise the
         byte converted by BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Record the run that is still open, then report only the part of
     the source that was processed completely.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4814
4815 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4816    This function can encode charsets `ascii', `katakana-jisx0201',
4817    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4818    are sure that all these charsets are registered as official charset
4819    (i.e. do not have extended leading-codes).  Characters of other
4820    charsets are produced without any encoding.  */
4821
4822 static bool
4823 encode_coding_sjis (struct coding_system *coding)
4824 {
4825   bool multibytep = coding->dst_multibyte;
4826   int *charbuf = coding->charbuf;
4827   int *charbuf_end = charbuf + coding->charbuf_used;
4828   unsigned char *dst = coding->destination + coding->produced;
4829   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4830   int safe_room = 4;
4831   ptrdiff_t produced_chars = 0;
4832   Lisp_Object attrs, charset_list, val;
4833   bool ascii_compatible;
4834   struct charset *charset_kanji, *charset_kana;
4835   struct charset *charset_kanji2;
4836   int c;
4837
4838   CODING_GET_INFO (coding, attrs, charset_list);
4839   val = XCDR (charset_list);
4840   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4841   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4842   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4843
4844   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4845
4846   while (charbuf < charbuf_end)
4847     {
4848       ASSURE_DESTINATION (safe_room);
4849       c = *charbuf++;
4850       /* Now encode the character C.  */
4851       if (ASCII_CHAR_P (c) && ascii_compatible)
4852         EMIT_ONE_ASCII_BYTE (c);
4853       else if (CHAR_BYTE8_P (c))
4854         {
4855           c = CHAR_TO_BYTE8 (c);
4856           EMIT_ONE_BYTE (c);
4857         }
4858       else
4859         {
4860           unsigned code;
4861           struct charset *charset;
4862           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4863                                &code, charset);
4864
4865           if (!charset)
4866             {
4867               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4868                 {
4869                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4870                   charset = CHARSET_FROM_ID (charset_ascii);
4871                 }
4872               else
4873                 {
4874                   c = coding->default_char;
4875                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4876                                        charset_list, &code, charset);
4877                 }
4878             }
4879           if (code == CHARSET_INVALID_CODE (charset))
4880             emacs_abort ();
4881           if (charset == charset_kanji)
4882             {
4883               int c1, c2;
4884               JIS_TO_SJIS (code);
4885               c1 = code >> 8, c2 = code & 0xFF;
4886               EMIT_TWO_BYTES (c1, c2);
4887             }
4888           else if (charset == charset_kana)
4889             EMIT_ONE_BYTE (code | 0x80);
4890           else if (charset_kanji2 && charset == charset_kanji2)
4891             {
4892               int c1, c2;
4893
4894               c1 = code >> 8;
4895               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4896                   || c1 == 0x28
4897                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4898                 {
4899                   JIS_TO_SJIS2 (code);
4900                   c1 = code >> 8, c2 = code & 0xFF;
4901                   EMIT_TWO_BYTES (c1, c2);
4902                 }
4903               else
4904                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4905             }
4906           else
4907             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4908         }
4909     }
4910   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4911   coding->produced_char += produced_chars;
4912   coding->produced = dst - coding->destination;
4913   return 0;
4914 }
4915
4916 static bool
4917 encode_coding_big5 (struct coding_system *coding)
4918 {
4919   bool multibytep = coding->dst_multibyte;
4920   int *charbuf = coding->charbuf;
4921   int *charbuf_end = charbuf + coding->charbuf_used;
4922   unsigned char *dst = coding->destination + coding->produced;
4923   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4924   int safe_room = 4;
4925   ptrdiff_t produced_chars = 0;
4926   Lisp_Object attrs, charset_list, val;
4927   bool ascii_compatible;
4928   struct charset *charset_big5;
4929   int c;
4930
4931   CODING_GET_INFO (coding, attrs, charset_list);
4932   val = XCDR (charset_list);
4933   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4934   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4935
4936   while (charbuf < charbuf_end)
4937     {
4938       ASSURE_DESTINATION (safe_room);
4939       c = *charbuf++;
4940       /* Now encode the character C.  */
4941       if (ASCII_CHAR_P (c) && ascii_compatible)
4942         EMIT_ONE_ASCII_BYTE (c);
4943       else if (CHAR_BYTE8_P (c))
4944         {
4945           c = CHAR_TO_BYTE8 (c);
4946           EMIT_ONE_BYTE (c);
4947         }
4948       else
4949         {
4950           unsigned code;
4951           struct charset *charset;
4952           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4953                                &code, charset);
4954
4955           if (! charset)
4956             {
4957               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4958                 {
4959                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4960                   charset = CHARSET_FROM_ID (charset_ascii);
4961                 }
4962               else
4963                 {
4964                   c = coding->default_char;
4965                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4966                                        charset_list, &code, charset);
4967                 }
4968             }
4969           if (code == CHARSET_INVALID_CODE (charset))
4970             emacs_abort ();
4971           if (charset == charset_big5)
4972             {
4973               int c1, c2;
4974
4975               c1 = code >> 8, c2 = code & 0xFF;
4976               EMIT_TWO_BYTES (c1, c2);
4977             }
4978           else
4979             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4980         }
4981     }
4982   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4983   coding->produced_char += produced_chars;
4984   coding->produced = dst - coding->destination;
4985   return 0;
4986 }
4987
4988 \f
/*** 9. CCL handlers ***/
4990
4991 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4992    Return true if a text is encoded in a coding system of which
4993    encoder/decoder are written in CCL program.  */
4994
4995 static bool
4996 detect_coding_ccl (struct coding_system *coding,
4997                    struct coding_detection_info *detect_info)
4998 {
4999   const unsigned char *src = coding->source, *src_base;
5000   const unsigned char *src_end = coding->source + coding->src_bytes;
5001   bool multibytep = coding->src_multibyte;
5002   ptrdiff_t consumed_chars = 0;
5003   int found = 0;
5004   unsigned char *valids;
5005   ptrdiff_t head_ascii = coding->head_ascii;
5006   Lisp_Object attrs;
5007
5008   detect_info->checked |= CATEGORY_MASK_CCL;
5009
5010   coding = &coding_categories[coding_category_ccl];
5011   valids = CODING_CCL_VALIDS (coding);
5012   attrs = CODING_ID_ATTRS (coding->id);
5013   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5014     src += head_ascii;
5015
5016   while (1)
5017     {
5018       int c;
5019
5020       src_base = src;
5021       ONE_MORE_BYTE (c);
5022       if (c < 0 || ! valids[c])
5023         break;
5024       if ((valids[c] > 1))
5025         found = CATEGORY_MASK_CCL;
5026     }
5027   detect_info->rejected |= CATEGORY_MASK_CCL;
5028   return 0;
5029
5030  no_more_source:
5031   detect_info->found |= found;
5032   return 1;
5033 }
5034
5035 static void
5036 decode_coding_ccl (struct coding_system *coding)
5037 {
5038   const unsigned char *src = coding->source + coding->consumed;
5039   const unsigned char *src_end = coding->source + coding->src_bytes;
5040   int *charbuf = coding->charbuf + coding->charbuf_used;
5041   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5042   ptrdiff_t consumed_chars = 0;
5043   bool multibytep = coding->src_multibyte;
5044   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5045   int source_charbuf[1024];
5046   int source_byteidx[1025];
5047   Lisp_Object attrs, charset_list;
5048
5049   CODING_GET_INFO (coding, attrs, charset_list);
5050
5051   while (1)
5052     {
5053       const unsigned char *p = src;
5054       ptrdiff_t offset;
5055       int i = 0;
5056
5057       if (multibytep)
5058         {
5059           while (i < 1024 && p < src_end)
5060             {
5061               source_byteidx[i] = p - src;
5062               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5063             }
5064           source_byteidx[i] = p - src;
5065         }
5066       else
5067         while (i < 1024 && p < src_end)
5068           source_charbuf[i++] = *p++;
5069
5070       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5071         ccl->last_block = 1;
5072       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5073       charset_map_loaded = 0;
5074       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5075                   charset_list);
5076       if (charset_map_loaded
5077           && (offset = coding_change_source (coding)))
5078         {
5079           p += offset;
5080           src += offset;
5081           src_end += offset;
5082         }
5083       charbuf += ccl->produced;
5084       if (multibytep)
5085         src += source_byteidx[ccl->consumed];
5086       else
5087         src += ccl->consumed;
5088       consumed_chars += ccl->consumed;
5089       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5090         break;
5091     }
5092
5093   switch (ccl->status)
5094     {
5095     case CCL_STAT_SUSPEND_BY_SRC:
5096       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5097       break;
5098     case CCL_STAT_SUSPEND_BY_DST:
5099       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5100       break;
5101     case CCL_STAT_QUIT:
5102     case CCL_STAT_INVALID_CMD:
5103       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5104       break;
5105     default:
5106       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5107       break;
5108     }
5109   coding->consumed_char += consumed_chars;
5110   coding->consumed = src - coding->source;
5111   coding->charbuf_used = charbuf - coding->charbuf;
5112 }
5113
5114 static bool
5115 encode_coding_ccl (struct coding_system *coding)
5116 {
5117   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5118   bool multibytep = coding->dst_multibyte;
5119   int *charbuf = coding->charbuf;
5120   int *charbuf_end = charbuf + coding->charbuf_used;
5121   unsigned char *dst = coding->destination + coding->produced;
5122   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5123   int destination_charbuf[1024];
5124   ptrdiff_t produced_chars = 0;
5125   int i;
5126   Lisp_Object attrs, charset_list;
5127
5128   CODING_GET_INFO (coding, attrs, charset_list);
5129   if (coding->consumed_char == coding->src_chars
5130       && coding->mode & CODING_MODE_LAST_BLOCK)
5131     ccl->last_block = 1;
5132
5133   do
5134     {
5135       ptrdiff_t offset;
5136
5137       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5138       charset_map_loaded = 0;
5139       ccl_driver (ccl, charbuf, destination_charbuf,
5140                   charbuf_end - charbuf, 1024, charset_list);
5141       if (charset_map_loaded
5142           && (offset = coding_change_destination (coding)))
5143         dst += offset;
5144       if (multibytep)
5145         {
5146           ASSURE_DESTINATION (ccl->produced * 2);
5147           for (i = 0; i < ccl->produced; i++)
5148             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5149         }
5150       else
5151         {
5152           ASSURE_DESTINATION (ccl->produced);
5153           for (i = 0; i < ccl->produced; i++)
5154             *dst++ = destination_charbuf[i] & 0xFF;
5155           produced_chars += ccl->produced;
5156         }
5157       charbuf += ccl->consumed;
5158       if (ccl->status == CCL_STAT_QUIT
5159           || ccl->status == CCL_STAT_INVALID_CMD)
5160         break;
5161     }
5162   while (charbuf < charbuf_end);
5163
5164   switch (ccl->status)
5165     {
5166     case CCL_STAT_SUSPEND_BY_SRC:
5167       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5168       break;
5169     case CCL_STAT_SUSPEND_BY_DST:
5170       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5171       break;
5172     case CCL_STAT_QUIT:
5173     case CCL_STAT_INVALID_CMD:
5174       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5175       break;
5176     default:
5177       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5178       break;
5179     }
5180
5181   coding->produced_char += produced_chars;
5182   coding->produced = dst - coding->destination;
5183   return 0;
5184 }
5185
5186 \f
5187 /*** 10, 11. no-conversion handlers ***/
5188
5189 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5190
5191 static void
5192 decode_coding_raw_text (struct coding_system *coding)
5193 {
5194   bool eol_dos
5195     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5196
5197   coding->chars_at_source = 1;
5198   coding->consumed_char = coding->src_chars;
5199   coding->consumed = coding->src_bytes;
5200   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5201     {
5202       coding->consumed_char--;
5203       coding->consumed--;
5204       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5205     }
5206   else
5207     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5208 }
5209
5210 static bool
5211 encode_coding_raw_text (struct coding_system *coding)
5212 {
5213   bool multibytep = coding->dst_multibyte;
5214   int *charbuf = coding->charbuf;
5215   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5216   unsigned char *dst = coding->destination + coding->produced;
5217   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5218   ptrdiff_t produced_chars = 0;
5219   int c;
5220
5221   if (multibytep)
5222     {
5223       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5224
5225       if (coding->src_multibyte)
5226         while (charbuf < charbuf_end)
5227           {
5228             ASSURE_DESTINATION (safe_room);
5229             c = *charbuf++;
5230             if (ASCII_CHAR_P (c))
5231               EMIT_ONE_ASCII_BYTE (c);
5232             else if (CHAR_BYTE8_P (c))
5233               {
5234                 c = CHAR_TO_BYTE8 (c);
5235                 EMIT_ONE_BYTE (c);
5236               }
5237             else
5238               {
5239                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5240
5241                 CHAR_STRING_ADVANCE (c, p1);
5242                 do
5243                   {
5244                     EMIT_ONE_BYTE (*p0);
5245                     p0++;
5246                   }
5247                 while (p0 < p1);
5248               }
5249           }
5250       else
5251         while (charbuf < charbuf_end)
5252           {
5253             ASSURE_DESTINATION (safe_room);
5254             c = *charbuf++;
5255             EMIT_ONE_BYTE (c);
5256           }
5257     }
5258   else
5259     {
5260       if (coding->src_multibyte)
5261         {
5262           int safe_room = MAX_MULTIBYTE_LENGTH;
5263
5264           while (charbuf < charbuf_end)
5265             {
5266               ASSURE_DESTINATION (safe_room);
5267               c = *charbuf++;
5268               if (ASCII_CHAR_P (c))
5269                 *dst++ = c;
5270               else if (CHAR_BYTE8_P (c))
5271                 *dst++ = CHAR_TO_BYTE8 (c);
5272               else
5273                 CHAR_STRING_ADVANCE (c, dst);
5274             }
5275         }
5276       else
5277         {
5278           ASSURE_DESTINATION (charbuf_end - charbuf);
5279           while (charbuf < charbuf_end && dst < dst_end)
5280             *dst++ = *charbuf++;
5281         }
5282       produced_chars = dst - (coding->destination + coding->produced);
5283     }
5284   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5285   coding->produced_char += produced_chars;
5286   coding->produced = dst - coding->destination;
5287   return 0;
5288 }
5289
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in a charset-based coding system.

   The source bytes are checked against the `charset_valids' attribute
   of the coding system registered for the charset category, not
   against the CODING passed in; from CODING only the source text and
   head_ascii are used.  */

static bool
detect_coding_charset (struct coding_system *coding,
                       struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  Lisp_Object attrs, valids, name;
  int found = 0;
  ptrdiff_t head_ascii = coding->head_ascii;
  bool check_latin_extra = 0;

  detect_info->checked |= CATEGORY_MASK_CHARSET;

  /* From here on CODING is the coding system of the charset
     category.  */
  coding = &coding_categories[coding_category_charset];
  attrs = CODING_ID_ATTRS (coding->id);
  valids = AREF (attrs, coding_attr_charset_valids);
  name = CODING_ID_NAME (coding->id);
  /* For coding systems named iso-8859-* or iso-latin-*, bytes in
     0x80..0x9F are additionally checked against
     Vlatin_extra_code_table below.  */
  if (strncmp (SSDATA (SYMBOL_NAME (name)),
               "iso-8859-", sizeof ("iso-8859-") - 1) == 0
      || strncmp (SSDATA (SYMBOL_NAME (name)),
                  "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
    check_latin_extra = 1;

  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim, idx;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* A negative value is skipped without being checked.  */
      if (c < 0)
        continue;
      /* VAL is nil, a charset ID, or a list of charset IDs.  */
      val = AREF (valids, c);
      if (NILP (val))
        break;
      if (c >= 0x80)
        {
          if (c < 0xA0
              && check_latin_extra
              && (!VECTORP (Vlatin_extra_code_table)
                  || NILP (AREF (Vlatin_extra_code_table, c))))
            break;
          found = CATEGORY_MASK_CHARSET;
        }
      if (INTEGERP (val))
        {
          /* A single charset: each of the remaining DIM - 1 bytes must
             lie within the charset's code space for its position.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          for (idx = 1; idx < dim; idx++)
            {
              if (src == src_end)
                goto too_short;
              ONE_MORE_BYTE (c);
              if (c < charset->code_space[(dim - 1 - idx) * 4]
                  || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                break;
            }
          if (idx < dim)
            break;
        }
      else
        {
          /* A list of charsets: try them in order.  IDX is not reset
             between candidates, so bytes already accepted for one
             candidate are not read again for the next.  */
          idx = 1;
          for (; CONSP (val); val = XCDR (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              while (idx < dim)
                {
                  if (src == src_end)
                    goto too_short;
                  ONE_MORE_BYTE (c);
                  if (c < charset->code_space[(dim - 1 - idx) * 4]
                      || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                    break;
                  idx++;
                }
              if (idx == dim)
                {
                  /* This candidate matched; setting VAL to nil tells
                     the check after the loop so.  */
                  val = Qnil;
                  break;
                }
            }
          /* VAL is still a cons only if a byte was out of range for
             the candidate being tried.  */
          if (CONSP (val))
            break;
        }
    }
 too_short:
  /* Reached for an invalid byte as well as for a multi-byte code that
     is cut off by the end of the source.  */
  detect_info->rejected |= CATEGORY_MASK_CHARSET;
  return 0;

 no_more_source:
  detect_info->found |= found;
  return 1;
}
5395
/* Decode the bytes of CODING's source, starting at CODING->consumed,
   according to the `charset_valids' attribute of the coding system,
   and append the resulting characters to CODING->charbuf.  A byte
   that cannot be decoded is stored through the `invalid_code' path
   and counted in CODING->errors.  On return CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated.  */

static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR when EOL_DOS is set, or -1 if there
     is none pending.  */
  int byte_after_cr = -1;

  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      /* Number of bytes accumulated in CODE so far.  */
      int len = 1;
      unsigned code;

      /* Remember where this iteration started so that it can be undone
         if the code turns out to be invalid or incomplete.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left in charbuf.  A byte already read ahead after
             a CR must not be reported as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      /* NOTE(review): when C is taken from byte_after_cr, src_base
         already points past that byte; confirm that the
         `invalid_code' and `no_more_source' paths treat this case as
         intended.  */
      if (byte_after_cr >= 0)
        {
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          /* With DOS EOL conversion, read the byte that follows a CR
             right away; it is processed by the next iteration.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      /* VAL must be a charset ID or a list of charset IDs; anything
         else makes the byte invalid.  */
      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* A single charset: read as many further bytes as its
             dimension requires, then decode.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE are kept from the previous candidate, so
                 only the additional bytes are read.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* The first charset that yields a non-negative
                 character wins.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      /* On a change to a different non-ASCII charset, record the run
         of the previous one (if it was non-ASCII) and start a new
         run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of this iteration and store just one
         element: the negated value if it is negative, the byte itself
         if ASCII_BYTE_P holds for it, otherwise the byte converted by
         BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Record the run that is still open, then report only the part of
     the source that was processed completely.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5523
5524 static bool
5525 encode_coding_charset (struct coding_system *coding)
5526 {
5527   bool multibytep = coding->dst_multibyte;
5528   int *charbuf = coding->charbuf;
5529   int *charbuf_end = charbuf + coding->charbuf_used;
5530   unsigned char *dst = coding->destination + coding->produced;
5531   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5532   int safe_room = MAX_MULTIBYTE_LENGTH;
5533   ptrdiff_t produced_chars = 0;
5534   Lisp_Object attrs, charset_list;
5535   bool ascii_compatible;
5536   int c;
5537
5538   CODING_GET_INFO (coding, attrs, charset_list);
5539   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5540
5541   while (charbuf < charbuf_end)
5542     {
5543       struct charset *charset;
5544       unsigned code;
5545
5546       ASSURE_DESTINATION (safe_room);
5547       c = *charbuf++;
5548       if (ascii_compatible && ASCII_CHAR_P (c))
5549         EMIT_ONE_ASCII_BYTE (c);
5550       else if (CHAR_BYTE8_P (c))
5551         {
5552           c = CHAR_TO_BYTE8 (c);
5553           EMIT_ONE_BYTE (c);
5554         }
5555       else
5556         {
5557           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5558                                &code, charset);
5559
5560           if (charset)
5561             {
5562               if (CHARSET_DIMENSION (charset) == 1)
5563                 EMIT_ONE_BYTE (code);
5564               else if (CHARSET_DIMENSION (charset) == 2)
5565                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5566               else if (CHARSET_DIMENSION (charset) == 3)
5567                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5568               else
5569                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5570                                  (code >> 8) & 0xFF, code & 0xFF);
5571             }
5572           else
5573             {
5574               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5575                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5576               else
5577                 c = coding->default_char;
5578               EMIT_ONE_BYTE (c);
5579             }
5580         }
5581     }
5582
5583   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5584   coding->produced_char += produced_chars;
5585   coding->produced = dst - coding->destination;
5586   return 0;
5587 }
5588
5589 \f
/*** 10. C library functions ***/
5591
5592 /* Setup coding context CODING from information about CODING_SYSTEM.
5593    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5594    CODING_SYSTEM is invalid, signal an error.  */
5595
5596 void
5597 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5598 {
5599   Lisp_Object attrs;
5600   Lisp_Object eol_type;
5601   Lisp_Object coding_type;
5602   Lisp_Object val;
5603
5604   if (NILP (coding_system))
5605     coding_system = Qundecided;
5606
5607   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5608
5609   attrs = CODING_ID_ATTRS (coding->id);
5610   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5611
5612   coding->mode = 0;
5613   coding->head_ascii = -1;
5614   if (VECTORP (eol_type))
5615     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5616                             | CODING_REQUIRE_DETECTION_MASK);
5617   else if (! EQ (eol_type, Qunix))
5618     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5619                             | CODING_REQUIRE_ENCODING_MASK);
5620   else
5621     coding->common_flags = 0;
5622   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5623     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5624   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5625     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5626   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5627     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5628
5629   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5630   coding->max_charset_id = SCHARS (val) - 1;
5631   coding->safe_charsets = SDATA (val);
5632   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5633   coding->carryover_bytes = 0;
5634
5635   coding_type = CODING_ATTR_TYPE (attrs);
5636   if (EQ (coding_type, Qundecided))
5637     {
5638       coding->detector = NULL;
5639       coding->decoder = decode_coding_raw_text;
5640       coding->encoder = encode_coding_raw_text;
5641       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5642     }
5643   else if (EQ (coding_type, Qiso_2022))
5644     {
5645       int i;
5646       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5647
5648       /* Invoke graphic register 0 to plane 0.  */
5649       CODING_ISO_INVOCATION (coding, 0) = 0;
5650       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5651       CODING_ISO_INVOCATION (coding, 1)
5652         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5653       /* Setup the initial status of designation.  */
5654       for (i = 0; i < 4; i++)
5655         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5656       /* Not single shifting initially.  */
5657       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5658       /* Beginning of buffer should also be regarded as bol. */
5659       CODING_ISO_BOL (coding) = 1;
5660       coding->detector = detect_coding_iso_2022;
5661       coding->decoder = decode_coding_iso_2022;
5662       coding->encoder = encode_coding_iso_2022;
5663       if (flags & CODING_ISO_FLAG_SAFE)
5664         coding->mode |= CODING_MODE_SAFE_ENCODING;
5665       coding->common_flags
5666         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5667             | CODING_REQUIRE_FLUSHING_MASK);
5668       if (flags & CODING_ISO_FLAG_COMPOSITION)
5669         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5670       if (flags & CODING_ISO_FLAG_DESIGNATION)
5671         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5672       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5673         {
5674           setup_iso_safe_charsets (attrs);
5675           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5676           coding->max_charset_id = SCHARS (val) - 1;
5677           coding->safe_charsets = SDATA (val);
5678         }
5679       CODING_ISO_FLAGS (coding) = flags;
5680       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5681       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5682       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5683       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5684     }
5685   else if (EQ (coding_type, Qcharset))
5686     {
5687       coding->detector = detect_coding_charset;
5688       coding->decoder = decode_coding_charset;
5689       coding->encoder = encode_coding_charset;
5690       coding->common_flags
5691         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5692     }
5693   else if (EQ (coding_type, Qutf_8))
5694     {
5695       val = AREF (attrs, coding_attr_utf_bom);
5696       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5697                                    : EQ (val, Qt) ? utf_with_bom
5698                                    : utf_without_bom);
5699       coding->detector = detect_coding_utf_8;
5700       coding->decoder = decode_coding_utf_8;
5701       coding->encoder = encode_coding_utf_8;
5702       coding->common_flags
5703         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5704       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5705         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5706     }
5707   else if (EQ (coding_type, Qutf_16))
5708     {
5709       val = AREF (attrs, coding_attr_utf_bom);
5710       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5711                                     : EQ (val, Qt) ? utf_with_bom
5712                                     : utf_without_bom);
5713       val = AREF (attrs, coding_attr_utf_16_endian);
5714       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5715                                        : utf_16_little_endian);
5716       CODING_UTF_16_SURROGATE (coding) = 0;
5717       coding->detector = detect_coding_utf_16;
5718       coding->decoder = decode_coding_utf_16;
5719       coding->encoder = encode_coding_utf_16;
5720       coding->common_flags
5721         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5722       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5723         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5724     }
5725   else if (EQ (coding_type, Qccl))
5726     {
5727       coding->detector = detect_coding_ccl;
5728       coding->decoder = decode_coding_ccl;
5729       coding->encoder = encode_coding_ccl;
5730       coding->common_flags
5731         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5732             | CODING_REQUIRE_FLUSHING_MASK);
5733     }
5734   else if (EQ (coding_type, Qemacs_mule))
5735     {
5736       coding->detector = detect_coding_emacs_mule;
5737       coding->decoder = decode_coding_emacs_mule;
5738       coding->encoder = encode_coding_emacs_mule;
5739       coding->common_flags
5740         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5741       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5742           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5743         {
5744           Lisp_Object tail, safe_charsets;
5745           int max_charset_id = 0;
5746
5747           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5748                tail = XCDR (tail))
5749             if (max_charset_id < XFASTINT (XCAR (tail)))
5750               max_charset_id = XFASTINT (XCAR (tail));
5751           safe_charsets = make_uninit_string (max_charset_id + 1);
5752           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5753           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5754                tail = XCDR (tail))
5755             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5756           coding->max_charset_id = max_charset_id;
5757           coding->safe_charsets = SDATA (safe_charsets);
5758         }
5759       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5760       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5761     }
5762   else if (EQ (coding_type, Qshift_jis))
5763     {
5764       coding->detector = detect_coding_sjis;
5765       coding->decoder = decode_coding_sjis;
5766       coding->encoder = encode_coding_sjis;
5767       coding->common_flags
5768         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5769     }
5770   else if (EQ (coding_type, Qbig5))
5771     {
5772       coding->detector = detect_coding_big5;
5773       coding->decoder = decode_coding_big5;
5774       coding->encoder = encode_coding_big5;
5775       coding->common_flags
5776         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5777     }
5778   else                          /* EQ (coding_type, Qraw_text) */
5779     {
5780       coding->detector = NULL;
5781       coding->decoder = decode_coding_raw_text;
5782       coding->encoder = encode_coding_raw_text;
5783       if (! EQ (eol_type, Qunix))
5784         {
5785           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5786           if (! VECTORP (eol_type))
5787             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5788         }
5789
5790     }
5791
5792   return;
5793 }
5794
5795 /* Return a list of charsets supported by CODING.  */
5796
5797 Lisp_Object
5798 coding_charset_list (struct coding_system *coding)
5799 {
5800   Lisp_Object attrs, charset_list;
5801
5802   CODING_GET_INFO (coding, attrs, charset_list);
5803   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5804     {
5805       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5806
5807       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5808         charset_list = Viso_2022_charset_list;
5809     }
5810   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5811     {
5812       charset_list = Vemacs_mule_charset_list;
5813     }
5814   return charset_list;
5815 }
5816
5817
5818 /* Return a list of charsets supported by CODING-SYSTEM.  */
5819
5820 Lisp_Object
5821 coding_system_charset_list (Lisp_Object coding_system)
5822 {
5823   ptrdiff_t id;
5824   Lisp_Object attrs, charset_list;
5825
5826   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5827   attrs = CODING_ID_ATTRS (id);
5828
5829   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5830     {
5831       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5832
5833       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5834         charset_list = Viso_2022_charset_list;
5835       else
5836         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5837     }
5838   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5839     {
5840       charset_list = Vemacs_mule_charset_list;
5841     }
5842   else
5843     {
5844       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5845     }
5846   return charset_list;
5847 }
5848
5849
5850 /* Return raw-text or one of its subsidiaries that has the same
5851    eol_type as CODING-SYSTEM.  */
5852
5853 Lisp_Object
5854 raw_text_coding_system (Lisp_Object coding_system)
5855 {
5856   Lisp_Object spec, attrs;
5857   Lisp_Object eol_type, raw_text_eol_type;
5858
5859   if (NILP (coding_system))
5860     return Qraw_text;
5861   spec = CODING_SYSTEM_SPEC (coding_system);
5862   attrs = AREF (spec, 0);
5863
5864   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5865     return coding_system;
5866
5867   eol_type = AREF (spec, 2);
5868   if (VECTORP (eol_type))
5869     return Qraw_text;
5870   spec = CODING_SYSTEM_SPEC (Qraw_text);
5871   raw_text_eol_type = AREF (spec, 2);
5872   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5873           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5874           : AREF (raw_text_eol_type, 2));
5875 }
5876
5877
5878 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5879    the subsidiary that has the same eol-spec as PARENT (if it is not
5880    nil and specifies end-of-line format) or the system's setting
5881    (system_eol_type).  */
5882
5883 Lisp_Object
5884 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5885 {
5886   Lisp_Object spec, eol_type;
5887
5888   if (NILP (coding_system))
5889     coding_system = Qraw_text;
5890   spec = CODING_SYSTEM_SPEC (coding_system);
5891   eol_type = AREF (spec, 2);
5892   if (VECTORP (eol_type))
5893     {
5894       Lisp_Object parent_eol_type;
5895
5896       if (! NILP (parent))
5897         {
5898           Lisp_Object parent_spec;
5899
5900           parent_spec = CODING_SYSTEM_SPEC (parent);
5901           parent_eol_type = AREF (parent_spec, 2);
5902           if (VECTORP (parent_eol_type))
5903             parent_eol_type = system_eol_type;
5904         }
5905       else
5906         parent_eol_type = system_eol_type;
5907       if (EQ (parent_eol_type, Qunix))
5908         coding_system = AREF (eol_type, 0);
5909       else if (EQ (parent_eol_type, Qdos))
5910         coding_system = AREF (eol_type, 1);
5911       else if (EQ (parent_eol_type, Qmac))
5912         coding_system = AREF (eol_type, 2);
5913     }
5914   return coding_system;
5915 }
5916
5917
5918 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5919    decided for writing to a process.  If not, complement them, and
5920    return a new coding system.  */
5921
5922 Lisp_Object
5923 complement_process_encoding_system (Lisp_Object coding_system)
5924 {
5925   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5926   Lisp_Object spec, attrs;
5927   int i;
5928
5929   for (i = 0; i < 3; i++)
5930     {
5931       if (i == 1)
5932         coding_system = CDR_SAFE (Vdefault_process_coding_system);
5933       else if (i == 2)
5934         coding_system = preferred_coding_system ();
5935       spec = CODING_SYSTEM_SPEC (coding_system);
5936       if (NILP (spec))
5937         continue;
5938       attrs = AREF (spec, 0);
5939       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
5940         coding_base = CODING_ATTR_BASE_NAME (attrs);
5941       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
5942         eol_base = coding_system;
5943       if (! NILP (coding_base) && ! NILP (eol_base))
5944         break;
5945     }
5946
5947   if (i > 0)
5948     /* The original CODING_SYSTEM didn't specify text-conversion or
5949        eol-conversion.  Be sure that we return a fully complemented
5950        coding system.  */
5951     coding_system = coding_inherit_eol_type (coding_base, eol_base);
5952   return coding_system;
5953 }
5954
5955
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the 14 categories listed below:
5961
5962    o coding-category-emacs-mule
5963
5964         The category for a coding system which has the same code range
5965         as Emacs' internal format.  Assigned the coding-system (Lisp
5966         symbol) `emacs-mule' by default.
5967
5968    o coding-category-sjis
5969
5970         The category for a coding system which has the same code range
5971         as SJIS.  Assigned the coding-system (Lisp
5972         symbol) `japanese-shift-jis' by default.
5973
5974    o coding-category-iso-7
5975
5976         The category for a coding system which has the same code range
5977         as ISO2022 of 7-bit environment.  This doesn't use any locking
5978         shift and single shift functions.  This can encode/decode all
5979         charsets.  Assigned the coding-system (Lisp symbol)
5980         `iso-2022-7bit' by default.
5981
5982    o coding-category-iso-7-tight
5983
5984         Same as coding-category-iso-7 except that this can
5985         encode/decode only the specified charsets.
5986
5987    o coding-category-iso-8-1
5988
5989         The category for a coding system which has the same code range
5990         as ISO2022 of 8-bit environment and graphic plane 1 used only
5991         for DIMENSION1 charset.  This doesn't use any locking shift
5992         and single shift functions.  Assigned the coding-system (Lisp
5993         symbol) `iso-latin-1' by default.
5994
5995    o coding-category-iso-8-2
5996
5997         The category for a coding system which has the same code range
5998         as ISO2022 of 8-bit environment and graphic plane 1 used only
5999         for DIMENSION2 charset.  This doesn't use any locking shift
6000         and single shift functions.  Assigned the coding-system (Lisp
6001         symbol) `japanese-iso-8bit' by default.
6002
6003    o coding-category-iso-7-else
6004
6005         The category for a coding system which has the same code range
6006         as ISO2022 of 7-bit environment but uses locking shift or
6007         single shift functions.  Assigned the coding-system (Lisp
6008         symbol) `iso-2022-7bit-lock' by default.
6009
6010    o coding-category-iso-8-else
6011
6012         The category for a coding system which has the same code range
6013         as ISO2022 of 8-bit environment but uses locking shift or
6014         single shift functions.  Assigned the coding-system (Lisp
6015         symbol) `iso-2022-8bit-ss2' by default.
6016
6017    o coding-category-big5
6018
6019         The category for a coding system which has the same code range
6020         as BIG5.  Assigned the coding-system (Lisp symbol)
6021         `cn-big5' by default.
6022
6023    o coding-category-utf-8
6024
6025         The category for a coding system which has the same code range
6026         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6027         symbol) `utf-8' by default.
6028
6029    o coding-category-utf-16-be
6030
6031         The category for a coding system in which a text has an
6032         Unicode signature (cf. Unicode Standard) in the order of BIG
6033         endian at the head.  Assigned the coding-system (Lisp symbol)
6034         `utf-16-be' by default.
6035
6036    o coding-category-utf-16-le
6037
6038         The category for a coding system in which a text has an
6039         Unicode signature (cf. Unicode Standard) in the order of
6040         LITTLE endian at the head.  Assigned the coding-system (Lisp
6041         symbol) `utf-16-le' by default.
6042
6043    o coding-category-ccl
6044
6045         The category for a coding system of which encoder/decoder is
6046         written in CCL programs.  The default value is nil, i.e., no
6047         coding system is assigned.
6048
6049    o coding-category-binary
6050
6051         The category for a coding system not categorized in any of the
6052         above.  Assigned the coding-system (Lisp symbol)
6053         `no-conversion' by default.
6054
   Each category is a Lisp symbol whose value is the actual
   `coding-system' (also a Lisp symbol) assigned to it by the user.
   What Emacs actually detects is the category of a coding system; it
   then uses the `coding-system' assigned to that category.  If Emacs
   cannot narrow the candidates down to a single category, it selects
   the category with the highest priority.  The priorities of the
   categories are also specified by the user, in the Lisp variable
   `coding-category-list'.
6062
6063 */
6064
/* Results of end-of-line detection, as returned by detect_eol.  The
   non-zero values are distinct bits, and adjust_coding_eol_type tests
   them with `&'.  */
#define EOL_SEEN_NONE   0       /* No end-of-line has been seen.  */
#define EOL_SEEN_LF     1       /* LF alone.  */
#define EOL_SEEN_CR     2       /* CR not followed by LF.  */
#define EOL_SEEN_CRLF   4       /* CR immediately followed by LF.  */
6069
6070 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6071    SOURCE is encoded.  If CATEGORY is one of
6072    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6073    two-byte, else they are encoded by one-byte.
6074
6075    Return one of EOL_SEEN_XXX.  */
6076
6077 #define MAX_EOL_CHECK_COUNT 3
6078
6079 static int
6080 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6081             enum coding_category category)
6082 {
6083   const unsigned char *src = source, *src_end = src + src_bytes;
6084   unsigned char c;
6085   int total  = 0;
6086   int eol_seen = EOL_SEEN_NONE;
6087
6088   if ((1 << category) & CATEGORY_MASK_UTF_16)
6089     {
6090       bool msb = category == (coding_category_utf_16_le
6091                               | coding_category_utf_16_le_nosig);
6092       bool lsb = !msb;
6093
6094       while (src + 1 < src_end)
6095         {
6096           c = src[lsb];
6097           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6098             {
6099               int this_eol;
6100
6101               if (c == '\n')
6102                 this_eol = EOL_SEEN_LF;
6103               else if (src + 3 >= src_end
6104                        || src[msb + 2] != 0
6105                        || src[lsb + 2] != '\n')
6106                 this_eol = EOL_SEEN_CR;
6107               else
6108                 {
6109                   this_eol = EOL_SEEN_CRLF;
6110                   src += 2;
6111                 }
6112
6113               if (eol_seen == EOL_SEEN_NONE)
6114                 /* This is the first end-of-line.  */
6115                 eol_seen = this_eol;
6116               else if (eol_seen != this_eol)
6117                 {
6118                   /* The found type is different from what found before.
6119                      Allow for stray ^M characters in DOS EOL files.  */
6120                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6121                       || (eol_seen == EOL_SEEN_CRLF
6122                           && this_eol == EOL_SEEN_CR))
6123                     eol_seen = EOL_SEEN_CRLF;
6124                   else
6125                     {
6126                       eol_seen = EOL_SEEN_LF;
6127                       break;
6128                     }
6129                 }
6130               if (++total == MAX_EOL_CHECK_COUNT)
6131                 break;
6132             }
6133           src += 2;
6134         }
6135     }
6136   else
6137     while (src < src_end)
6138       {
6139         c = *src++;
6140         if (c == '\n' || c == '\r')
6141           {
6142             int this_eol;
6143
6144             if (c == '\n')
6145               this_eol = EOL_SEEN_LF;
6146             else if (src >= src_end || *src != '\n')
6147               this_eol = EOL_SEEN_CR;
6148             else
6149               this_eol = EOL_SEEN_CRLF, src++;
6150
6151             if (eol_seen == EOL_SEEN_NONE)
6152               /* This is the first end-of-line.  */
6153               eol_seen = this_eol;
6154             else if (eol_seen != this_eol)
6155               {
6156                 /* The found type is different from what found before.
6157                    Allow for stray ^M characters in DOS EOL files.  */
6158                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6159                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6160                   eol_seen = EOL_SEEN_CRLF;
6161                 else
6162                   {
6163                     eol_seen = EOL_SEEN_LF;
6164                     break;
6165                   }
6166               }
6167             if (++total == MAX_EOL_CHECK_COUNT)
6168               break;
6169           }
6170       }
6171   return eol_seen;
6172 }
6173
6174
6175 static Lisp_Object
6176 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6177 {
6178   Lisp_Object eol_type;
6179
6180   eol_type = CODING_ID_EOL_TYPE (coding->id);
6181   if (eol_seen & EOL_SEEN_LF)
6182     {
6183       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6184       eol_type = Qunix;
6185     }
6186   else if (eol_seen & EOL_SEEN_CRLF)
6187     {
6188       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6189       eol_type = Qdos;
6190     }
6191   else if (eol_seen & EOL_SEEN_CR)
6192     {
6193       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6194       eol_type = Qmac;
6195     }
6196   return eol_type;
6197 }
6198
6199 /* Detect how a text specified in CODING is encoded.  If a coding
6200    system is detected, update fields of CODING by the detected coding
6201    system.  */
6202
6203 static void
6204 detect_coding (struct coding_system *coding)
6205 {
6206   const unsigned char *src, *src_end;
6207   unsigned int saved_mode = coding->mode;
6208
6209   coding->consumed = coding->consumed_char = 0;
6210   coding->produced = coding->produced_char = 0;
6211   coding_set_source (coding);
6212
6213   src_end = coding->source + coding->src_bytes;
6214   coding->head_ascii = 0;
6215
6216   /* If we have not yet decided the text encoding type, detect it
6217      now.  */
6218   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6219     {
6220       int c, i;
6221       struct coding_detection_info detect_info;
6222       bool null_byte_found = 0, eight_bit_found = 0;
6223
6224       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6225       for (src = coding->source; src < src_end; src++)
6226         {
6227           c = *src;
6228           if (c & 0x80)
6229             {
6230               eight_bit_found = 1;
6231               if (null_byte_found)
6232                 break;
6233             }
6234           else if (c < 0x20)
6235             {
6236               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6237                   && ! inhibit_iso_escape_detection
6238                   && ! detect_info.checked)
6239                 {
6240                   if (detect_coding_iso_2022 (coding, &detect_info))
6241                     {
6242                       /* We have scanned the whole data.  */
6243                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6244                         {
6245                           /* We didn't find an 8-bit code.  We may
6246                              have found a null-byte, but it's very
6247                              rare that a binary file conforms to
6248                              ISO-2022.  */
6249                           src = src_end;
6250                           coding->head_ascii = src - coding->source;
6251                         }
6252                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6253                       break;
6254                     }
6255                 }
6256               else if (! c && !inhibit_null_byte_detection)
6257                 {
6258                   null_byte_found = 1;
6259                   if (eight_bit_found)
6260                     break;
6261                 }
6262               if (! eight_bit_found)
6263                 coding->head_ascii++;
6264             }
6265           else if (! eight_bit_found)
6266             coding->head_ascii++;
6267         }
6268
6269       if (null_byte_found || eight_bit_found
6270           || coding->head_ascii < coding->src_bytes
6271           || detect_info.found)
6272         {
6273           enum coding_category category;
6274           struct coding_system *this;
6275
6276           if (coding->head_ascii == coding->src_bytes)
6277             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6278             for (i = 0; i < coding_category_raw_text; i++)
6279               {
6280                 category = coding_priorities[i];
6281                 this = coding_categories + category;
6282                 if (detect_info.found & (1 << category))
6283                   break;
6284               }
6285           else
6286             {
6287               if (null_byte_found)
6288                 {
6289                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6290                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6291                 }
6292               for (i = 0; i < coding_category_raw_text; i++)
6293                 {
6294                   category = coding_priorities[i];
6295                   this = coding_categories + category;
6296                   /* Some of this->detector (e.g. detect_coding_sjis)
6297                      require this information.  */
6298                   coding->id = this->id;
6299                   if (this->id < 0)
6300                     {
6301                       /* No coding system of this category is defined.  */
6302                       detect_info.rejected |= (1 << category);
6303                     }
6304                   else if (category >= coding_category_raw_text)
6305                     continue;
6306                   else if (detect_info.checked & (1 << category))
6307                     {
6308                       if (detect_info.found & (1 << category))
6309                         break;
6310                     }
6311                   else if ((*(this->detector)) (coding, &detect_info)
6312                            && detect_info.found & (1 << category))
6313                     {
6314                       if (category == coding_category_utf_16_auto)
6315                         {
6316                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6317                             category = coding_category_utf_16_le;
6318                           else
6319                             category = coding_category_utf_16_be;
6320                         }
6321                       break;
6322                     }
6323                 }
6324             }
6325
6326           if (i < coding_category_raw_text)
6327             setup_coding_system (CODING_ID_NAME (this->id), coding);
6328           else if (null_byte_found)
6329             setup_coding_system (Qno_conversion, coding);
6330           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6331                    == CATEGORY_MASK_ANY)
6332             setup_coding_system (Qraw_text, coding);
6333           else if (detect_info.rejected)
6334             for (i = 0; i < coding_category_raw_text; i++)
6335               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6336                 {
6337                   this = coding_categories + coding_priorities[i];
6338                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6339                   break;
6340                 }
6341         }
6342     }
6343   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6344            == coding_category_utf_8_auto)
6345     {
6346       Lisp_Object coding_systems;
6347       struct coding_detection_info detect_info;
6348
6349       coding_systems
6350         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6351       detect_info.found = detect_info.rejected = 0;
6352       for (src = coding->source; src < src_end; src++)
6353         {
6354           if (*src & 0x80)
6355             break;
6356         }
6357       coding->head_ascii = src - coding->source;
6358       if (CONSP (coding_systems)
6359           && detect_coding_utf_8 (coding, &detect_info))
6360         {
6361           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6362             setup_coding_system (XCAR (coding_systems), coding);
6363           else
6364             setup_coding_system (XCDR (coding_systems), coding);
6365         }
6366     }
6367   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6368            == coding_category_utf_16_auto)
6369     {
6370       Lisp_Object coding_systems;
6371       struct coding_detection_info detect_info;
6372
6373       coding_systems
6374         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6375       detect_info.found = detect_info.rejected = 0;
6376       coding->head_ascii = 0;
6377       if (CONSP (coding_systems)
6378           && detect_coding_utf_16 (coding, &detect_info))
6379         {
6380           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6381             setup_coding_system (XCAR (coding_systems), coding);
6382           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6383             setup_coding_system (XCDR (coding_systems), coding);
6384         }
6385     }
6386   coding->mode = saved_mode;
6387 }
6388
6389
6390 static void
6391 decode_eol (struct coding_system *coding)
6392 {
6393   Lisp_Object eol_type;
6394   unsigned char *p, *pbeg, *pend;
6395
6396   eol_type = CODING_ID_EOL_TYPE (coding->id);
6397   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6398     return;
6399
6400   if (NILP (coding->dst_object))
6401     pbeg = coding->destination;
6402   else
6403     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6404   pend = pbeg + coding->produced;
6405
6406   if (VECTORP (eol_type))
6407     {
6408       int eol_seen = EOL_SEEN_NONE;
6409
6410       for (p = pbeg; p < pend; p++)
6411         {
6412           if (*p == '\n')
6413             eol_seen |= EOL_SEEN_LF;
6414           else if (*p == '\r')
6415             {
6416               if (p + 1 < pend && *(p + 1) == '\n')
6417                 {
6418                   eol_seen |= EOL_SEEN_CRLF;
6419                   p++;
6420                 }
6421               else
6422                 eol_seen |= EOL_SEEN_CR;
6423             }
6424         }
6425       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6426       if ((eol_seen & EOL_SEEN_CRLF) != 0
6427           && (eol_seen & EOL_SEEN_CR) != 0
6428           && (eol_seen & EOL_SEEN_LF) == 0)
6429         eol_seen = EOL_SEEN_CRLF;
6430       else if (eol_seen != EOL_SEEN_NONE
6431           && eol_seen != EOL_SEEN_LF
6432           && eol_seen != EOL_SEEN_CRLF
6433           && eol_seen != EOL_SEEN_CR)
6434         eol_seen = EOL_SEEN_LF;
6435       if (eol_seen != EOL_SEEN_NONE)
6436         eol_type = adjust_coding_eol_type (coding, eol_seen);
6437     }
6438
6439   if (EQ (eol_type, Qmac))
6440     {
6441       for (p = pbeg; p < pend; p++)
6442         if (*p == '\r')
6443           *p = '\n';
6444     }
6445   else if (EQ (eol_type, Qdos))
6446     {
6447       ptrdiff_t n = 0;
6448
6449       if (NILP (coding->dst_object))
6450         {
6451           /* Start deleting '\r' from the tail to minimize the memory
6452              movement.  */
6453           for (p = pend - 2; p >= pbeg; p--)
6454             if (*p == '\r')
6455               {
6456                 memmove (p, p + 1, pend-- - p - 1);
6457                 n++;
6458               }
6459         }
6460       else
6461         {
6462           ptrdiff_t pos_byte = coding->dst_pos_byte;
6463           ptrdiff_t pos = coding->dst_pos;
6464           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6465
6466           while (pos < pos_end)
6467             {
6468               p = BYTE_POS_ADDR (pos_byte);
6469               if (*p == '\r' && p[1] == '\n')
6470                 {
6471                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6472                   n++;
6473                   pos_end--;
6474                 }
6475               pos++;
6476               if (coding->dst_multibyte)
6477                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6478               else
6479                 pos_byte++;
6480             }
6481         }
6482       coding->produced -= n;
6483       coding->produced_char -= n;
6484     }
6485 }
6486
6487
6488 /* Return a translation table (or list of them) from coding system
6489    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6490    not ENCODEP). */
6491
6492 static Lisp_Object
6493 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6494 {
6495   Lisp_Object standard, translation_table;
6496   Lisp_Object val;
6497
6498   if (NILP (Venable_character_translation))
6499     {
6500       if (max_lookup)
6501         *max_lookup = 0;
6502       return Qnil;
6503     }
6504   if (encodep)
6505     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6506       standard = Vstandard_translation_table_for_encode;
6507   else
6508     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6509       standard = Vstandard_translation_table_for_decode;
6510   if (NILP (translation_table))
6511     translation_table = standard;
6512   else
6513     {
6514       if (SYMBOLP (translation_table))
6515         translation_table = Fget (translation_table, Qtranslation_table);
6516       else if (CONSP (translation_table))
6517         {
6518           translation_table = Fcopy_sequence (translation_table);
6519           for (val = translation_table; CONSP (val); val = XCDR (val))
6520             if (SYMBOLP (XCAR (val)))
6521               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6522         }
6523       if (CHAR_TABLE_P (standard))
6524         {
6525           if (CONSP (translation_table))
6526             translation_table = nconc2 (translation_table,
6527                                         Fcons (standard, Qnil));
6528           else
6529             translation_table = Fcons (translation_table,
6530                                        Fcons (standard, Qnil));
6531         }
6532     }
6533
6534   if (max_lookup)
6535     {
6536       *max_lookup = 1;
6537       if (CHAR_TABLE_P (translation_table)
6538           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6539         {
6540           val = XCHAR_TABLE (translation_table)->extras[1];
6541           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6542             *max_lookup = XFASTINT (val);
6543         }
6544       else if (CONSP (translation_table))
6545         {
6546           Lisp_Object tail;
6547
6548           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6549             if (CHAR_TABLE_P (XCAR (tail))
6550                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6551               {
6552                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6553                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6554                   *max_lookup = XFASTINT (tailval);
6555               }
6556         }
6557     }
6558   return translation_table;
6559 }
6560
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and leave the outcome in C and TRANS.

   When an entry is itself a character, C is replaced by it and TRANS
   is set to Qnil; with a list, the lookup then continues in the next
   table using the new C.  When an entry is some other non-nil object,
   TRANS is set to that object and, with a list, the lookup stops.
   Otherwise TRANS ends up Qnil and C is unchanged.

   C and TRANS must be lvalues; every argument may be evaluated more
   than once.  The loop variable has a name that a caller is unlikely
   to use, so that an argument spelled `tail' is not captured.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (table, c);                            \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);              \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), c);        \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6585
6586
6587 /* Return a translation of character(s) at BUF according to TRANS.
6588    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6589    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6590    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6591    translation is found, and Qnil if not found..
6592    If BUF is too short to lookup characters in FROM, return Qt.  */
6593
6594 static Lisp_Object
6595 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6596 {
6597
6598   if (INTEGERP (trans))
6599     return trans;
6600   for (; CONSP (trans); trans = XCDR (trans))
6601     {
6602       Lisp_Object val = XCAR (trans);
6603       Lisp_Object from = XCAR (val);
6604       ptrdiff_t len = ASIZE (from);
6605       ptrdiff_t i;
6606
6607       for (i = 0; i < len; i++)
6608         {
6609           if (buf + i == buf_end)
6610             return Qt;
6611           if (XINT (AREF (from, i)) != buf[i])
6612             break;
6613         }
6614       if (i == len)
6615         return val;
6616     }
6617   return Qnil;
6618 }
6619
6620
6621 static int
6622 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6623                bool last_block)
6624 {
6625   unsigned char *dst = coding->destination + coding->produced;
6626   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6627   ptrdiff_t produced;
6628   ptrdiff_t produced_chars = 0;
6629   int carryover = 0;
6630
6631   if (! coding->chars_at_source)
6632     {
6633       /* Source characters are in coding->charbuf.  */
6634       int *buf = coding->charbuf;
6635       int *buf_end = buf + coding->charbuf_used;
6636
6637       if (EQ (coding->src_object, coding->dst_object))
6638         {
6639           coding_set_source (coding);
6640           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6641         }
6642
6643       while (buf < buf_end)
6644         {
6645           int c = *buf;
6646           ptrdiff_t i;
6647
6648           if (c >= 0)
6649             {
6650               ptrdiff_t from_nchars = 1, to_nchars = 1;
6651               Lisp_Object trans = Qnil;
6652
6653               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6654               if (! NILP (trans))
6655                 {
6656                   trans = get_translation (trans, buf, buf_end);
6657                   if (INTEGERP (trans))
6658                     c = XINT (trans);
6659                   else if (CONSP (trans))
6660                     {
6661                       from_nchars = ASIZE (XCAR (trans));
6662                       trans = XCDR (trans);
6663                       if (INTEGERP (trans))
6664                         c = XINT (trans);
6665                       else
6666                         {
6667                           to_nchars = ASIZE (trans);
6668                           c = XINT (AREF (trans, 0));
6669                         }
6670                     }
6671                   else if (EQ (trans, Qt) && ! last_block)
6672                     break;
6673                 }
6674
6675               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
6676                 {
6677                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
6678                        / MAX_MULTIBYTE_LENGTH)
6679                       < to_nchars)
6680                     memory_full (SIZE_MAX);
6681                   dst = alloc_destination (coding,
6682                                            buf_end - buf
6683                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6684                                            dst);
6685                   if (EQ (coding->src_object, coding->dst_object))
6686                     {
6687                       coding_set_source (coding);
6688                       dst_end = (((unsigned char *) coding->source)
6689                                  + coding->consumed);
6690                     }
6691                   else
6692                     dst_end = coding->destination + coding->dst_bytes;
6693                 }
6694
6695               for (i = 0; i < to_nchars; i++)
6696                 {
6697                   if (i > 0)
6698                     c = XINT (AREF (trans, i));
6699                   if (coding->dst_multibyte
6700                       || ! CHAR_BYTE8_P (c))
6701                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6702                   else
6703                     *dst++ = CHAR_TO_BYTE8 (c);
6704                 }
6705               produced_chars += to_nchars;
6706               buf += from_nchars;
6707             }
6708           else
6709             /* This is an annotation datum.  (-C) is the length.  */
6710             buf += -c;
6711         }
6712       carryover = buf_end - buf;
6713     }
6714   else
6715     {
6716       /* Source characters are at coding->source.  */
6717       const unsigned char *src = coding->source;
6718       const unsigned char *src_end = src + coding->consumed;
6719
6720       if (EQ (coding->dst_object, coding->src_object))
6721         dst_end = (unsigned char *) src;
6722       if (coding->src_multibyte != coding->dst_multibyte)
6723         {
6724           if (coding->src_multibyte)
6725             {
6726               bool multibytep = 1;
6727               ptrdiff_t consumed_chars = 0;
6728
6729               while (1)
6730                 {
6731                   const unsigned char *src_base = src;
6732                   int c;
6733
6734                   ONE_MORE_BYTE (c);
6735                   if (dst == dst_end)
6736                     {
6737                       if (EQ (coding->src_object, coding->dst_object))
6738                         dst_end = (unsigned char *) src;
6739                       if (dst == dst_end)
6740                         {
6741                           ptrdiff_t offset = src - coding->source;
6742
6743                           dst = alloc_destination (coding, src_end - src + 1,
6744                                                    dst);
6745                           dst_end = coding->destination + coding->dst_bytes;
6746                           coding_set_source (coding);
6747                           src = coding->source + offset;
6748                           src_end = coding->source + coding->consumed;
6749                           if (EQ (coding->src_object, coding->dst_object))
6750                             dst_end = (unsigned char *) src;
6751                         }
6752                     }
6753                   *dst++ = c;
6754                   produced_chars++;
6755                 }
6756             no_more_source:
6757               ;
6758             }
6759           else
6760             while (src < src_end)
6761               {
6762                 bool multibytep = 1;
6763                 int c = *src++;
6764
6765                 if (dst >= dst_end - 1)
6766                   {
6767                     if (EQ (coding->src_object, coding->dst_object))
6768                       dst_end = (unsigned char *) src;
6769                     if (dst >= dst_end - 1)
6770                       {
6771                         ptrdiff_t offset = src - coding->source;
6772                         ptrdiff_t more_bytes;
6773
6774                         if (EQ (coding->src_object, coding->dst_object))
6775                           more_bytes = ((src_end - src) / 2) + 2;
6776                         else
6777                           more_bytes = src_end - src + 2;
6778                         dst = alloc_destination (coding, more_bytes, dst);
6779                         dst_end = coding->destination + coding->dst_bytes;
6780                         coding_set_source (coding);
6781                         src = coding->source + offset;
6782                         src_end = coding->source + coding->consumed;
6783                         if (EQ (coding->src_object, coding->dst_object))
6784                           dst_end = (unsigned char *) src;
6785                       }
6786                   }
6787                 EMIT_ONE_BYTE (c);
6788               }
6789         }
6790       else
6791         {
6792           if (!EQ (coding->src_object, coding->dst_object))
6793             {
6794               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
6795
6796               if (require > 0)
6797                 {
6798                   ptrdiff_t offset = src - coding->source;
6799
6800                   dst = alloc_destination (coding, require, dst);
6801                   coding_set_source (coding);
6802                   src = coding->source + offset;
6803                   src_end = coding->source + coding->consumed;
6804                 }
6805             }
6806           produced_chars = coding->consumed_char;
6807           while (src < src_end)
6808             *dst++ = *src++;
6809         }
6810     }
6811
6812   produced = dst - (coding->destination + coding->produced);
6813   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6814     insert_from_gap (produced_chars, produced);
6815   coding->produced += produced;
6816   coding->produced_char += produced_chars;
6817   return carryover;
6818 }
6819
6820 /* Compose text in CODING->object according to the annotation data at
6821    CHARBUF.  CHARBUF is an array:
6822      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6823  */
6824
6825 static void
6826 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6827 {
6828   int len;
6829   ptrdiff_t to;
6830   enum composition_method method;
6831   Lisp_Object components;
6832
6833   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6834   to = pos + charbuf[2];
6835   method = (enum composition_method) (charbuf[4]);
6836
6837   if (method == COMPOSITION_RELATIVE)
6838     components = Qnil;
6839   else
6840     {
6841       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6842       int i, j;
6843
6844       if (method == COMPOSITION_WITH_RULE)
6845         len = charbuf[2] * 3 - 2;
6846       charbuf += MAX_ANNOTATION_LENGTH;
6847       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6848       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6849         {
6850           if (charbuf[i] >= 0)
6851             args[j] = make_number (charbuf[i]);
6852           else
6853             {
6854               i++;
6855               args[j] = make_number (charbuf[i] % 0x100);
6856             }
6857         }
6858       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6859     }
6860   compose_text (pos, to, components, Qnil, coding->dst_object);
6861 }
6862
6863
6864 /* Put `charset' property on text in CODING->object according to
6865    the annotation data at CHARBUF.  CHARBUF is an array:
6866      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6867  */
6868
6869 static void
6870 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
6871 {
6872   ptrdiff_t from = pos - charbuf[2];
6873   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6874
6875   Fput_text_property (make_number (from), make_number (pos),
6876                       Qcharset, CHARSET_NAME (charset),
6877                       coding->dst_object);
6878 }
6879
6880
/* Number of `int' elements in the character work area of a coding
   system.  */
#define CHARBUF_SIZE 0x4000

/* Give CODING (a pointer to a struct coding_system) a work area of
   CHARBUF_SIZE ints and record that size in CODING->charbuf_size.
   The memory comes from SAFE_ALLOCA, so the enclosing function has to
   use USE_SAFE_ALLOCA and SAFE_FREE, as decode_coding does.  CODING
   is evaluated twice.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));      \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
6888
6889
6890 static void
6891 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
6892 {
6893   int *charbuf = coding->charbuf;
6894   int *charbuf_end = charbuf + coding->charbuf_used;
6895
6896   if (NILP (coding->dst_object))
6897     return;
6898
6899   while (charbuf < charbuf_end)
6900     {
6901       if (*charbuf >= 0)
6902         pos++, charbuf++;
6903       else
6904         {
6905           int len = -*charbuf;
6906
6907           if (len > 2)
6908             switch (charbuf[1])
6909               {
6910               case CODING_ANNOTATE_COMPOSITION_MASK:
6911                 produce_composition (coding, charbuf, pos);
6912                 break;
6913               case CODING_ANNOTATE_CHARSET_MASK:
6914                 produce_charset (coding, charbuf, pos);
6915                 break;
6916               }
6917           charbuf += len;
6918         }
6919     }
6920 }
6921
6922 /* Decode the data at CODING->src_object into CODING->dst_object.
6923    CODING->src_object is a buffer, a string, or nil.
6924    CODING->dst_object is a buffer.
6925
6926    If CODING->src_object is a buffer, it must be the current buffer.
6927    In this case, if CODING->src_pos is positive, it is a position of
6928    the source text in the buffer, otherwise, the source text is in the
6929    gap area of the buffer, and CODING->src_pos specifies the offset of
6930    the text from GPT (which must be the same as PT).  If this is the
6931    same buffer as CODING->dst_object, CODING->src_pos must be
6932    negative.
6933
6934    If CODING->src_object is a string, CODING->src_pos is an index to
6935    that string.
6936
6937    If CODING->src_object is nil, CODING->source must already point to
6938    the non-relocatable memory area.  In this case, CODING->src_pos is
6939    an offset from CODING->source.
6940
6941    The decoded data is inserted at the current point of the buffer
6942    CODING->dst_object.
6943 */
6944
6945 static void
6946 decode_coding (struct coding_system *coding)
6947 {
6948   Lisp_Object attrs;
6949   Lisp_Object undo_list;
6950   Lisp_Object translation_table;
6951   struct ccl_spec cclspec;
6952   int carryover;
6953   int i;
6954
6955   USE_SAFE_ALLOCA;
6956
6957   if (BUFFERP (coding->src_object)
6958       && coding->src_pos > 0
6959       && coding->src_pos < GPT
6960       && coding->src_pos + coding->src_chars > GPT)
6961     move_gap_both (coding->src_pos, coding->src_pos_byte);
6962
6963   undo_list = Qt;
6964   if (BUFFERP (coding->dst_object))
6965     {
6966       set_buffer_internal (XBUFFER (coding->dst_object));
6967       if (GPT != PT)
6968         move_gap_both (PT, PT_BYTE);
6969
6970       /* We must disable undo_list in order to record the whole insert
6971          transaction via record_insert at the end.  But doing so also
6972          disables the recording of the first change to the undo_list.
6973          Therefore we check for first change here and record it via
6974          record_first_change if needed.  */
6975       if (MODIFF <= SAVE_MODIFF)
6976         record_first_change ();
6977
6978       undo_list = BVAR (current_buffer, undo_list);
6979       bset_undo_list (current_buffer, Qt);
6980     }
6981
6982   coding->consumed = coding->consumed_char = 0;
6983   coding->produced = coding->produced_char = 0;
6984   coding->chars_at_source = 0;
6985   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6986   coding->errors = 0;
6987
6988   ALLOC_CONVERSION_WORK_AREA (coding);
6989
6990   attrs = CODING_ID_ATTRS (coding->id);
6991   translation_table = get_translation_table (attrs, 0, NULL);
6992
6993   carryover = 0;
6994   if (coding->decoder == decode_coding_ccl)
6995     {
6996       coding->spec.ccl = &cclspec;
6997       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
6998     }
6999   do
7000     {
7001       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7002
7003       coding_set_source (coding);
7004       coding->annotated = 0;
7005       coding->charbuf_used = carryover;
7006       (*(coding->decoder)) (coding);
7007       coding_set_destination (coding);
7008       carryover = produce_chars (coding, translation_table, 0);
7009       if (coding->annotated)
7010         produce_annotation (coding, pos);
7011       for (i = 0; i < carryover; i++)
7012         coding->charbuf[i]
7013           = coding->charbuf[coding->charbuf_used - carryover + i];
7014     }
7015   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7016          || (coding->consumed < coding->src_bytes
7017              && (coding->result == CODING_RESULT_SUCCESS
7018                  || coding->result == CODING_RESULT_INVALID_SRC)));
7019
7020   if (carryover > 0)
7021     {
7022       coding_set_destination (coding);
7023       coding->charbuf_used = carryover;
7024       produce_chars (coding, translation_table, 1);
7025     }
7026
7027   coding->carryover_bytes = 0;
7028   if (coding->consumed < coding->src_bytes)
7029     {
7030       int nbytes = coding->src_bytes - coding->consumed;
7031       const unsigned char *src;
7032
7033       coding_set_source (coding);
7034       coding_set_destination (coding);
7035       src = coding->source + coding->consumed;
7036
7037       if (coding->mode & CODING_MODE_LAST_BLOCK)
7038         {
7039           /* Flush out unprocessed data as binary chars.  We are sure
7040              that the number of data is less than the size of
7041              coding->charbuf.  */
7042           coding->charbuf_used = 0;
7043           coding->chars_at_source = 0;
7044
7045           while (nbytes-- > 0)
7046             {
7047               int c = *src++;
7048
7049               if (c & 0x80)
7050                 c = BYTE8_TO_CHAR (c);
7051               coding->charbuf[coding->charbuf_used++] = c;
7052             }
7053           produce_chars (coding, Qnil, 1);
7054         }
7055       else
7056         {
7057           /* Record unprocessed bytes in coding->carryover.  We are
7058              sure that the number of data is less than the size of
7059              coding->carryover.  */
7060           unsigned char *p = coding->carryover;
7061
7062           if (nbytes > sizeof coding->carryover)
7063             nbytes = sizeof coding->carryover;
7064           coding->carryover_bytes = nbytes;
7065           while (nbytes-- > 0)
7066             *p++ = *src++;
7067         }
7068       coding->consumed = coding->src_bytes;
7069     }
7070
7071   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7072       && !inhibit_eol_conversion)
7073     decode_eol (coding);
7074   if (BUFFERP (coding->dst_object))
7075     {
7076       bset_undo_list (current_buffer, undo_list);
7077       record_insert (coding->dst_pos, coding->produced_char);
7078     }
7079
7080   SAFE_FREE ();
7081 }
7082
7083
7084 /* Extract an annotation datum from a composition starting at POS and
7085    ending before LIMIT of CODING->src_object (buffer or string), store
7086    the data in BUF, set *STOP to a starting position of the next
7087    composition (if any) or to LIMIT, and return the address of the
7088    next element of BUF.
7089
7090    If such an annotation is not found, set *STOP to a starting
7091    position of a composition after POS (if any) or to LIMIT, and
7092    return BUF.  */
7093
7094 static int *
7095 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7096                                struct coding_system *coding, int *buf,
7097                                ptrdiff_t *stop)
7098 {
7099   ptrdiff_t start, end;
7100   Lisp_Object prop;
7101
7102   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7103       || end > limit)
7104     *stop = limit;
7105   else if (start > pos)
7106     *stop = start;
7107   else
7108     {
7109       if (start == pos)
7110         {
7111           /* We found a composition.  Store the corresponding
7112              annotation data in BUF.  */
7113           int *head = buf;
7114           enum composition_method method = COMPOSITION_METHOD (prop);
7115           int nchars = COMPOSITION_LENGTH (prop);
7116
7117           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7118           if (method != COMPOSITION_RELATIVE)
7119             {
7120               Lisp_Object components;
7121               ptrdiff_t i, len, i_byte;
7122
7123               components = COMPOSITION_COMPONENTS (prop);
7124               if (VECTORP (components))
7125                 {
7126                   len = ASIZE (components);
7127                   for (i = 0; i < len; i++)
7128                     *buf++ = XINT (AREF (components, i));
7129                 }
7130               else if (STRINGP (components))
7131                 {
7132                   len = SCHARS (components);
7133                   i = i_byte = 0;
7134                   while (i < len)
7135                     {
7136                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7137                       buf++;
7138                     }
7139                 }
7140               else if (INTEGERP (components))
7141                 {
7142                   len = 1;
7143                   *buf++ = XINT (components);
7144                 }
7145               else if (CONSP (components))
7146                 {
7147                   for (len = 0; CONSP (components);
7148                        len++, components = XCDR (components))
7149                     *buf++ = XINT (XCAR (components));
7150                 }
7151               else
7152                 emacs_abort ();
7153               *head -= len;
7154             }
7155         }
7156
7157       if (find_composition (end, limit, &start, &end, &prop,
7158                             coding->src_object)
7159           && end <= limit)
7160         *stop = start;
7161       else
7162         *stop = limit;
7163     }
7164   return buf;
7165 }
7166
7167
7168 /* Extract an annotation datum from a text property `charset' at POS of
7169    CODING->src_object (buffer of string), store the data in BUF, set
7170    *STOP to the position where the value of `charset' property changes
7171    (limiting by LIMIT), and return the address of the next element of
7172    BUF.
7173
7174    If the property value is nil, set *STOP to the position where the
7175    property value is non-nil (limiting by LIMIT), and return BUF.  */
7176
7177 static int *
7178 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7179                            struct coding_system *coding, int *buf,
7180                            ptrdiff_t *stop)
7181 {
7182   Lisp_Object val, next;
7183   int id;
7184
7185   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7186   if (! NILP (val) && CHARSETP (val))
7187     id = XINT (CHARSET_SYMBOL_ID (val));
7188   else
7189     id = -1;
7190   ADD_CHARSET_DATA (buf, 0, id);
7191   next = Fnext_single_property_change (make_number (pos), Qcharset,
7192                                        coding->src_object,
7193                                        make_number (limit));
7194   *stop = XINT (next);
7195   return buf;
7196 }
7197
7198
7199 static void
7200 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7201                int max_lookup)
7202 {
7203   int *buf = coding->charbuf;
7204   int *buf_end = coding->charbuf + coding->charbuf_size;
7205   const unsigned char *src = coding->source + coding->consumed;
7206   const unsigned char *src_end = coding->source + coding->src_bytes;
7207   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7208   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7209   bool multibytep = coding->src_multibyte;
7210   Lisp_Object eol_type;
7211   int c;
7212   ptrdiff_t stop, stop_composition, stop_charset;
7213   int *lookup_buf = NULL;
7214
7215   if (! NILP (translation_table))
7216     lookup_buf = alloca (sizeof (int) * max_lookup);
7217
7218   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7219   if (VECTORP (eol_type))
7220     eol_type = Qunix;
7221
7222   /* Note: composition handling is not yet implemented.  */
7223   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7224
7225   if (NILP (coding->src_object))
7226     stop = stop_composition = stop_charset = end_pos;
7227   else
7228     {
7229       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7230         stop = stop_composition = pos;
7231       else
7232         stop = stop_composition = end_pos;
7233       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7234         stop = stop_charset = pos;
7235       else
7236         stop_charset = end_pos;
7237     }
7238
7239   /* Compensate for CRLF and conversion.  */
7240   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7241   while (buf < buf_end)
7242     {
7243       Lisp_Object trans;
7244
7245       if (pos == stop)
7246         {
7247           if (pos == end_pos)
7248             break;
7249           if (pos == stop_composition)
7250             buf = handle_composition_annotation (pos, end_pos, coding,
7251                                                  buf, &stop_composition);
7252           if (pos == stop_charset)
7253             buf = handle_charset_annotation (pos, end_pos, coding,
7254                                              buf, &stop_charset);
7255           stop = (stop_composition < stop_charset
7256                   ? stop_composition : stop_charset);
7257         }
7258
7259       if (! multibytep)
7260         {
7261           int bytes;
7262
7263           if (coding->encoder == encode_coding_raw_text
7264               || coding->encoder == encode_coding_ccl)
7265             c = *src++, pos++;
7266           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7267             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7268           else
7269             c = BYTE8_TO_CHAR (*src), src++, pos++;
7270         }
7271       else
7272         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7273       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7274         c = '\n';
7275       if (! EQ (eol_type, Qunix))
7276         {
7277           if (c == '\n')
7278             {
7279               if (EQ (eol_type, Qdos))
7280                 *buf++ = '\r';
7281               else
7282                 c = '\r';
7283             }
7284         }
7285
7286       trans = Qnil;
7287       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7288       if (NILP (trans))
7289         *buf++ = c;
7290       else
7291         {
7292           ptrdiff_t from_nchars = 1, to_nchars = 1;
7293           int *lookup_buf_end;
7294           const unsigned char *p = src;
7295           int i;
7296
7297           lookup_buf[0] = c;
7298           for (i = 1; i < max_lookup && p < src_end; i++)
7299             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7300           lookup_buf_end = lookup_buf + i;
7301           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7302           if (INTEGERP (trans))
7303             c = XINT (trans);
7304           else if (CONSP (trans))
7305             {
7306               from_nchars = ASIZE (XCAR (trans));
7307               trans = XCDR (trans);
7308               if (INTEGERP (trans))
7309                 c = XINT (trans);
7310               else
7311                 {
7312                   to_nchars = ASIZE (trans);
7313                   if (buf_end - buf < to_nchars)
7314                     break;
7315                   c = XINT (AREF (trans, 0));
7316                 }
7317             }
7318           else
7319             break;
7320           *buf++ = c;
7321           for (i = 1; i < to_nchars; i++)
7322             *buf++ = XINT (AREF (trans, i));
7323           for (i = 1; i < from_nchars; i++, pos++)
7324             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7325         }
7326     }
7327
7328   coding->consumed = src - coding->source;
7329   coding->consumed_char = pos - coding->src_pos;
7330   coding->charbuf_used = buf - coding->charbuf;
7331   coding->chars_at_source = 0;
7332 }
7333
7334
7335 /* Encode the text at CODING->src_object into CODING->dst_object.
7336    CODING->src_object is a buffer or a string.
7337    CODING->dst_object is a buffer or nil.
7338
7339    If CODING->src_object is a buffer, it must be the current buffer.
7340    In this case, if CODING->src_pos is positive, it is a position of
7341    the source text in the buffer, otherwise. the source text is in the
7342    gap area of the buffer, and coding->src_pos specifies the offset of
7343    the text from GPT (which must be the same as PT).  If this is the
7344    same buffer as CODING->dst_object, CODING->src_pos must be
7345    negative and CODING should not have `pre-write-conversion'.
7346
7347    If CODING->src_object is a string, CODING should not have
7348    `pre-write-conversion'.
7349
7350    If CODING->dst_object is a buffer, the encoded data is inserted at
7351    the current point of that buffer.
7352
7353    If CODING->dst_object is nil, the encoded data is placed at the
7354    memory area specified by CODING->destination.  */
7355
7356 static void
7357 encode_coding (struct coding_system *coding)
7358 {
7359   Lisp_Object attrs;
7360   Lisp_Object translation_table;
7361   int max_lookup;
7362   struct ccl_spec cclspec;
7363
7364   USE_SAFE_ALLOCA;
7365
7366   attrs = CODING_ID_ATTRS (coding->id);
7367   if (coding->encoder == encode_coding_raw_text)
7368     translation_table = Qnil, max_lookup = 0;
7369   else
7370     translation_table = get_translation_table (attrs, 1, &max_lookup);
7371
7372   if (BUFFERP (coding->dst_object))
7373     {
7374       set_buffer_internal (XBUFFER (coding->dst_object));
7375       coding->dst_multibyte
7376         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7377     }
7378
7379   coding->consumed = coding->consumed_char = 0;
7380   coding->produced = coding->produced_char = 0;
7381   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7382   coding->errors = 0;
7383
7384   ALLOC_CONVERSION_WORK_AREA (coding);
7385
7386   if (coding->encoder == encode_coding_ccl)
7387     {
7388       coding->spec.ccl = &cclspec;
7389       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7390     }
7391   do {
7392     coding_set_source (coding);
7393     consume_chars (coding, translation_table, max_lookup);
7394     coding_set_destination (coding);
7395     (*(coding->encoder)) (coding);
7396   } while (coding->consumed_char < coding->src_chars);
7397
7398   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7399     insert_from_gap (coding->produced_char, coding->produced);
7400
7401   SAFE_FREE ();
7402 }
7403
7404
7405 /* Name (or base name) of work buffer for code conversion.  */
7406 static Lisp_Object Vcode_conversion_workbuf_name;
7407
7408 /* A working buffer used by the top level conversion.  Once it is
7409    created, it is never destroyed.  It has the name
7410    Vcode_conversion_workbuf_name.  The other working buffers are
7411    destroyed after the use is finished, and their names are modified
7412    versions of Vcode_conversion_workbuf_name.  */
7413 static Lisp_Object Vcode_conversion_reused_workbuf;
7414
7415 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7416 static bool reused_workbuf_in_use;
7417
7418
7419 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7420    multibyteness of returning buffer.  */
7421
7422 static Lisp_Object
7423 make_conversion_work_buffer (bool multibyte)
7424 {
7425   Lisp_Object name, workbuf;
7426   struct buffer *current;
7427
7428   if (reused_workbuf_in_use)
7429     {
7430       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7431       workbuf = Fget_buffer_create (name);
7432     }
7433   else
7434     {
7435       reused_workbuf_in_use = 1;
7436       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7437         Vcode_conversion_reused_workbuf
7438           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7439       workbuf = Vcode_conversion_reused_workbuf;
7440     }
7441   current = current_buffer;
7442   set_buffer_internal (XBUFFER (workbuf));
7443   /* We can't allow modification hooks to run in the work buffer.  For
7444      instance, directory_files_internal assumes that file decoding
7445      doesn't compile new regexps.  */
7446   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7447   Ferase_buffer ();
7448   bset_undo_list (current_buffer, Qt);
7449   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7450   set_buffer_internal (current);
7451   return workbuf;
7452 }
7453
7454
7455 static Lisp_Object
7456 code_conversion_restore (Lisp_Object arg)
7457 {
7458   Lisp_Object current, workbuf;
7459   struct gcpro gcpro1;
7460
7461   GCPRO1 (arg);
7462   current = XCAR (arg);
7463   workbuf = XCDR (arg);
7464   if (! NILP (workbuf))
7465     {
7466       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7467         reused_workbuf_in_use = 0;
7468       else
7469         Fkill_buffer (workbuf);
7470     }
7471   set_buffer_internal (XBUFFER (current));
7472   UNGCPRO;
7473   return Qnil;
7474 }
7475
7476 Lisp_Object
7477 code_conversion_save (bool with_work_buf, bool multibyte)
7478 {
7479   Lisp_Object workbuf = Qnil;
7480
7481   if (with_work_buf)
7482     workbuf = make_conversion_work_buffer (multibyte);
7483   record_unwind_protect (code_conversion_restore,
7484                          Fcons (Fcurrent_buffer (), workbuf));
7485   return workbuf;
7486 }
7487
7488 void
7489 decode_coding_gap (struct coding_system *coding,
7490                    ptrdiff_t chars, ptrdiff_t bytes)
7491 {
7492   ptrdiff_t count = SPECPDL_INDEX ();
7493   Lisp_Object attrs;
7494
7495   coding->src_object = Fcurrent_buffer ();
7496   coding->src_chars = chars;
7497   coding->src_bytes = bytes;
7498   coding->src_pos = -chars;
7499   coding->src_pos_byte = -bytes;
7500   coding->src_multibyte = chars < bytes;
7501   coding->dst_object = coding->src_object;
7502   coding->dst_pos = PT;
7503   coding->dst_pos_byte = PT_BYTE;
7504   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7505
7506   if (CODING_REQUIRE_DETECTION (coding))
7507     detect_coding (coding);
7508   attrs = CODING_ID_ATTRS (coding->id);
7509 #ifndef CODING_DISABLE_ASCII_OPTIMIZATION
7510   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7511       && NILP (CODING_ATTR_POST_READ (attrs))
7512       && NILP (get_translation_table (attrs, 0, NULL))
7513       && (inhibit_eol_conversion
7514           || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)))
7515     {
7516       /* We can skip the conversion if all source bytes are ASCII.  */
7517       if (coding->head_ascii < 0)
7518         {
7519           /* We have not yet counted the number of ASCII bytes at the
7520              head of the source.  Do it now.  */
7521           const unsigned char *src, *src_end;
7522
7523           coding_set_source (coding);
7524           src_end = coding->source + coding->src_bytes;
7525           for (src = coding->source; src < src_end; src++)
7526             {
7527               if (*src & 0x80)
7528                 break;
7529             }
7530           coding->head_ascii = src - coding->source;
7531         }
7532       if (coding->src_bytes == coding->head_ascii)
7533         {
7534           /* No need of conversion.  Use the data in the gap as is.  */
7535           coding->produced_char = chars;
7536           coding->produced = bytes;
7537           adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1);
7538           return;
7539         }
7540     }
7541 #endif  /* not CODING_DISABLE_ASCII_OPTIMIZATION */
7542   code_conversion_save (0, 0);
7543
7544   coding->mode |= CODING_MODE_LAST_BLOCK;
7545   current_buffer->text->inhibit_shrinking = 1;
7546   decode_coding (coding);
7547   current_buffer->text->inhibit_shrinking = 0;
7548
7549   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7550     {
7551       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7552       Lisp_Object val;
7553
7554       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7555       val = call1 (CODING_ATTR_POST_READ (attrs),
7556                    make_number (coding->produced_char));
7557       CHECK_NATNUM (val);
7558       coding->produced_char += Z - prev_Z;
7559       coding->produced += Z_BYTE - prev_Z_BYTE;
7560     }
7561
7562   unbind_to (count, Qnil);
7563 }
7564
7565
7566 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7567    SRC_OBJECT into DST_OBJECT by coding context CODING.
7568
7569    SRC_OBJECT is a buffer, a string, or Qnil.
7570
7571    If it is a buffer, the text is at point of the buffer.  FROM and TO
7572    are positions in the buffer.
7573
7574    If it is a string, the text is at the beginning of the string.
7575    FROM and TO are indices to the string.
7576
7577    If it is nil, the text is at coding->source.  FROM and TO are
7578    indices to coding->source.
7579
7580    DST_OBJECT is a buffer, Qt, or Qnil.
7581
7582    If it is a buffer, the decoded text is inserted at point of the
7583    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7584    is deleted.
7585
7586    If it is Qt, a string is made from the decoded text, and
7587    set in CODING->dst_object.
7588
7589    If it is Qnil, the decoded text is stored at CODING->destination.
7590    The caller must allocate CODING->dst_bytes bytes at
7591    CODING->destination by xmalloc.  If the decoded text is longer than
7592    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7593  */
7594
7595 void
7596 decode_coding_object (struct coding_system *coding,
7597                       Lisp_Object src_object,
7598                       ptrdiff_t from, ptrdiff_t from_byte,
7599                       ptrdiff_t to, ptrdiff_t to_byte,
7600                       Lisp_Object dst_object)
7601 {
7602   ptrdiff_t count = SPECPDL_INDEX ();
7603   unsigned char *destination IF_LINT (= NULL);
7604   ptrdiff_t dst_bytes IF_LINT (= 0);
7605   ptrdiff_t chars = to - from;
7606   ptrdiff_t bytes = to_byte - from_byte;
7607   Lisp_Object attrs;
7608   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7609   bool need_marker_adjustment = 0;
7610   Lisp_Object old_deactivate_mark;
7611
7612   old_deactivate_mark = Vdeactivate_mark;
7613
7614   if (NILP (dst_object))
7615     {
7616       destination = coding->destination;
7617       dst_bytes = coding->dst_bytes;
7618     }
7619
7620   coding->src_object = src_object;
7621   coding->src_chars = chars;
7622   coding->src_bytes = bytes;
7623   coding->src_multibyte = chars < bytes;
7624
7625   if (STRINGP (src_object))
7626     {
7627       coding->src_pos = from;
7628       coding->src_pos_byte = from_byte;
7629     }
7630   else if (BUFFERP (src_object))
7631     {
7632       set_buffer_internal (XBUFFER (src_object));
7633       if (from != GPT)
7634         move_gap_both (from, from_byte);
7635       if (EQ (src_object, dst_object))
7636         {
7637           struct Lisp_Marker *tail;
7638
7639           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7640             {
7641               tail->need_adjustment
7642                 = tail->charpos == (tail->insertion_type ? from : to);
7643               need_marker_adjustment |= tail->need_adjustment;
7644             }
7645           saved_pt = PT, saved_pt_byte = PT_BYTE;
7646           TEMP_SET_PT_BOTH (from, from_byte);
7647           current_buffer->text->inhibit_shrinking = 1;
7648           del_range_both (from, from_byte, to, to_byte, 1);
7649           coding->src_pos = -chars;
7650           coding->src_pos_byte = -bytes;
7651         }
7652       else
7653         {
7654           coding->src_pos = from;
7655           coding->src_pos_byte = from_byte;
7656         }
7657     }
7658
7659   if (CODING_REQUIRE_DETECTION (coding))
7660     detect_coding (coding);
7661   attrs = CODING_ID_ATTRS (coding->id);
7662
7663   if (EQ (dst_object, Qt)
7664       || (! NILP (CODING_ATTR_POST_READ (attrs))
7665           && NILP (dst_object)))
7666     {
7667       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7668       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7669       coding->dst_pos = BEG;
7670       coding->dst_pos_byte = BEG_BYTE;
7671     }
7672   else if (BUFFERP (dst_object))
7673     {
7674       code_conversion_save (0, 0);
7675       coding->dst_object = dst_object;
7676       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7677       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7678       coding->dst_multibyte
7679         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7680     }
7681   else
7682     {
7683       code_conversion_save (0, 0);
7684       coding->dst_object = Qnil;
7685       /* Most callers presume this will return a multibyte result, and they
7686          won't use `binary' or `raw-text' anyway, so let's not worry about
7687          CODING_FOR_UNIBYTE.  */
7688       coding->dst_multibyte = 1;
7689     }
7690
7691   decode_coding (coding);
7692
7693   if (BUFFERP (coding->dst_object))
7694     set_buffer_internal (XBUFFER (coding->dst_object));
7695
7696   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7697     {
7698       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7699       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7700       Lisp_Object val;
7701
7702       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7703       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7704               old_deactivate_mark);
7705       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7706                         make_number (coding->produced_char));
7707       UNGCPRO;
7708       CHECK_NATNUM (val);
7709       coding->produced_char += Z - prev_Z;
7710       coding->produced += Z_BYTE - prev_Z_BYTE;
7711     }
7712
7713   if (EQ (dst_object, Qt))
7714     {
7715       coding->dst_object = Fbuffer_string ();
7716     }
7717   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7718     {
7719       set_buffer_internal (XBUFFER (coding->dst_object));
7720       if (dst_bytes < coding->produced)
7721         {
7722           eassert (coding->produced > 0);
7723           destination = xrealloc (destination, coding->produced);
7724           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7725             move_gap_both (BEGV, BEGV_BYTE);
7726           memcpy (destination, BEGV_ADDR, coding->produced);
7727           coding->destination = destination;
7728         }
7729     }
7730
7731   if (saved_pt >= 0)
7732     {
7733       /* This is the case of:
7734          (BUFFERP (src_object) && EQ (src_object, dst_object))
7735          As we have moved PT while replacing the original buffer
7736          contents, we must recover it now.  */
7737       set_buffer_internal (XBUFFER (src_object));
7738       current_buffer->text->inhibit_shrinking = 0;
7739       if (saved_pt < from)
7740         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7741       else if (saved_pt < from + chars)
7742         TEMP_SET_PT_BOTH (from, from_byte);
7743       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7744         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7745                           saved_pt_byte + (coding->produced - bytes));
7746       else
7747         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7748                           saved_pt_byte + (coding->produced - bytes));
7749
7750       if (need_marker_adjustment)
7751         {
7752           struct Lisp_Marker *tail;
7753
7754           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7755             if (tail->need_adjustment)
7756               {
7757                 tail->need_adjustment = 0;
7758                 if (tail->insertion_type)
7759                   {
7760                     tail->bytepos = from_byte;
7761                     tail->charpos = from;
7762                   }
7763                 else
7764                   {
7765                     tail->bytepos = from_byte + coding->produced;
7766                     tail->charpos
7767                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7768                          ? tail->bytepos : from + coding->produced_char);
7769                   }
7770               }
7771         }
7772     }
7773
7774   Vdeactivate_mark = old_deactivate_mark;
7775   unbind_to (count, coding->dst_object);
7776 }
7777
7778
7779 void
7780 encode_coding_object (struct coding_system *coding,
7781                       Lisp_Object src_object,
7782                       ptrdiff_t from, ptrdiff_t from_byte,
7783                       ptrdiff_t to, ptrdiff_t to_byte,
7784                       Lisp_Object dst_object)
7785 {
7786   ptrdiff_t count = SPECPDL_INDEX ();
7787   ptrdiff_t chars = to - from;
7788   ptrdiff_t bytes = to_byte - from_byte;
7789   Lisp_Object attrs;
7790   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
7791   bool need_marker_adjustment = 0;
7792   bool kill_src_buffer = 0;
7793   Lisp_Object old_deactivate_mark;
7794
7795   old_deactivate_mark = Vdeactivate_mark;
7796
7797   coding->src_object = src_object;
7798   coding->src_chars = chars;
7799   coding->src_bytes = bytes;
7800   coding->src_multibyte = chars < bytes;
7801
7802   attrs = CODING_ID_ATTRS (coding->id);
7803
7804   if (EQ (src_object, dst_object))
7805     {
7806       struct Lisp_Marker *tail;
7807
7808       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7809         {
7810           tail->need_adjustment
7811             = tail->charpos == (tail->insertion_type ? from : to);
7812           need_marker_adjustment |= tail->need_adjustment;
7813         }
7814     }
7815
7816   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7817     {
7818       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7819       set_buffer_internal (XBUFFER (coding->src_object));
7820       if (STRINGP (src_object))
7821         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7822       else if (BUFFERP (src_object))
7823         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7824       else
7825         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7826
7827       if (EQ (src_object, dst_object))
7828         {
7829           set_buffer_internal (XBUFFER (src_object));
7830           saved_pt = PT, saved_pt_byte = PT_BYTE;
7831           del_range_both (from, from_byte, to, to_byte, 1);
7832           set_buffer_internal (XBUFFER (coding->src_object));
7833         }
7834
7835       {
7836         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7837
7838         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7839                 old_deactivate_mark);
7840         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
7841                     make_number (BEG), make_number (Z));
7842         UNGCPRO;
7843       }
7844       if (XBUFFER (coding->src_object) != current_buffer)
7845         kill_src_buffer = 1;
7846       coding->src_object = Fcurrent_buffer ();
7847       if (BEG != GPT)
7848         move_gap_both (BEG, BEG_BYTE);
7849       coding->src_chars = Z - BEG;
7850       coding->src_bytes = Z_BYTE - BEG_BYTE;
7851       coding->src_pos = BEG;
7852       coding->src_pos_byte = BEG_BYTE;
7853       coding->src_multibyte = Z < Z_BYTE;
7854     }
7855   else if (STRINGP (src_object))
7856     {
7857       code_conversion_save (0, 0);
7858       coding->src_pos = from;
7859       coding->src_pos_byte = from_byte;
7860     }
7861   else if (BUFFERP (src_object))
7862     {
7863       code_conversion_save (0, 0);
7864       set_buffer_internal (XBUFFER (src_object));
7865       if (EQ (src_object, dst_object))
7866         {
7867           saved_pt = PT, saved_pt_byte = PT_BYTE;
7868           coding->src_object = del_range_1 (from, to, 1, 1);
7869           coding->src_pos = 0;
7870           coding->src_pos_byte = 0;
7871         }
7872       else
7873         {
7874           if (from < GPT && to >= GPT)
7875             move_gap_both (from, from_byte);
7876           coding->src_pos = from;
7877           coding->src_pos_byte = from_byte;
7878         }
7879     }
7880   else
7881     code_conversion_save (0, 0);
7882
7883   if (BUFFERP (dst_object))
7884     {
7885       coding->dst_object = dst_object;
7886       if (EQ (src_object, dst_object))
7887         {
7888           coding->dst_pos = from;
7889           coding->dst_pos_byte = from_byte;
7890         }
7891       else
7892         {
7893           struct buffer *current = current_buffer;
7894
7895           set_buffer_temp (XBUFFER (dst_object));
7896           coding->dst_pos = PT;
7897           coding->dst_pos_byte = PT_BYTE;
7898           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7899           set_buffer_temp (current);
7900         }
7901       coding->dst_multibyte
7902         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7903     }
7904   else if (EQ (dst_object, Qt))
7905     {
7906       ptrdiff_t dst_bytes = max (1, coding->src_chars);
7907       coding->dst_object = Qnil;
7908       coding->destination = xmalloc (dst_bytes);
7909       coding->dst_bytes = dst_bytes;
7910       coding->dst_multibyte = 0;
7911     }
7912   else
7913     {
7914       coding->dst_object = Qnil;
7915       coding->dst_multibyte = 0;
7916     }
7917
7918   encode_coding (coding);
7919
7920   if (EQ (dst_object, Qt))
7921     {
7922       if (BUFFERP (coding->dst_object))
7923         coding->dst_object = Fbuffer_string ();
7924       else
7925         {
7926           coding->dst_object
7927             = make_unibyte_string ((char *) coding->destination,
7928                                    coding->produced);
7929           xfree (coding->destination);
7930         }
7931     }
7932
7933   if (saved_pt >= 0)
7934     {
7935       /* This is the case of:
7936          (BUFFERP (src_object) && EQ (src_object, dst_object))
7937          As we have moved PT while replacing the original buffer
7938          contents, we must recover it now.  */
7939       set_buffer_internal (XBUFFER (src_object));
7940       if (saved_pt < from)
7941         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7942       else if (saved_pt < from + chars)
7943         TEMP_SET_PT_BOTH (from, from_byte);
7944       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7945         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7946                           saved_pt_byte + (coding->produced - bytes));
7947       else
7948         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7949                           saved_pt_byte + (coding->produced - bytes));
7950
7951       if (need_marker_adjustment)
7952         {
7953           struct Lisp_Marker *tail;
7954
7955           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7956             if (tail->need_adjustment)
7957               {
7958                 tail->need_adjustment = 0;
7959                 if (tail->insertion_type)
7960                   {
7961                     tail->bytepos = from_byte;
7962                     tail->charpos = from;
7963                   }
7964                 else
7965                   {
7966                     tail->bytepos = from_byte + coding->produced;
7967                     tail->charpos
7968                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7969                          ? tail->bytepos : from + coding->produced_char);
7970                   }
7971               }
7972         }
7973     }
7974
7975   if (kill_src_buffer)
7976     Fkill_buffer (coding->src_object);
7977
7978   Vdeactivate_mark = old_deactivate_mark;
7979   unbind_to (count, Qnil);
7980 }
7981
7982
7983 Lisp_Object
7984 preferred_coding_system (void)
7985 {
7986   int id = coding_categories[coding_priorities[0]].id;
7987
7988   return CODING_ID_NAME (id);
7989 }
7990
7991 #if defined (WINDOWSNT) || defined (CYGWIN)
7992
7993 Lisp_Object
7994 from_unicode (Lisp_Object str)
7995 {
7996   CHECK_STRING (str);
7997   if (!STRING_MULTIBYTE (str) &&
7998       SBYTES (str) & 1)
7999     {
8000       str = Fsubstring (str, make_number (0), make_number (-1));
8001     }
8002
8003   return code_convert_string_norecord (str, Qutf_16le, 0);
8004 }
8005
8006 wchar_t *
8007 to_unicode (Lisp_Object str, Lisp_Object *buf)
8008 {
8009   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8010   /* We need to make another copy (in addition to the one made by
8011      code_convert_string_norecord) to ensure that the final string is
8012      _doubly_ zero terminated --- that is, that the string is
8013      terminated by two zero bytes and one utf-16le null character.
8014      Because strings are already terminated with a single zero byte,
8015      we just add one additional zero. */
8016   str = make_uninit_string (SBYTES (*buf) + 1);
8017   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8018   SDATA (str) [SBYTES (*buf)] = '\0';
8019   *buf = str;
8020   return WCSDATA (*buf);
8021 }
8022
8023 #endif /* WINDOWSNT || CYGWIN */
8024
8025 \f
8026 #ifdef emacs
8027 /*** 11. Emacs Lisp library functions ***/
8028
8029 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8030        doc: /* Return t if OBJECT is nil or a coding-system.
8031 See the documentation of `define-coding-system' for information
8032 about coding-system objects.  */)
8033   (Lisp_Object object)
8034 {
8035   if (NILP (object)
8036       || CODING_SYSTEM_ID (object) >= 0)
8037     return Qt;
8038   if (! SYMBOLP (object)
8039       || NILP (Fget (object, Qcoding_system_define_form)))
8040     return Qnil;
8041   return Qt;
8042 }
8043
8044 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8045        Sread_non_nil_coding_system, 1, 1, 0,
8046        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8047   (Lisp_Object prompt)
8048 {
8049   Lisp_Object val;
8050   do
8051     {
8052       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8053                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8054     }
8055   while (SCHARS (val) == 0);
8056   return (Fintern (val, Qnil));
8057 }
8058
8059 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8060        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8061 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8062 Ignores case when completing coding systems (all Emacs coding systems
8063 are lower-case).  */)
8064   (Lisp_Object prompt, Lisp_Object default_coding_system)
8065 {
8066   Lisp_Object val;
8067   ptrdiff_t count = SPECPDL_INDEX ();
8068
8069   if (SYMBOLP (default_coding_system))
8070     default_coding_system = SYMBOL_NAME (default_coding_system);
8071   specbind (Qcompletion_ignore_case, Qt);
8072   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8073                           Qt, Qnil, Qcoding_system_history,
8074                           default_coding_system, Qnil);
8075   unbind_to (count, Qnil);
8076   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8077 }
8078
8079 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8080        1, 1, 0,
8081        doc: /* Check validity of CODING-SYSTEM.
8082 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8083 It is valid if it is nil or a symbol defined as a coding system by the
8084 function `define-coding-system'.  */)
8085   (Lisp_Object coding_system)
8086 {
8087   Lisp_Object define_form;
8088
8089   define_form = Fget (coding_system, Qcoding_system_define_form);
8090   if (! NILP (define_form))
8091     {
8092       Fput (coding_system, Qcoding_system_define_form, Qnil);
8093       safe_eval (define_form);
8094     }
8095   if (!NILP (Fcoding_system_p (coding_system)))
8096     return coding_system;
8097   xsignal1 (Qcoding_system_error, coding_system);
8098 }
8099
8100 \f
8101 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8102    HIGHEST, return the coding system of the highest
8103    priority among the detected coding systems.  Otherwise return a
8104    list of detected coding systems sorted by their priorities.  If
8105    MULTIBYTEP, it is assumed that the bytes are in correct
8106    multibyte form but contain only ASCII and eight-bit chars.
8107    Otherwise, the bytes are raw bytes.
     SRC_CHARS is recorded as the character count of the source.
8108
8109    CODING-SYSTEM controls the detection as below:
8110
8111    If it is nil, detect both text-format and eol-format.  If the
8112    text-format part of CODING-SYSTEM is already specified
8113    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8114    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8115    detect only text-format.
     
     When HIGHEST is set and nothing was collected, the value is nil.  */
8116
8117 Lisp_Object
8118 detect_coding_system (const unsigned char *src,
8119                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8120                       bool highest, bool multibytep,
8121                       Lisp_Object coding_system)
8122 {
8123   const unsigned char *src_end = src + src_bytes;
8124   Lisp_Object attrs, eol_type;
8125   Lisp_Object val = Qnil;
8126   struct coding_system coding;
8127   ptrdiff_t id;
8128   struct coding_detection_info detect_info;
8129   enum coding_category base_category;
8130   bool null_byte_found = 0, eight_bit_found = 0;
8131
       /* A nil CODING_SYSTEM is treated as `undecided'.  From here on
          CODING_SYSTEM holds the base name of the requested system.  */
8132   if (NILP (coding_system))
8133     coding_system = Qundecided;
8134   setup_coding_system (coding_system, &coding);
8135   attrs = CODING_ID_ATTRS (coding.id);
8136   eol_type = CODING_ID_EOL_TYPE (coding.id);
8137   coding_system = CODING_ATTR_BASE_NAME (attrs);
8138
8139   coding.source = src;
8140   coding.src_chars = src_chars;
8141   coding.src_bytes = src_bytes;
8142   coding.src_multibyte = multibytep;
8143   coding.consumed = 0;
8144   coding.mode |= CODING_MODE_LAST_BLOCK;
8145   coding.head_ascii = 0;
8146
8147   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8148
8149   /* At first, detect text-format if necessary.  */
8150   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8151   if (base_category == coding_category_undecided)
8152     {
8153       enum coding_category category IF_LINT (= 0);
8154       struct coding_system *this IF_LINT (= NULL);
8155       int c, i;
8156
8157       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii counts the bytes seen before the first
              8-bit byte.  The scan stops early once both a null byte
              and an 8-bit byte have been seen.  */
8158       for (; src < src_end; src++)
8159         {
8160           c = *src;
8161           if (c & 0x80)
8162             {
8163               eight_bit_found = 1;
8164               if (null_byte_found)
8165                 break;
8166             }
8167           else if (c < 0x20)
8168             {
8169               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8170                   && ! inhibit_iso_escape_detection
8171                   && ! detect_info.checked)
8172                 {
8173                   if (detect_coding_iso_2022 (&coding, &detect_info))
8174                     {
8175                       /* We have scanned the whole data.  */
8176                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8177                         {
8178                           /* We didn't find an 8-bit code.  We may
8179                              have found a null-byte, but it's very
8180                              rare that a binary file conform to
8181                              ISO-2022.  */
8182                           src = src_end;
8183                           coding.head_ascii = src - coding.source;
8184                         }
8185                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8186                       break;
8187                     }
8188                 }
8189               else if (! c && !inhibit_null_byte_detection)
8190                 {
8191                   null_byte_found = 1;
8192                   if (eight_bit_found)
8193                     break;
8194                 }
8195               if (! eight_bit_found)
8196                 coding.head_ascii++;
8197             }
8198           else if (! eight_bit_found)
8199             coding.head_ascii++;
8200         }
8201
           /* Run the per-category detectors only when the scan above
              found something other than plain ASCII.  */
8202       if (null_byte_found || eight_bit_found
8203           || coding.head_ascii < coding.src_bytes
8204           || detect_info.found)
8205         {
8206           if (coding.head_ascii == coding.src_bytes)
8207             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8208             for (i = 0; i < coding_category_raw_text; i++)
8209               {
8210                 category = coding_priorities[i];
8211                 this = coding_categories + category;
8212                 if (detect_info.found & (1 << category))
8213                   break;
8214               }
8215           else
8216             {
8217               if (null_byte_found)
8218                 {
                       /* With a null byte present, mark every category
                          other than the UTF-16 ones as already checked
                          and rejected.  */
8219                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8220                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8221                 }
                   /* Try the categories in priority order.  When HIGHEST
                      is set, stop at the first category that is found.  */
8222               for (i = 0; i < coding_category_raw_text; i++)
8223                 {
8224                   category = coding_priorities[i];
8225                   this = coding_categories + category;
8226
8227                   if (this->id < 0)
8228                     {
8229                       /* No coding system of this category is defined.  */
8230                       detect_info.rejected |= (1 << category);
8231                     }
8232                   else if (category >= coding_category_raw_text)
8233                     continue;
8234                   else if (detect_info.checked & (1 << category))
8235                     {
8236                       if (highest
8237                           && (detect_info.found & (1 << category)))
8238                         break;
8239                     }
8240                   else if ((*(this->detector)) (&coding, &detect_info)
8241                            && highest
8242                            && (detect_info.found & (1 << category)))
8243                     {
                           /* For the utf-16-auto category, narrow CATEGORY
                              to the LE or BE category that was found.
                              THIS is left pointing at the auto entry.  */
8244                       if (category == coding_category_utf_16_auto)
8245                         {
8246                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8247                             category = coding_category_utf_16_le;
8248                           else
8249                             category = coding_category_utf_16_be;
8250                         }
8251                       break;
8252                     }
8253                 }
8254             }
8255         }
8256
           /* Turn the detection masks into VAL, a list of coding system
              ids (converted to names further below).  */
8257       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8258           || null_byte_found)
8259         {
               /* Everything was rejected, or a null byte was seen:
                  answer `no-conversion'.  */
8260           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8261           id = CODING_SYSTEM_ID (Qno_conversion);
8262           val = Fcons (make_number (id), Qnil);
8263         }
8264       else if (! detect_info.rejected && ! detect_info.found)
8265         {
               /* Nothing was found and nothing rejected: answer with the
                  coding system of the undecided category.  */
8266           detect_info.found = CATEGORY_MASK_ANY;
8267           id = coding_categories[coding_category_undecided].id;
8268           val = Fcons (make_number (id), Qnil);
8269         }
8270       else if (highest)
8271         {
8272           if (detect_info.found)
8273             {
8274               detect_info.found = 1 << category;
8275               val = Fcons (make_number (this->id), Qnil);
8276             }
8277           else
                 /* Nothing was positively found: take the first category
                    in priority order that was not rejected.  */
8278             for (i = 0; i < coding_category_raw_text; i++)
8279               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8280                 {
8281                   detect_info.found = 1 << coding_priorities[i];
8282                   id = coding_categories[coding_priorities[i]].id;
8283                   val = Fcons (make_number (id), Qnil);
8284                   break;
8285                 }
8286         }
8287       else
8288         {
8289           int mask = detect_info.rejected | detect_info.found;
8290           int found = 0;
8291
               /* Both loops walk from lowest to highest priority and
                  cons onto the front of VAL.  The first collects the
                  categories that were neither found nor rejected; the
                  second puts the found categories ahead of those.  */
8292           for (i = coding_category_raw_text - 1; i >= 0; i--)
8293             {
8294               category = coding_priorities[i];
8295               if (! (mask & (1 << category)))
8296                 {
8297                   found |= 1 << category;
8298                   id = coding_categories[category].id;
8299                   if (id >= 0)
8300                     val = Fcons (make_number (id), val);
8301                 }
8302             }
8303           for (i = coding_category_raw_text - 1; i >= 0; i--)
8304             {
8305               category = coding_priorities[i];
8306               if (detect_info.found & (1 << category))
8307                 {
8308                   id = coding_categories[category].id;
8309                   val = Fcons (make_number (id), val);
8310                 }
8311             }
8312           detect_info.found |= found;
8313         }
8314     }
8315   else if (base_category == coding_category_utf_8_auto)
8316     {
           /* Only decide between the with-signature and
              without-signature UTF-8 categories.  */
8317       if (detect_coding_utf_8 (&coding, &detect_info))
8318         {
8319           struct coding_system *this;
8320
8321           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8322             this = coding_categories + coding_category_utf_8_sig;
8323           else
8324             this = coding_categories + coding_category_utf_8_nosig;
8325           val = Fcons (make_number (this->id), Qnil);
8326         }
8327     }
8328   else if (base_category == coding_category_utf_16_auto)
8329     {
           /* Only decide which UTF-16 category applies.  */
8330       if (detect_coding_utf_16 (&coding, &detect_info))
8331         {
8332           struct coding_system *this;
8333
8334           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8335             this = coding_categories + coding_category_utf_16_le;
8336           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8337             this = coding_categories + coding_category_utf_16_be;
8338           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8339             this = coding_categories + coding_category_utf_16_be_nosig;
8340           else
8341             this = coding_categories + coding_category_utf_16_le_nosig;
8342           val = Fcons (make_number (this->id), Qnil);
8343         }
8344     }
8345   else
8346     {
           /* The text-format is already specified: use it as is.  */
8347       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8348       val = Fcons (make_number (coding.id), Qnil);
8349     }
8350
8351   /* Then, detect eol-format if necessary.  */
8352   {
8353     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8354     Lisp_Object tail;
8355
         /* A vector EOL_TYPE means the eol-format still has to be
            detected; otherwise it is one of the symbols tested in the
            else-branch and is applied to all three variables.  */
8356     if (VECTORP (eol_type))
8357       {
8358         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8359           {
                 /* Data containing a null byte is taken as LF without
                    scanning it.  */
8360             if (null_byte_found)
8361               normal_eol = EOL_SEEN_LF;
8362             else
8363               normal_eol = detect_eol (coding.source, src_bytes,
8364                                        coding_category_raw_text);
8365           }
8366         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8367                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8368           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8369                                       coding_category_utf_16_be);
8370         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8371                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8372           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8373                                       coding_category_utf_16_le);
8374       }
8375     else
8376       {
8377         if (EQ (eol_type, Qunix))
8378           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8379         else if (EQ (eol_type, Qdos))
8380           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8381         else
8382           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8383       }
8384
         /* Replace each id in VAL by a coding system name: element 0, 1
            or 2 of the system's eol-type vector for LF, CRLF or CR
            respectively, or the system's own name otherwise.  */
8385     for (tail = val; CONSP (tail); tail = XCDR (tail))
8386       {
8387         enum coding_category category;
8388         int this_eol;
8389
8390         id = XINT (XCAR (tail));
8391         attrs = CODING_ID_ATTRS (id);
8392         category = XINT (CODING_ATTR_CATEGORY (attrs));
8393         eol_type = CODING_ID_EOL_TYPE (id);
8394         if (VECTORP (eol_type))
8395           {
8396             if (category == coding_category_utf_16_be
8397                 || category == coding_category_utf_16_be_nosig)
8398               this_eol = utf_16_be_eol;
8399             else if (category == coding_category_utf_16_le
8400                      || category == coding_category_utf_16_le_nosig)
8401               this_eol = utf_16_le_eol;
8402             else
8403               this_eol = normal_eol;
8404
8405             if (this_eol == EOL_SEEN_LF)
8406               XSETCAR (tail, AREF (eol_type, 0));
8407             else if (this_eol == EOL_SEEN_CRLF)
8408               XSETCAR (tail, AREF (eol_type, 1));
8409             else if (this_eol == EOL_SEEN_CR)
8410               XSETCAR (tail, AREF (eol_type, 2));
8411             else
8412               XSETCAR (tail, CODING_ID_NAME (id));
8413           }
8414         else
8415           XSETCAR (tail, CODING_ID_NAME (id));
8416       }
8417   }
8418
8419   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8420 }
8421
8422
8423 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8424        2, 3, 0,
8425        doc: /* Detect coding system of the text in the region between START and END.
8426 Return a list of possible coding systems ordered by priority.
8427 The coding systems to try and their priorities follows what
8428 the function `coding-system-priority-list' (which see) returns.
8429
8430 If only ASCII characters are found (except for such ISO-2022 control
8431 characters as ESC), it returns a list of single element `undecided'
8432 or its subsidiary coding system according to a detected end-of-line
8433 format.
8434
8435 If optional argument HIGHEST is non-nil, return the coding system of
8436 highest priority.  */)
8437   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8438 {
8439   ptrdiff_t from, to;
8440   ptrdiff_t from_byte, to_byte;
8441
8442   validate_region (&start, &end);
8443   from = XINT (start), to = XINT (end);
8444   from_byte = CHAR_TO_BYTE (from);
8445   to_byte = CHAR_TO_BYTE (to);
8446
8447   if (from < GPT && to >= GPT)
8448     move_gap_both (to, to_byte);
8449
8450   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8451                                to - from, to_byte - from_byte,
8452                                !NILP (highest),
8453                                !NILP (BVAR (current_buffer
8454                                       , enable_multibyte_characters)),
8455                                Qnil);
8456 }
8457
8458 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8459        1, 2, 0,
8460        doc: /* Detect coding system of the text in STRING.
8461 Return a list of possible coding systems ordered by priority.
8462 The coding systems to try and their priorities follows what
8463 the function `coding-system-priority-list' (which see) returns.
8464
8465 If only ASCII characters are found (except for such ISO-2022 control
8466 characters as ESC), it returns a list of single element `undecided'
8467 or its subsidiary coding system according to a detected end-of-line
8468 format.
8469
8470 If optional argument HIGHEST is non-nil, return the coding system of
8471 highest priority.  */)
8472   (Lisp_Object string, Lisp_Object highest)
8473 {
8474   CHECK_STRING (string);
8475
8476   return detect_coding_system (SDATA (string),
8477                                SCHARS (string), SBYTES (string),
8478                                !NILP (highest), STRING_MULTIBYTE (string),
8479                                Qnil);
8480 }
8481
8482
8483 static bool
8484 char_encodable_p (int c, Lisp_Object attrs)
8485 {
8486   Lisp_Object tail;
8487   struct charset *charset;
8488   Lisp_Object translation_table;
8489
8490   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8491   if (! NILP (translation_table))
8492     c = translate_char (translation_table, c);
8493   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8494        CONSP (tail); tail = XCDR (tail))
8495     {
8496       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8497       if (CHAR_CHARSET_P (c, charset))
8498         break;
8499     }
8500   return (! NILP (tail));
8501 }
8502
8503
8504 /* Return a list of coding systems that safely encode the text between
8505    START and END.  If EXCLUDE is non-nil, it is a list of coding
8506    systems not to check.  The returned list doesn't contain any such
8507    coding systems.  In any case, if the text contains only ASCII or is
8508    unibyte, return t.
     
     START may also be a string, in which case the whole string is
     checked and END is not used.  Otherwise the returned list always
     ends with `raw-text' and `no-conversion'.  */
8509
8510 DEFUN ("find-coding-systems-region-internal",
8511        Ffind_coding_systems_region_internal,
8512        Sfind_coding_systems_region_internal, 2, 3, 0,
8513        doc: /* Internal use only.  */)
8514   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8515 {
8516   Lisp_Object coding_attrs_list, safe_codings;
8517   ptrdiff_t start_byte, end_byte;
8518   const unsigned char *p, *pbeg, *pend;
8519   int c;
8520   Lisp_Object tail, elt, work_table;
8521
8522   if (STRINGP (start))
8523     {
           /* A unibyte string, or a multibyte one whose character and
              byte counts are equal, needs no check.  */
8524       if (!STRING_MULTIBYTE (start)
8525           || SCHARS (start) == SBYTES (start))
8526         return Qt;
8527       start_byte = 0;
8528       end_byte = SBYTES (start);
8529     }
8530   else
8531     {
8532       CHECK_NUMBER_COERCE_MARKER (start);
8533       CHECK_NUMBER_COERCE_MARKER (end);
8534       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8535         args_out_of_range (start, end);
8536       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8537         return Qt;
8538       start_byte = CHAR_TO_BYTE (XINT (start));
8539       end_byte = CHAR_TO_BYTE (XINT (end));
8540       if (XINT (end) - XINT (start) == end_byte - start_byte)
8541         return Qt;
8542
           /* If the gap lies strictly inside the region, move it to
              whichever end of the region is nearer.  */
8543       if (XINT (start) < GPT && XINT (end) > GPT)
8544         {
8545           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8546             move_gap_both (XINT (start), start_byte);
8547           else
8548             move_gap_both (XINT (end), end_byte);
8549         }
8550     }
8551
       /* Collect the attribute vectors of the candidates: every entry of
          Vcoding_system_list that is not in EXCLUDE, is its own base
          name, and is not of type `undecided'.  The translation table
          for each candidate is stored into its attributes here.  */
8552   coding_attrs_list = Qnil;
8553   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8554     if (NILP (exclude)
8555         || NILP (Fmemq (XCAR (tail), exclude)))
8556       {
8557         Lisp_Object attrs;
8558
8559         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8560         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8561             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8562           {
8563             ASET (attrs, coding_attr_trans_tbl,
8564                   get_translation_table (attrs, 1, NULL));
8565             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8566           }
8567       }
8568
8569   if (STRINGP (start))
8570     p = pbeg = SDATA (start);
8571   else
8572     p = pbeg = BYTE_POS_ADDR (start_byte);
8573   pend = p + (end_byte - start_byte);
8574
       /* Trim the ASCII bytes at both ends of the text.  */
8575   while (p < pend && ASCII_BYTE_P (*p)) p++;
8576   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8577
       /* WORK_TABLE records the characters that were already checked.  */
8578   work_table = Fmake_char_table (Qnil, Qnil);
8579   while (p < pend)
8580     {
8581       if (ASCII_BYTE_P (*p))
8582         p++;
8583       else
8584         {
8585           c = STRING_CHAR_ADVANCE (p);
8586           if (!NILP (char_table_ref (work_table, c)))
8587             /* This character was already checked.  Ignore it.  */
8588             continue;
8589
8590           charset_map_loaded = 0;
               /* Drop every candidate that cannot encode C.  An element
                  is removed by copying its successor into its cons
                  cell, without advancing TAIL so the copied element is
                  examined next.  The last cell has no successor, so its
                  car is set to nil and skipped from then on.  */
8591           for (tail = coding_attrs_list; CONSP (tail);)
8592             {
8593               elt = XCAR (tail);
8594               if (NILP (elt))
8595                 tail = XCDR (tail);
8596               else if (char_encodable_p (c, elt))
8597                 tail = XCDR (tail);
8598               else if (CONSP (XCDR (tail)))
8599                 {
8600                   XSETCAR (tail, XCAR (XCDR (tail)));
8601                   XSETCDR (tail, XCDR (XCDR (tail)));
8602                 }
8603               else
8604                 {
8605                   XSETCAR (tail, Qnil);
8606                   tail = XCDR (tail);
8607                 }
8608             }
               /* charset_map_loaded was cleared before the checks above.
                  If it is set now, recompute P and PEND from a freshly
                  fetched base address, keeping their offsets
                  (presumably because loading a charset map can
                  relocate the text -- confirm).  */
8609           if (charset_map_loaded)
8610             {
8611               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8612
8613               if (STRINGP (start))
8614                 pbeg = SDATA (start);
8615               else
8616                 pbeg = BYTE_POS_ADDR (start_byte);
8617               p = pbeg + p_offset;
8618               pend = pbeg + pend_offset;
8619             }
8620           char_table_set (work_table, c, Qt);
8621         }
8622     }
8623
       /* The result is the base names of the surviving candidates,
          followed by `raw-text' and `no-conversion'.  */
8624   safe_codings = list2 (Qraw_text, Qno_conversion);
8625   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8626     if (! NILP (XCAR (tail)))
8627       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8628
8629   return safe_codings;
8630 }
8631
8632
8633 DEFUN ("unencodable-char-position", Funencodable_char_position,
8634        Sunencodable_char_position, 3, 5, 0,
8635        doc: /*
8636 Return position of first un-encodable character in a region.
8637 START and END specify the region and CODING-SYSTEM specifies the
8638 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8639
8640 If optional 4th argument COUNT is non-nil, it specifies at most how
8641 many un-encodable characters to search.  In this case, the value is a
8642 list of positions.
8643
8644 If optional 5th argument STRING is non-nil, it is a string to search
8645 for un-encodable characters.  In that case, START and END are indexes
8646 to the string.  */)
8647   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8648 {
8649   EMACS_INT n;
8650   struct coding_system coding;
8651   Lisp_Object attrs, charset_list, translation_table;
8652   Lisp_Object positions;
8653   ptrdiff_t from, to;
8654   const unsigned char *p, *stop, *pend;
8655   bool ascii_compatible;
8656
8657   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8658   attrs = CODING_ID_ATTRS (coding.id);
       /* A coding system of type `raw-text' never reports a position.  */
8659   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8660     return Qnil;
8661   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8662   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8663   translation_table = get_translation_table (attrs, 1, NULL);
8664
8665   if (NILP (string))
8666     {
           /* Scan the current buffer.  Nothing to report if it is
              unibyte, or if the coding system is ASCII compatible and
              the region has as many bytes as characters.  */
8667       validate_region (&start, &end);
8668       from = XINT (start);
8669       to = XINT (end);
8670       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8671           || (ascii_compatible
8672               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8673         return Qnil;
8674       p = CHAR_POS_ADDR (from);
8675       pend = CHAR_POS_ADDR (to);
           /* When the region spans the gap, scan up to the gap first.
              The loop below resumes at GAP_END_ADDR.  */
8676       if (from < GPT && to >= GPT)
8677         stop = GPT_ADDR;
8678       else
8679         stop = pend;
8680     }
8681   else
8682     {
           /* Scan STRING; START and END are character indexes into it.  */
8683       CHECK_STRING (string);
8684       CHECK_NATNUM (start);
8685       CHECK_NATNUM (end);
8686       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
8687         args_out_of_range_3 (string, start, end);
8688       from = XINT (start);
8689       to = XINT (end);
8690       if (! STRING_MULTIBYTE (string))
8691         return Qnil;
8692       p = SDATA (string) + string_char_to_byte (string, from);
8693       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8694       if (ascii_compatible && (to - from) == (pend - p))
8695         return Qnil;
8696     }
8697
       /* N is the number of positions still to collect; 1 when COUNT is
          nil.  */
8698   if (NILP (count))
8699     n = 1;
8700   else
8701     {
8702       CHECK_NATNUM (count);
8703       n = XINT (count);
8704     }
8705
8706   positions = Qnil;
8707   charset_map_loaded = 0;
       /* P walks the bytes while FROM tracks the matching character
          position.  */
8708   while (1)
8709     {
8710       int c;
8711
8712       if (ascii_compatible)
8713         while (p < stop && ASCII_BYTE_P (*p))
8714           p++, from++;
8715       if (p >= stop)
8716         {
8717           if (p >= pend)
8718             break;
               /* Reached the gap: continue with the text after it.  */
8719           stop = pend;
8720           p = GAP_END_ADDR;
8721         }
8722
8723       c = STRING_CHAR_ADVANCE (p);
8724       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8725           && ! char_charset (translate_char (translation_table, c),
8726                              charset_list, NULL))
8727         {
8728           positions = Fcons (make_number (from), positions);
8729           n--;
8730           if (n == 0)
8731             break;
8732         }
8733
8734       from++;
           /* In the buffer case, if charset_map_loaded got set, fetch
              the scan pointers again from the character positions
              (presumably because loading a charset map can relocate
              buffer text -- confirm).  */
8735       if (charset_map_loaded && NILP (string))
8736         {
8737           p = CHAR_POS_ADDR (from);
8738           pend = CHAR_POS_ADDR (to);
8739           if (from < GPT && to >= GPT)
8740             stop = GPT_ADDR;
8741           else
8742             stop = pend;
8743           charset_map_loaded = 0;
8744         }
8745     }
8746
       /* POSITIONS was built newest-first: without COUNT its car is the
          single position found (or nil), otherwise reverse the list.  */
8747   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8748 }
8749
8750
8751 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8752        Scheck_coding_systems_region, 3, 3, 0,
8753        doc: /* Check if the region is encodable by coding systems.
8754
8755 START and END are buffer positions specifying the region.
8756 CODING-SYSTEM-LIST is a list of coding systems to check.
8757
8758 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8759 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8760 whole region, POS0, POS1, ... are buffer positions where non-encodable
8761 characters are found.
8762
8763 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8764 value is nil.
8765
8766 START may be a string.  In that case, check if the string is
8767 encodable, and the value contains indices to the string instead of
8768 buffer positions.  END is ignored.
8769
8770 If the current buffer (or START if it is a string) is unibyte, the value
8771 is nil.  */)
8772   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8773 {
8774   Lisp_Object list;
8775   ptrdiff_t start_byte, end_byte;
8776   ptrdiff_t pos;
8777   const unsigned char *p, *pbeg, *pend;
8778   int c;
8779   Lisp_Object tail, elt, attrs;
8780
8781   if (STRINGP (start))
8782     {
8783       if (!STRING_MULTIBYTE (start)
8784           || SCHARS (start) == SBYTES (start))
8785         return Qnil;
8786       start_byte = 0;
8787       end_byte = SBYTES (start);
8788       pos = 0;
8789     }
8790   else
8791     {
8792       CHECK_NUMBER_COERCE_MARKER (start);
8793       CHECK_NUMBER_COERCE_MARKER (end);
8794       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8795         args_out_of_range (start, end);
8796       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8797         return Qnil;
8798       start_byte = CHAR_TO_BYTE (XINT (start));
8799       end_byte = CHAR_TO_BYTE (XINT (end));
8800       if (XINT (end) - XINT (start) == end_byte - start_byte)
8801         return Qnil;
8802
8803       if (XINT (start) < GPT && XINT (end) > GPT)
8804         {
8805           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8806             move_gap_both (XINT (start), start_byte);
8807           else
8808             move_gap_both (XINT (end), end_byte);
8809         }
8810       pos = XINT (start);
8811     }
8812
8813   list = Qnil;
8814   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8815     {
8816       elt = XCAR (tail);
8817       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8818       ASET (attrs, coding_attr_trans_tbl,
8819             get_translation_table (attrs, 1, NULL));
8820       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8821     }
8822
8823   if (STRINGP (start))
8824     p = pbeg = SDATA (start);
8825   else
8826     p = pbeg = BYTE_POS_ADDR (start_byte);
8827   pend = p + (end_byte - start_byte);
8828
8829   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8830   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8831
8832   while (p < pend)
8833     {
8834       if (ASCII_BYTE_P (*p))
8835         p++;
8836       else
8837         {
8838           c = STRING_CHAR_ADVANCE (p);
8839
8840           charset_map_loaded = 0;
8841           for (tail = list; CONSP (tail); tail = XCDR (tail))
8842             {
8843               elt = XCDR (XCAR (tail));
8844               if (! char_encodable_p (c, XCAR (elt)))
8845                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8846             }
8847           if (charset_map_loaded)
8848             {
8849               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
8850
8851               if (STRINGP (start))
8852                 pbeg = SDATA (start);
8853               else
8854                 pbeg = BYTE_POS_ADDR (start_byte);
8855               p = pbeg + p_offset;
8856               pend = pbeg + pend_offset;
8857             }
8858         }
8859       pos++;
8860     }
8861
8862   tail = list;
8863   list = Qnil;
8864   for (; CONSP (tail); tail = XCDR (tail))
8865     {
8866       elt = XCAR (tail);
8867       if (CONSP (XCDR (XCDR (elt))))
8868         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8869                       list);
8870     }
8871
8872   return list;
8873 }
8874
8875
8876 static Lisp_Object
8877 code_convert_region (Lisp_Object start, Lisp_Object end,
8878                      Lisp_Object coding_system, Lisp_Object dst_object,
8879                      bool encodep, bool norecord)
8880 {
8881   struct coding_system coding;
8882   ptrdiff_t from, from_byte, to, to_byte;
8883   Lisp_Object src_object;
8884
8885   if (NILP (coding_system))
8886     coding_system = Qno_conversion;
8887   else
8888     CHECK_CODING_SYSTEM (coding_system);
8889   src_object = Fcurrent_buffer ();
8890   if (NILP (dst_object))
8891     dst_object = src_object;
8892   else if (! EQ (dst_object, Qt))
8893     CHECK_BUFFER (dst_object);
8894
8895   validate_region (&start, &end);
8896   from = XFASTINT (start);
8897   from_byte = CHAR_TO_BYTE (from);
8898   to = XFASTINT (end);
8899   to_byte = CHAR_TO_BYTE (to);
8900
8901   setup_coding_system (coding_system, &coding);
8902   coding.mode |= CODING_MODE_LAST_BLOCK;
8903
8904   if (encodep)
8905     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8906                           dst_object);
8907   else
8908     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8909                           dst_object);
8910   if (! norecord)
8911     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8912
8913   return (BUFFERP (dst_object)
8914           ? make_number (coding.produced_char)
8915           : coding.dst_object);
8916 }
8917
8918
8919 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8920        3, 4, "r\nzCoding system: ",
8921        doc: /* Decode the current region from the specified coding system.
8922 When called from a program, takes four arguments:
8923         START, END, CODING-SYSTEM, and DESTINATION.
8924 START and END are buffer positions.
8925
8926 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8927 If nil, the region between START and END is replaced by the decoded text.
8928 If buffer, the decoded text is inserted in that buffer after point (point
8929 does not move).
8930 In those cases, the length of the decoded text is returned.
8931 If DESTINATION is t, the decoded text is returned.
8932
8933 This function sets `last-coding-system-used' to the precise coding system
8934 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8935 not fully specified.)  */)
8936   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8937 {
8938   return code_convert_region (start, end, coding_system, destination, 0, 0);
8939 }
8940
8941 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8942        3, 4, "r\nzCoding system: ",
8943        doc: /* Encode the current region by specified coding system.
8944 When called from a program, takes four arguments:
8945         START, END, CODING-SYSTEM and DESTINATION.
8946 START and END are buffer positions.
8947
8948 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8949 If nil, the region between START and END is replace by the encoded text.
8950 If buffer, the encoded text is inserted in that buffer after point (point
8951 does not move).
8952 In those cases, the length of the encoded text is returned.
8953 If DESTINATION is t, the encoded text is returned.
8954
8955 This function sets `last-coding-system-used' to the precise coding system
8956 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8957 not fully specified.)  */)
8958   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8959 {
8960   return code_convert_region (start, end, coding_system, destination, 1, 0);
8961 }
8962
8963 Lisp_Object
8964 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8965                      Lisp_Object dst_object, bool encodep, bool nocopy,
8966                      bool norecord)
8967 {
8968   struct coding_system coding;
8969   ptrdiff_t chars, bytes;
8970
8971   CHECK_STRING (string);
8972   if (NILP (coding_system))
8973     {
8974       if (! norecord)
8975         Vlast_coding_system_used = Qno_conversion;
8976       if (NILP (dst_object))
8977         return (nocopy ? Fcopy_sequence (string) : string);
8978     }
8979
8980   if (NILP (coding_system))
8981     coding_system = Qno_conversion;
8982   else
8983     CHECK_CODING_SYSTEM (coding_system);
8984   if (NILP (dst_object))
8985     dst_object = Qt;
8986   else if (! EQ (dst_object, Qt))
8987     CHECK_BUFFER (dst_object);
8988
8989   setup_coding_system (coding_system, &coding);
8990   coding.mode |= CODING_MODE_LAST_BLOCK;
8991   chars = SCHARS (string);
8992   bytes = SBYTES (string);
8993   if (encodep)
8994     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8995   else
8996     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8997   if (! norecord)
8998     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8999
9000   return (BUFFERP (dst_object)
9001           ? make_number (coding.produced_char)
9002           : coding.dst_object);
9003 }
9004
9005
9006 /* Encode or decode STRING according to CODING_SYSTEM.
9007    Do not set Vlast_coding_system_used.
9008
9009    This function is called only from macros DECODE_FILE and
9010    ENCODE_FILE, thus we ignore character composition.  */
9011
9012 Lisp_Object
9013 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9014                               bool encodep)
9015 {
9016   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9017 }
9018
9019
9020 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9021        2, 4, 0,
9022        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9023
9024 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9025 if the decoding operation is trivial.
9026
9027 Optional fourth arg BUFFER non-nil means that the decoded text is
9028 inserted in that buffer after point (point does not move).  In this
9029 case, the return value is the length of the decoded text.
9030
9031 This function sets `last-coding-system-used' to the precise coding system
9032 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9033 not fully specified.)  */)
9034   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9035 {
9036   return code_convert_string (string, coding_system, buffer,
9037                               0, ! NILP (nocopy), 0);
9038 }
9039
9040 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9041        2, 4, 0,
9042        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9043
9044 Optional third arg NOCOPY non-nil means it is OK to return STRING
9045 itself if the encoding operation is trivial.
9046
9047 Optional fourth arg BUFFER non-nil means that the encoded text is
9048 inserted in that buffer after point (point does not move).  In this
9049 case, the return value is the length of the encoded text.
9050
9051 This function sets `last-coding-system-used' to the precise coding system
9052 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9053 not fully specified.)  */)
9054   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9055 {
9056   return code_convert_string (string, coding_system, buffer,
9057                               1, ! NILP (nocopy), 0);
9058 }
9059
9060 \f
9061 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9062        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9063 Return the corresponding character.  */)
9064   (Lisp_Object code)
9065 {
9066   Lisp_Object spec, attrs, val;
9067   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9068   EMACS_INT ch;
9069   int c;
9070
9071   CHECK_NATNUM (code);
9072   ch = XFASTINT (code);
9073   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9074   attrs = AREF (spec, 0);
9075
9076   if (ASCII_BYTE_P (ch)
9077       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9078     return code;
9079
9080   val = CODING_ATTR_CHARSET_LIST (attrs);
9081   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9082   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9083   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9084
9085   if (ch <= 0x7F)
9086     {
9087       c = ch;
9088       charset = charset_roman;
9089     }
9090   else if (ch >= 0xA0 && ch < 0xDF)
9091     {
9092       c = ch - 0x80;
9093       charset = charset_kana;
9094     }
9095   else
9096     {
9097       EMACS_INT c1 = ch >> 8;
9098       int c2 = ch & 0xFF;
9099
9100       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9101           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9102         error ("Invalid code: %"pI"d", ch);
9103       c = ch;
9104       SJIS_TO_JIS (c);
9105       charset = charset_kanji;
9106     }
9107   c = DECODE_CHAR (charset, c);
9108   if (c < 0)
9109     error ("Invalid code: %"pI"d", ch);
9110   return make_number (c);
9111 }
9112
9113
9114 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9115        doc: /* Encode a Japanese character CH to shift_jis encoding.
9116 Return the corresponding code in SJIS.  */)
9117   (Lisp_Object ch)
9118 {
9119   Lisp_Object spec, attrs, charset_list;
9120   int c;
9121   struct charset *charset;
9122   unsigned code;
9123
9124   CHECK_CHARACTER (ch);
9125   c = XFASTINT (ch);
9126   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9127   attrs = AREF (spec, 0);
9128
9129   if (ASCII_CHAR_P (c)
9130       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9131     return ch;
9132
9133   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9134   charset = char_charset (c, charset_list, &code);
9135   if (code == CHARSET_INVALID_CODE (charset))
9136     error ("Can't encode by shift_jis encoding: %c", c);
9137   JIS_TO_SJIS (code);
9138
9139   return make_number (code);
9140 }
9141
9142 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9143        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9144 Return the corresponding character.  */)
9145   (Lisp_Object code)
9146 {
9147   Lisp_Object spec, attrs, val;
9148   struct charset *charset_roman, *charset_big5, *charset;
9149   EMACS_INT ch;
9150   int c;
9151
9152   CHECK_NATNUM (code);
9153   ch = XFASTINT (code);
9154   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9155   attrs = AREF (spec, 0);
9156
9157   if (ASCII_BYTE_P (ch)
9158       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9159     return code;
9160
9161   val = CODING_ATTR_CHARSET_LIST (attrs);
9162   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9163   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9164
9165   if (ch <= 0x7F)
9166     {
9167       c = ch;
9168       charset = charset_roman;
9169     }
9170   else
9171     {
9172       EMACS_INT b1 = ch >> 8;
9173       int b2 = ch & 0x7F;
9174       if (b1 < 0xA1 || b1 > 0xFE
9175           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9176         error ("Invalid code: %"pI"d", ch);
9177       c = ch;
9178       charset = charset_big5;
9179     }
9180   c = DECODE_CHAR (charset, c);
9181   if (c < 0)
9182     error ("Invalid code: %"pI"d", ch);
9183   return make_number (c);
9184 }
9185
9186 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9187        doc: /* Encode the Big5 character CH to BIG5 coding system.
9188 Return the corresponding character code in Big5.  */)
9189   (Lisp_Object ch)
9190 {
9191   Lisp_Object spec, attrs, charset_list;
9192   struct charset *charset;
9193   int c;
9194   unsigned code;
9195
9196   CHECK_CHARACTER (ch);
9197   c = XFASTINT (ch);
9198   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9199   attrs = AREF (spec, 0);
9200   if (ASCII_CHAR_P (c)
9201       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9202     return ch;
9203
9204   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9205   charset = char_charset (c, charset_list, &code);
9206   if (code == CHARSET_INVALID_CODE (charset))
9207     error ("Can't encode by Big5 encoding: %c", c);
9208
9209   return make_number (code);
9210 }
9211
9212 \f
9213 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9214        Sset_terminal_coding_system_internal, 1, 2, 0,
9215        doc: /* Internal use only.  */)
9216   (Lisp_Object coding_system, Lisp_Object terminal)
9217 {
9218   struct terminal *term = get_terminal (terminal, 1);
9219   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9220   CHECK_SYMBOL (coding_system);
9221   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9222   /* We had better not send unsafe characters to terminal.  */
9223   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9224   /* Character composition should be disabled.  */
9225   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9226   terminal_coding->src_multibyte = 1;
9227   terminal_coding->dst_multibyte = 0;
9228   tset_charset_list
9229     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9230             ? coding_charset_list (terminal_coding)
9231             : Fcons (make_number (charset_ascii), Qnil)));
9232   return Qnil;
9233 }
9234
9235 DEFUN ("set-safe-terminal-coding-system-internal",
9236        Fset_safe_terminal_coding_system_internal,
9237        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9238        doc: /* Internal use only.  */)
9239   (Lisp_Object coding_system)
9240 {
9241   CHECK_SYMBOL (coding_system);
9242   setup_coding_system (Fcheck_coding_system (coding_system),
9243                        &safe_terminal_coding);
9244   /* Character composition should be disabled.  */
9245   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9246   safe_terminal_coding.src_multibyte = 1;
9247   safe_terminal_coding.dst_multibyte = 0;
9248   return Qnil;
9249 }
9250
9251 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9252        Sterminal_coding_system, 0, 1, 0,
9253        doc: /* Return coding system specified for terminal output on the given terminal.
9254 TERMINAL may be a terminal object, a frame, or nil for the selected
9255 frame's terminal device.  */)
9256   (Lisp_Object terminal)
9257 {
9258   struct coding_system *terminal_coding
9259     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9260   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9261
9262   /* For backward compatibility, return nil if it is `undecided'.  */
9263   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9264 }
9265
9266 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9267        Sset_keyboard_coding_system_internal, 1, 2, 0,
9268        doc: /* Internal use only.  */)
9269   (Lisp_Object coding_system, Lisp_Object terminal)
9270 {
9271   struct terminal *t = get_terminal (terminal, 1);
9272   CHECK_SYMBOL (coding_system);
9273   if (NILP (coding_system))
9274     coding_system = Qno_conversion;
9275   else
9276     Fcheck_coding_system (coding_system);
9277   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9278   /* Character composition should be disabled.  */
9279   TERMINAL_KEYBOARD_CODING (t)->common_flags
9280     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9281   return Qnil;
9282 }
9283
9284 DEFUN ("keyboard-coding-system",
9285        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9286        doc: /* Return coding system specified for decoding keyboard input.  */)
9287   (Lisp_Object terminal)
9288 {
9289   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9290                          (get_terminal (terminal, 1))->id);
9291 }
9292
9293 \f
9294 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9295        Sfind_operation_coding_system,  1, MANY, 0,
9296        doc: /* Choose a coding system for an operation based on the target name.
9297 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9298 DECODING-SYSTEM is the coding system to use for decoding
9299 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9300 for encoding (in case OPERATION does encoding).
9301
9302 The first argument OPERATION specifies an I/O primitive:
9303   For file I/O, `insert-file-contents' or `write-region'.
9304   For process I/O, `call-process', `call-process-region', or `start-process'.
9305   For network I/O, `open-network-stream'.
9306
9307 The remaining arguments should be the same arguments that were passed
9308 to the primitive.  Depending on which primitive, one of those arguments
9309 is selected as the TARGET.  For example, if OPERATION does file I/O,
9310 whichever argument specifies the file name is TARGET.
9311
9312 TARGET has a meaning which depends on OPERATION:
9313   For file I/O, TARGET is a file name (except for the special case below).
9314   For process I/O, TARGET is a process name.
9315   For network I/O, TARGET is a service name or a port number.
9316
9317 This function looks up what is specified for TARGET in
9318 `file-coding-system-alist', `process-coding-system-alist',
9319 or `network-coding-system-alist' depending on OPERATION.
9320 They may specify a coding system, a cons of coding systems,
9321 or a function symbol to call.
9322 In the last case, we call the function with one argument,
9323 which is a list of all the arguments given to this function.
9324 If the function can't decide a coding system, it can return
9325 `undecided' so that the normal code-detection is performed.
9326
9327 If OPERATION is `insert-file-contents', the argument corresponding to
9328 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9329 file name to look up, and BUFFER is a buffer that contains the file's
9330 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9331 function to call for FILENAME, that function should examine the
9332 contents of BUFFER instead of reading the file.
9333
9334 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9335   (ptrdiff_t nargs, Lisp_Object *args)
9336 {
9337   Lisp_Object operation, target_idx, target, val;
9338   register Lisp_Object chain;
9339
9340   if (nargs < 2)
9341     error ("Too few arguments");
9342   operation = args[0];
9343   if (!SYMBOLP (operation)
9344       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9345     error ("Invalid first argument");
9346   if (nargs <= 1 + XFASTINT (target_idx))
9347     error ("Too few arguments for operation `%s'",
9348            SDATA (SYMBOL_NAME (operation)));
9349   target = args[XFASTINT (target_idx) + 1];
9350   if (!(STRINGP (target)
9351         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9352             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9353         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9354     error ("Invalid argument %"pI"d of operation `%s'",
9355            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9356   if (CONSP (target))
9357     target = XCAR (target);
9358
9359   chain = ((EQ (operation, Qinsert_file_contents)
9360             || EQ (operation, Qwrite_region))
9361            ? Vfile_coding_system_alist
9362            : (EQ (operation, Qopen_network_stream)
9363               ? Vnetwork_coding_system_alist
9364               : Vprocess_coding_system_alist));
9365   if (NILP (chain))
9366     return Qnil;
9367
9368   for (; CONSP (chain); chain = XCDR (chain))
9369     {
9370       Lisp_Object elt;
9371
9372       elt = XCAR (chain);
9373       if (CONSP (elt)
9374           && ((STRINGP (target)
9375                && STRINGP (XCAR (elt))
9376                && fast_string_match (XCAR (elt), target) >= 0)
9377               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9378         {
9379           val = XCDR (elt);
9380           /* Here, if VAL is both a valid coding system and a valid
9381              function symbol, we return VAL as a coding system.  */
9382           if (CONSP (val))
9383             return val;
9384           if (! SYMBOLP (val))
9385             return Qnil;
9386           if (! NILP (Fcoding_system_p (val)))
9387             return Fcons (val, val);
9388           if (! NILP (Ffboundp (val)))
9389             {
9390               /* We use call1 rather than safe_call1
9391                  so as to get bug reports about functions called here
9392                  which don't handle the current interface.  */
9393               val = call1 (val, Flist (nargs, args));
9394               if (CONSP (val))
9395                 return val;
9396               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9397                 return Fcons (val, val);
9398             }
9399           return Qnil;
9400         }
9401     }
9402   return Qnil;
9403 }
9404
9405 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9406        Sset_coding_system_priority, 0, MANY, 0,
9407        doc: /* Assign higher priority to the coding systems given as arguments.
9408 If multiple coding systems belong to the same category,
9409 all but the first one are ignored.
9410
9411 usage: (set-coding-system-priority &rest coding-systems)  */)
9412   (ptrdiff_t nargs, Lisp_Object *args)
9413 {
9414   ptrdiff_t i, j;
9415   bool changed[coding_category_max];
9416   enum coding_category priorities[coding_category_max];
9417
9418   memset (changed, 0, sizeof changed);
9419
9420   for (i = j = 0; i < nargs; i++)
9421     {
9422       enum coding_category category;
9423       Lisp_Object spec, attrs;
9424
9425       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9426       attrs = AREF (spec, 0);
9427       category = XINT (CODING_ATTR_CATEGORY (attrs));
9428       if (changed[category])
9429         /* Ignore this coding system because a coding system of the
9430            same category already had a higher priority.  */
9431         continue;
9432       changed[category] = 1;
9433       priorities[j++] = category;
9434       if (coding_categories[category].id >= 0
9435           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9436         setup_coding_system (args[i], &coding_categories[category]);
9437       Fset (AREF (Vcoding_category_table, category), args[i]);
9438     }
9439
9440   /* Now we have decided top J priorities.  Reflect the order of the
9441      original priorities to the remaining priorities.  */
9442
9443   for (i = j, j = 0; i < coding_category_max; i++, j++)
9444     {
9445       while (j < coding_category_max
9446              && changed[coding_priorities[j]])
9447         j++;
9448       if (j == coding_category_max)
9449         emacs_abort ();
9450       priorities[i] = coding_priorities[j];
9451     }
9452
9453   memcpy (coding_priorities, priorities, sizeof priorities);
9454
9455   /* Update `coding-category-list'.  */
9456   Vcoding_category_list = Qnil;
9457   for (i = coding_category_max; i-- > 0; )
9458     Vcoding_category_list
9459       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9460                Vcoding_category_list);
9461
9462   return Qnil;
9463 }
9464
9465 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9466        Scoding_system_priority_list, 0, 1, 0,
9467        doc: /* Return a list of coding systems ordered by their priorities.
9468 The list contains a subset of coding systems; i.e. coding systems
9469 assigned to each coding category (see `coding-category-list').
9470
9471 HIGHESTP non-nil means just return the highest priority one.  */)
9472   (Lisp_Object highestp)
9473 {
9474   int i;
9475   Lisp_Object val;
9476
9477   for (i = 0, val = Qnil; i < coding_category_max; i++)
9478     {
9479       enum coding_category category = coding_priorities[i];
9480       int id = coding_categories[category].id;
9481       Lisp_Object attrs;
9482
9483       if (id < 0)
9484         continue;
9485       attrs = CODING_ID_ATTRS (id);
9486       if (! NILP (highestp))
9487         return CODING_ATTR_BASE_NAME (attrs);
9488       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9489     }
9490   return Fnreverse (val);
9491 }
9492
9493 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9494
9495 static Lisp_Object
9496 make_subsidiaries (Lisp_Object base)
9497 {
9498   Lisp_Object subsidiaries;
9499   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
9500   char *buf = alloca (base_name_len + 6);
9501   int i;
9502
9503   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9504   subsidiaries = make_uninit_vector (3);
9505   for (i = 0; i < 3; i++)
9506     {
9507       strcpy (buf + base_name_len, suffixes[i]);
9508       ASET (subsidiaries, i, intern (buf));
9509     }
9510   return subsidiaries;
9511 }
9512
9513
9514 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9515        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9516        doc: /* For internal use only.
9517 usage: (define-coding-system-internal ...)  */)
9518   (ptrdiff_t nargs, Lisp_Object *args)
9519 {
9520   Lisp_Object name;
9521   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9522   Lisp_Object attrs;            /* Vector of attributes.  */
9523   Lisp_Object eol_type;
9524   Lisp_Object aliases;
9525   Lisp_Object coding_type, charset_list, safe_charsets;
9526   enum coding_category category;
9527   Lisp_Object tail, val;
9528   int max_charset_id = 0;
9529   int i;
9530
9531   if (nargs < coding_arg_max)
9532     goto short_args;
9533
9534   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9535
9536   name = args[coding_arg_name];
9537   CHECK_SYMBOL (name);
9538   ASET (attrs, coding_attr_base_name, name);
9539
9540   val = args[coding_arg_mnemonic];
9541   if (! STRINGP (val))
9542     CHECK_CHARACTER (val);
9543   ASET (attrs, coding_attr_mnemonic, val);
9544
9545   coding_type = args[coding_arg_coding_type];
9546   CHECK_SYMBOL (coding_type);
9547   ASET (attrs, coding_attr_type, coding_type);
9548
9549   charset_list = args[coding_arg_charset_list];
9550   if (SYMBOLP (charset_list))
9551     {
9552       if (EQ (charset_list, Qiso_2022))
9553         {
9554           if (! EQ (coding_type, Qiso_2022))
9555             error ("Invalid charset-list");
9556           charset_list = Viso_2022_charset_list;
9557         }
9558       else if (EQ (charset_list, Qemacs_mule))
9559         {
9560           if (! EQ (coding_type, Qemacs_mule))
9561             error ("Invalid charset-list");
9562           charset_list = Vemacs_mule_charset_list;
9563         }
9564       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9565         {
9566           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
9567             error ("Invalid charset-list");
9568           if (max_charset_id < XFASTINT (XCAR (tail)))
9569             max_charset_id = XFASTINT (XCAR (tail));
9570         }
9571     }
9572   else
9573     {
9574       charset_list = Fcopy_sequence (charset_list);
9575       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9576         {
9577           struct charset *charset;
9578
9579           val = XCAR (tail);
9580           CHECK_CHARSET_GET_CHARSET (val, charset);
9581           if (EQ (coding_type, Qiso_2022)
9582               ? CHARSET_ISO_FINAL (charset) < 0
9583               : EQ (coding_type, Qemacs_mule)
9584               ? CHARSET_EMACS_MULE_ID (charset) < 0
9585               : 0)
9586             error ("Can't handle charset `%s'",
9587                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9588
9589           XSETCAR (tail, make_number (charset->id));
9590           if (max_charset_id < charset->id)
9591             max_charset_id = charset->id;
9592         }
9593     }
9594   ASET (attrs, coding_attr_charset_list, charset_list);
9595
9596   safe_charsets = make_uninit_string (max_charset_id + 1);
9597   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9598   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9599     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9600   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
9601
9602   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
9603
9604   val = args[coding_arg_decode_translation_table];
9605   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9606     CHECK_SYMBOL (val);
9607   ASET (attrs, coding_attr_decode_tbl, val);
9608
9609   val = args[coding_arg_encode_translation_table];
9610   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9611     CHECK_SYMBOL (val);
9612   ASET (attrs, coding_attr_encode_tbl, val);
9613
9614   val = args[coding_arg_post_read_conversion];
9615   CHECK_SYMBOL (val);
9616   ASET (attrs, coding_attr_post_read, val);
9617
9618   val = args[coding_arg_pre_write_conversion];
9619   CHECK_SYMBOL (val);
9620   ASET (attrs, coding_attr_pre_write, val);
9621
9622   val = args[coding_arg_default_char];
9623   if (NILP (val))
9624     ASET (attrs, coding_attr_default_char, make_number (' '));
9625   else
9626     {
9627       CHECK_CHARACTER (val);
9628       ASET (attrs, coding_attr_default_char, val);
9629     }
9630
9631   val = args[coding_arg_for_unibyte];
9632   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
9633
9634   val = args[coding_arg_plist];
9635   CHECK_LIST (val);
9636   ASET (attrs, coding_attr_plist, val);
9637
9638   if (EQ (coding_type, Qcharset))
9639     {
9640       /* Generate a lisp vector of 256 elements.  Each element is nil,
9641          integer, or a list of charset IDs.
9642
9643          If Nth element is nil, the byte code N is invalid in this
9644          coding system.
9645
9646          If Nth element is a number NUM, N is the first byte of a
9647          charset whose ID is NUM.
9648
9649          If Nth element is a list of charset IDs, N is the first byte
9650          of one of them.  The list is sorted by dimensions of the
9651          charsets.  A charset of smaller dimension comes first. */
9652       val = Fmake_vector (make_number (256), Qnil);
9653
9654       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9655         {
9656           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9657           int dim = CHARSET_DIMENSION (charset);
9658           int idx = (dim - 1) * 4;
9659
9660           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9661             ASET (attrs, coding_attr_ascii_compat, Qt);
9662
9663           for (i = charset->code_space[idx];
9664                i <= charset->code_space[idx + 1]; i++)
9665             {
9666               Lisp_Object tmp, tmp2;
9667               int dim2;
9668
9669               tmp = AREF (val, i);
9670               if (NILP (tmp))
9671                 tmp = XCAR (tail);
9672               else if (NUMBERP (tmp))
9673                 {
9674                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9675                   if (dim < dim2)
9676                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9677                   else
9678                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9679                 }
9680               else
9681                 {
9682                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9683                     {
9684                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9685                       if (dim < dim2)
9686                         break;
9687                     }
9688                   if (NILP (tmp2))
9689                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9690                   else
9691                     {
9692                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9693                       XSETCAR (tmp2, XCAR (tail));
9694                     }
9695                 }
9696               ASET (val, i, tmp);
9697             }
9698         }
9699       ASET (attrs, coding_attr_charset_valids, val);
9700       category = coding_category_charset;
9701     }
9702   else if (EQ (coding_type, Qccl))
9703     {
9704       Lisp_Object valids;
9705
9706       if (nargs < coding_arg_ccl_max)
9707         goto short_args;
9708
9709       val = args[coding_arg_ccl_decoder];
9710       CHECK_CCL_PROGRAM (val);
9711       if (VECTORP (val))
9712         val = Fcopy_sequence (val);
9713       ASET (attrs, coding_attr_ccl_decoder, val);
9714
9715       val = args[coding_arg_ccl_encoder];
9716       CHECK_CCL_PROGRAM (val);
9717       if (VECTORP (val))
9718         val = Fcopy_sequence (val);
9719       ASET (attrs, coding_attr_ccl_encoder, val);
9720
9721       val = args[coding_arg_ccl_valids];
9722       valids = Fmake_string (make_number (256), make_number (0));
9723       for (tail = val; CONSP (tail); tail = XCDR (tail))
9724         {
9725           int from, to;
9726
9727           val = XCAR (tail);
9728           if (INTEGERP (val))
9729             {
9730               if (! (0 <= XINT (val) && XINT (val) <= 255))
9731                 args_out_of_range_3 (val, make_number (0), make_number (255));
9732               from = to = XINT (val);
9733             }
9734           else
9735             {
9736               CHECK_CONS (val);
9737               CHECK_NATNUM_CAR (val);
9738               CHECK_NUMBER_CDR (val);
9739               if (XINT (XCAR (val)) > 255)
9740                 args_out_of_range_3 (XCAR (val),
9741                                      make_number (0), make_number (255));
9742               from = XINT (XCAR (val));
9743               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
9744                 args_out_of_range_3 (XCDR (val),
9745                                      XCAR (val), make_number (255));
9746               to = XINT (XCDR (val));
9747             }
9748           for (i = from; i <= to; i++)
9749             SSET (valids, i, 1);
9750         }
9751       ASET (attrs, coding_attr_ccl_valids, valids);
9752
9753       category = coding_category_ccl;
9754     }
9755   else if (EQ (coding_type, Qutf_16))
9756     {
9757       Lisp_Object bom, endian;
9758
9759       ASET (attrs, coding_attr_ascii_compat, Qnil);
9760
9761       if (nargs < coding_arg_utf16_max)
9762         goto short_args;
9763
9764       bom = args[coding_arg_utf16_bom];
9765       if (! NILP (bom) && ! EQ (bom, Qt))
9766         {
9767           CHECK_CONS (bom);
9768           val = XCAR (bom);
9769           CHECK_CODING_SYSTEM (val);
9770           val = XCDR (bom);
9771           CHECK_CODING_SYSTEM (val);
9772         }
9773       ASET (attrs, coding_attr_utf_bom, bom);
9774
9775       endian = args[coding_arg_utf16_endian];
9776       CHECK_SYMBOL (endian);
9777       if (NILP (endian))
9778         endian = Qbig;
9779       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9780         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9781       ASET (attrs, coding_attr_utf_16_endian, endian);
9782
9783       category = (CONSP (bom)
9784                   ? coding_category_utf_16_auto
9785                   : NILP (bom)
9786                   ? (EQ (endian, Qbig)
9787                      ? coding_category_utf_16_be_nosig
9788                      : coding_category_utf_16_le_nosig)
9789                   : (EQ (endian, Qbig)
9790                      ? coding_category_utf_16_be
9791                      : coding_category_utf_16_le));
9792     }
9793   else if (EQ (coding_type, Qiso_2022))
9794     {
9795       Lisp_Object initial, reg_usage, request, flags;
9796
9797       if (nargs < coding_arg_iso2022_max)
9798         goto short_args;
9799
9800       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9801       CHECK_VECTOR (initial);
9802       for (i = 0; i < 4; i++)
9803         {
9804           val = AREF (initial, i);
9805           if (! NILP (val))
9806             {
9807               struct charset *charset;
9808
9809               CHECK_CHARSET_GET_CHARSET (val, charset);
9810               ASET (initial, i, make_number (CHARSET_ID (charset)));
9811               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9812                 ASET (attrs, coding_attr_ascii_compat, Qt);
9813             }
9814           else
9815             ASET (initial, i, make_number (-1));
9816         }
9817
9818       reg_usage = args[coding_arg_iso2022_reg_usage];
9819       CHECK_CONS (reg_usage);
9820       CHECK_NUMBER_CAR (reg_usage);
9821       CHECK_NUMBER_CDR (reg_usage);
9822
9823       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9824       for (tail = request; CONSP (tail); tail = XCDR (tail))
9825         {
9826           int id;
9827           Lisp_Object tmp1;
9828
9829           val = XCAR (tail);
9830           CHECK_CONS (val);
9831           tmp1 = XCAR (val);
9832           CHECK_CHARSET_GET_ID (tmp1, id);
9833           CHECK_NATNUM_CDR (val);
9834           if (XINT (XCDR (val)) >= 4)
9835             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
9836           XSETCAR (val, make_number (id));
9837         }
9838
9839       flags = args[coding_arg_iso2022_flags];
9840       CHECK_NATNUM (flags);
9841       i = XINT (flags) & INT_MAX;
9842       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9843         i |= CODING_ISO_FLAG_FULL_SUPPORT;
9844       flags = make_number (i);
9845
9846       ASET (attrs, coding_attr_iso_initial, initial);
9847       ASET (attrs, coding_attr_iso_usage, reg_usage);
9848       ASET (attrs, coding_attr_iso_request, request);
9849       ASET (attrs, coding_attr_iso_flags, flags);
9850       setup_iso_safe_charsets (attrs);
9851
9852       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9853         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9854                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9855                     ? coding_category_iso_7_else
9856                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9857                     ? coding_category_iso_7
9858                     : coding_category_iso_7_tight);
9859       else
9860         {
9861           int id = XINT (AREF (initial, 1));
9862
9863           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9864                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9865                        || id < 0)
9866                       ? coding_category_iso_8_else
9867                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9868                       ? coding_category_iso_8_1
9869                       : coding_category_iso_8_2);
9870         }
9871       if (category != coding_category_iso_8_1
9872           && category != coding_category_iso_8_2)
9873         ASET (attrs, coding_attr_ascii_compat, Qnil);
9874     }
9875   else if (EQ (coding_type, Qemacs_mule))
9876     {
9877       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9878         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9879       ASET (attrs, coding_attr_ascii_compat, Qt);
9880       category = coding_category_emacs_mule;
9881     }
9882   else if (EQ (coding_type, Qshift_jis))
9883     {
9884
9885       struct charset *charset;
9886
9887       if (XINT (Flength (charset_list)) != 3
9888           && XINT (Flength (charset_list)) != 4)
9889         error ("There should be three or four charsets");
9890
9891       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9892       if (CHARSET_DIMENSION (charset) != 1)
9893         error ("Dimension of charset %s is not one",
9894                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9895       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9896         ASET (attrs, coding_attr_ascii_compat, Qt);
9897
9898       charset_list = XCDR (charset_list);
9899       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9900       if (CHARSET_DIMENSION (charset) != 1)
9901         error ("Dimension of charset %s is not one",
9902                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9903
9904       charset_list = XCDR (charset_list);
9905       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9906       if (CHARSET_DIMENSION (charset) != 2)
9907         error ("Dimension of charset %s is not two",
9908                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9909
9910       charset_list = XCDR (charset_list);
9911       if (! NILP (charset_list))
9912         {
9913           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9914           if (CHARSET_DIMENSION (charset) != 2)
9915             error ("Dimension of charset %s is not two",
9916                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9917         }
9918
9919       category = coding_category_sjis;
9920       Vsjis_coding_system = name;
9921     }
9922   else if (EQ (coding_type, Qbig5))
9923     {
9924       struct charset *charset;
9925
9926       if (XINT (Flength (charset_list)) != 2)
9927         error ("There should be just two charsets");
9928
9929       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9930       if (CHARSET_DIMENSION (charset) != 1)
9931         error ("Dimension of charset %s is not one",
9932                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9933       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9934         ASET (attrs, coding_attr_ascii_compat, Qt);
9935
9936       charset_list = XCDR (charset_list);
9937       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9938       if (CHARSET_DIMENSION (charset) != 2)
9939         error ("Dimension of charset %s is not two",
9940                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9941
9942       category = coding_category_big5;
9943       Vbig5_coding_system = name;
9944     }
9945   else if (EQ (coding_type, Qraw_text))
9946     {
9947       category = coding_category_raw_text;
9948       ASET (attrs, coding_attr_ascii_compat, Qt);
9949     }
9950   else if (EQ (coding_type, Qutf_8))
9951     {
9952       Lisp_Object bom;
9953
9954       if (nargs < coding_arg_utf8_max)
9955         goto short_args;
9956
9957       bom = args[coding_arg_utf8_bom];
9958       if (! NILP (bom) && ! EQ (bom, Qt))
9959         {
9960           CHECK_CONS (bom);
9961           val = XCAR (bom);
9962           CHECK_CODING_SYSTEM (val);
9963           val = XCDR (bom);
9964           CHECK_CODING_SYSTEM (val);
9965         }
9966       ASET (attrs, coding_attr_utf_bom, bom);
9967       if (NILP (bom))
9968         ASET (attrs, coding_attr_ascii_compat, Qt);
9969
9970       category = (CONSP (bom) ? coding_category_utf_8_auto
9971                   : NILP (bom) ? coding_category_utf_8_nosig
9972                   : coding_category_utf_8_sig);
9973     }
9974   else if (EQ (coding_type, Qundecided))
9975     category = coding_category_undecided;
9976   else
9977     error ("Invalid coding system type: %s",
9978            SDATA (SYMBOL_NAME (coding_type)));
9979
9980   ASET (attrs, coding_attr_category, make_number (category));
9981   ASET (attrs, coding_attr_plist,
9982         Fcons (QCcategory,
9983                Fcons (AREF (Vcoding_category_table, category),
9984                       CODING_ATTR_PLIST (attrs))));
9985   ASET (attrs, coding_attr_plist,
9986         Fcons (QCascii_compatible_p,
9987                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9988                       CODING_ATTR_PLIST (attrs))));
9989
9990   eol_type = args[coding_arg_eol_type];
9991   if (! NILP (eol_type)
9992       && ! EQ (eol_type, Qunix)
9993       && ! EQ (eol_type, Qdos)
9994       && ! EQ (eol_type, Qmac))
9995     error ("Invalid eol-type");
9996
9997   aliases = Fcons (name, Qnil);
9998
9999   if (NILP (eol_type))
10000     {
10001       eol_type = make_subsidiaries (name);
10002       for (i = 0; i < 3; i++)
10003         {
10004           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10005
10006           this_name = AREF (eol_type, i);
10007           this_aliases = Fcons (this_name, Qnil);
10008           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10009           this_spec = make_uninit_vector (3);
10010           ASET (this_spec, 0, attrs);
10011           ASET (this_spec, 1, this_aliases);
10012           ASET (this_spec, 2, this_eol_type);
10013           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10014           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10015           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10016           if (NILP (val))
10017             Vcoding_system_alist
10018               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10019                        Vcoding_system_alist);
10020         }
10021     }
10022
10023   spec_vec = make_uninit_vector (3);
10024   ASET (spec_vec, 0, attrs);
10025   ASET (spec_vec, 1, aliases);
10026   ASET (spec_vec, 2, eol_type);
10027
10028   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10029   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10030   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10031   if (NILP (val))
10032     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10033                                   Vcoding_system_alist);
10034
10035   {
10036     int id = coding_categories[category].id;
10037
10038     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10039       setup_coding_system (name, &coding_categories[category]);
10040   }
10041
10042   return Qnil;
10043
10044  short_args:
10045   return Fsignal (Qwrong_number_of_arguments,
10046                   Fcons (intern ("define-coding-system-internal"),
10047                          make_number (nargs)));
10048 }
10049
10050
10051 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10052        3, 3, 0,
10053        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10054   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10055 {
10056   Lisp_Object spec, attrs;
10057
10058   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10059   attrs = AREF (spec, 0);
10060   if (EQ (prop, QCmnemonic))
10061     {
10062       if (! STRINGP (val))
10063         CHECK_CHARACTER (val);
10064       ASET (attrs, coding_attr_mnemonic, val);
10065     }
10066   else if (EQ (prop, QCdefault_char))
10067     {
10068       if (NILP (val))
10069         val = make_number (' ');
10070       else
10071         CHECK_CHARACTER (val);
10072       ASET (attrs, coding_attr_default_char, val);
10073     }
10074   else if (EQ (prop, QCdecode_translation_table))
10075     {
10076       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10077         CHECK_SYMBOL (val);
10078       ASET (attrs, coding_attr_decode_tbl, val);
10079     }
10080   else if (EQ (prop, QCencode_translation_table))
10081     {
10082       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10083         CHECK_SYMBOL (val);
10084       ASET (attrs, coding_attr_encode_tbl, val);
10085     }
10086   else if (EQ (prop, QCpost_read_conversion))
10087     {
10088       CHECK_SYMBOL (val);
10089       ASET (attrs, coding_attr_post_read, val);
10090     }
10091   else if (EQ (prop, QCpre_write_conversion))
10092     {
10093       CHECK_SYMBOL (val);
10094       ASET (attrs, coding_attr_pre_write, val);
10095     }
10096   else if (EQ (prop, QCascii_compatible_p))
10097     {
10098       ASET (attrs, coding_attr_ascii_compat, val);
10099     }
10100
10101   ASET (attrs, coding_attr_plist,
10102         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10103   return val;
10104 }
10105
10106
10107 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10108        Sdefine_coding_system_alias, 2, 2, 0,
10109        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10110   (Lisp_Object alias, Lisp_Object coding_system)
10111 {
10112   Lisp_Object spec, aliases, eol_type, val;
10113
10114   CHECK_SYMBOL (alias);
10115   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10116   aliases = AREF (spec, 1);
10117   /* ALIASES should be a list of length more than zero, and the first
10118      element is a base coding system.  Append ALIAS at the tail of the
10119      list.  */
10120   while (!NILP (XCDR (aliases)))
10121     aliases = XCDR (aliases);
10122   XSETCDR (aliases, Fcons (alias, Qnil));
10123
10124   eol_type = AREF (spec, 2);
10125   if (VECTORP (eol_type))
10126     {
10127       Lisp_Object subsidiaries;
10128       int i;
10129
10130       subsidiaries = make_subsidiaries (alias);
10131       for (i = 0; i < 3; i++)
10132         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10133                                      AREF (eol_type, i));
10134     }
10135
10136   Fputhash (alias, spec, Vcoding_system_hash_table);
10137   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10138   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10139   if (NILP (val))
10140     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10141                                   Vcoding_system_alist);
10142
10143   return Qnil;
10144 }
10145
10146 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10147        1, 1, 0,
10148        doc: /* Return the base of CODING-SYSTEM.
10149 Any alias or subsidiary coding system is not a base coding system.  */)
10150   (Lisp_Object coding_system)
10151 {
10152   Lisp_Object spec, attrs;
10153
10154   if (NILP (coding_system))
10155     return (Qno_conversion);
10156   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10157   attrs = AREF (spec, 0);
10158   return CODING_ATTR_BASE_NAME (attrs);
10159 }
10160
10161 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10162        1, 1, 0,
10163        doc: "Return the property list of CODING-SYSTEM.")
10164   (Lisp_Object coding_system)
10165 {
10166   Lisp_Object spec, attrs;
10167
10168   if (NILP (coding_system))
10169     coding_system = Qno_conversion;
10170   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10171   attrs = AREF (spec, 0);
10172   return CODING_ATTR_PLIST (attrs);
10173 }
10174
10175
10176 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10177        1, 1, 0,
10178        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10179   (Lisp_Object coding_system)
10180 {
10181   Lisp_Object spec;
10182
10183   if (NILP (coding_system))
10184     coding_system = Qno_conversion;
10185   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10186   return AREF (spec, 1);
10187 }
10188
10189 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10190        Scoding_system_eol_type, 1, 1, 0,
10191        doc: /* Return eol-type of CODING-SYSTEM.
10192 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10193
10194 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10195 and CR respectively.
10196
10197 A vector value indicates that a format of end-of-line should be
10198 detected automatically.  Nth element of the vector is the subsidiary
10199 coding system whose eol-type is N.  */)
10200   (Lisp_Object coding_system)
10201 {
10202   Lisp_Object spec, eol_type;
10203   int n;
10204
10205   if (NILP (coding_system))
10206     coding_system = Qno_conversion;
10207   if (! CODING_SYSTEM_P (coding_system))
10208     return Qnil;
10209   spec = CODING_SYSTEM_SPEC (coding_system);
10210   eol_type = AREF (spec, 2);
10211   if (VECTORP (eol_type))
10212     return Fcopy_sequence (eol_type);
10213   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10214   return make_number (n);
10215 }
10216
10217 #endif /* emacs */
10218
10219 \f
/*** 12. Postamble ***/
10221
10222 void
10223 init_coding_once (void)
10224 {
10225   int i;
10226
10227   for (i = 0; i < coding_category_max; i++)
10228     {
10229       coding_categories[i].id = -1;
10230       coding_priorities[i] = i;
10231     }
10232
10233   /* ISO2022 specific initialize routine.  */
10234   for (i = 0; i < 0x20; i++)
10235     iso_code_class[i] = ISO_control_0;
10236   for (i = 0x21; i < 0x7F; i++)
10237     iso_code_class[i] = ISO_graphic_plane_0;
10238   for (i = 0x80; i < 0xA0; i++)
10239     iso_code_class[i] = ISO_control_1;
10240   for (i = 0xA1; i < 0xFF; i++)
10241     iso_code_class[i] = ISO_graphic_plane_1;
10242   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10243   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10244   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10245   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10246   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10247   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10248   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10249   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10250   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10251
10252   for (i = 0; i < 256; i++)
10253     {
10254       emacs_mule_bytes[i] = 1;
10255     }
10256   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10257   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10258   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10259   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10260 }
10261
10262 #ifdef emacs
10263
10264 void
10265 syms_of_coding (void)
10266 {
10267   staticpro (&Vcoding_system_hash_table);
10268   {
10269     Lisp_Object args[2];
10270     args[0] = QCtest;
10271     args[1] = Qeq;
10272     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10273   }
10274
10275   staticpro (&Vsjis_coding_system);
10276   Vsjis_coding_system = Qnil;
10277
10278   staticpro (&Vbig5_coding_system);
10279   Vbig5_coding_system = Qnil;
10280
10281   staticpro (&Vcode_conversion_reused_workbuf);
10282   Vcode_conversion_reused_workbuf = Qnil;
10283
10284   staticpro (&Vcode_conversion_workbuf_name);
10285   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10286
10287   reused_workbuf_in_use = 0;
10288
10289   DEFSYM (Qcharset, "charset");
10290   DEFSYM (Qtarget_idx, "target-idx");
10291   DEFSYM (Qcoding_system_history, "coding-system-history");
10292   Fset (Qcoding_system_history, Qnil);
10293
10294   /* Target FILENAME is the first argument.  */
10295   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10296   /* Target FILENAME is the third argument.  */
10297   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10298
10299   DEFSYM (Qcall_process, "call-process");
10300   /* Target PROGRAM is the first argument.  */
10301   Fput (Qcall_process, Qtarget_idx, make_number (0));
10302
10303   DEFSYM (Qcall_process_region, "call-process-region");
10304   /* Target PROGRAM is the third argument.  */
10305   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10306
10307   DEFSYM (Qstart_process, "start-process");
10308   /* Target PROGRAM is the third argument.  */
10309   Fput (Qstart_process, Qtarget_idx, make_number (2));
10310
10311   DEFSYM (Qopen_network_stream, "open-network-stream");
10312   /* Target SERVICE is the fourth argument.  */
10313   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10314
10315   DEFSYM (Qcoding_system, "coding-system");
10316   DEFSYM (Qcoding_aliases, "coding-aliases");
10317
10318   DEFSYM (Qeol_type, "eol-type");
10319   DEFSYM (Qunix, "unix");
10320   DEFSYM (Qdos, "dos");
10321   DEFSYM (Qmac, "mac");
10322
10323   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10324   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10325   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10326   DEFSYM (Qdefault_char, "default-char");
10327   DEFSYM (Qundecided, "undecided");
10328   DEFSYM (Qno_conversion, "no-conversion");
10329   DEFSYM (Qraw_text, "raw-text");
10330
10331   DEFSYM (Qiso_2022, "iso-2022");
10332
10333   DEFSYM (Qutf_8, "utf-8");
10334   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10335
10336 #if defined (WINDOWSNT) || defined (CYGWIN)
10337   /* No, not utf-16-le: that one has a BOM.  */
10338   DEFSYM (Qutf_16le, "utf-16le");
10339 #endif
10340
10341   DEFSYM (Qutf_16, "utf-16");
10342   DEFSYM (Qbig, "big");
10343   DEFSYM (Qlittle, "little");
10344
10345   DEFSYM (Qshift_jis, "shift-jis");
10346   DEFSYM (Qbig5, "big5");
10347
10348   DEFSYM (Qcoding_system_p, "coding-system-p");
10349
10350   DEFSYM (Qcoding_system_error, "coding-system-error");
10351   Fput (Qcoding_system_error, Qerror_conditions,
10352         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10353   Fput (Qcoding_system_error, Qerror_message,
10354         build_pure_c_string ("Invalid coding system"));
10355
10356   /* Intern this now in case it isn't already done.
10357      Setting this variable twice is harmless.
10358      But don't staticpro it here--that is done in alloc.c.  */
10359   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10360
10361   DEFSYM (Qtranslation_table, "translation-table");
10362   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10363   DEFSYM (Qtranslation_table_id, "translation-table-id");
10364   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10365   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10366
10367   DEFSYM (Qvalid_codes, "valid-codes");
10368
10369   DEFSYM (Qemacs_mule, "emacs-mule");
10370
10371   DEFSYM (QCcategory, ":category");
10372   DEFSYM (QCmnemonic, ":mnemonic");
10373   DEFSYM (QCdefault_char, ":default-char");
10374   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10375   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10376   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10377   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10378   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10379
10380   Vcoding_category_table
10381     = Fmake_vector (make_number (coding_category_max), Qnil);
10382   staticpro (&Vcoding_category_table);
10383   /* Followings are target of code detection.  */
10384   ASET (Vcoding_category_table, coding_category_iso_7,
10385         intern_c_string ("coding-category-iso-7"));
10386   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10387         intern_c_string ("coding-category-iso-7-tight"));
10388   ASET (Vcoding_category_table, coding_category_iso_8_1,
10389         intern_c_string ("coding-category-iso-8-1"));
10390   ASET (Vcoding_category_table, coding_category_iso_8_2,
10391         intern_c_string ("coding-category-iso-8-2"));
10392   ASET (Vcoding_category_table, coding_category_iso_7_else,
10393         intern_c_string ("coding-category-iso-7-else"));
10394   ASET (Vcoding_category_table, coding_category_iso_8_else,
10395         intern_c_string ("coding-category-iso-8-else"));
10396   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10397         intern_c_string ("coding-category-utf-8-auto"));
10398   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10399         intern_c_string ("coding-category-utf-8"));
10400   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10401         intern_c_string ("coding-category-utf-8-sig"));
10402   ASET (Vcoding_category_table, coding_category_utf_16_be,
10403         intern_c_string ("coding-category-utf-16-be"));
10404   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10405         intern_c_string ("coding-category-utf-16-auto"));
10406   ASET (Vcoding_category_table, coding_category_utf_16_le,
10407         intern_c_string ("coding-category-utf-16-le"));
10408   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10409         intern_c_string ("coding-category-utf-16-be-nosig"));
10410   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10411         intern_c_string ("coding-category-utf-16-le-nosig"));
10412   ASET (Vcoding_category_table, coding_category_charset,
10413         intern_c_string ("coding-category-charset"));
10414   ASET (Vcoding_category_table, coding_category_sjis,
10415         intern_c_string ("coding-category-sjis"));
10416   ASET (Vcoding_category_table, coding_category_big5,
10417         intern_c_string ("coding-category-big5"));
10418   ASET (Vcoding_category_table, coding_category_ccl,
10419         intern_c_string ("coding-category-ccl"));
10420   ASET (Vcoding_category_table, coding_category_emacs_mule,
10421         intern_c_string ("coding-category-emacs-mule"));
10422   /* Followings are NOT target of code detection.  */
10423   ASET (Vcoding_category_table, coding_category_raw_text,
10424         intern_c_string ("coding-category-raw-text"));
10425   ASET (Vcoding_category_table, coding_category_undecided,
10426         intern_c_string ("coding-category-undecided"));
10427
10428   DEFSYM (Qinsufficient_source, "insufficient-source");
10429   DEFSYM (Qinvalid_source, "invalid-source");
10430   DEFSYM (Qinterrupted, "interrupted");
10431   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10432
10433   defsubr (&Scoding_system_p);
10434   defsubr (&Sread_coding_system);
10435   defsubr (&Sread_non_nil_coding_system);
10436   defsubr (&Scheck_coding_system);
10437   defsubr (&Sdetect_coding_region);
10438   defsubr (&Sdetect_coding_string);
10439   defsubr (&Sfind_coding_systems_region_internal);
10440   defsubr (&Sunencodable_char_position);
10441   defsubr (&Scheck_coding_systems_region);
10442   defsubr (&Sdecode_coding_region);
10443   defsubr (&Sencode_coding_region);
10444   defsubr (&Sdecode_coding_string);
10445   defsubr (&Sencode_coding_string);
10446   defsubr (&Sdecode_sjis_char);
10447   defsubr (&Sencode_sjis_char);
10448   defsubr (&Sdecode_big5_char);
10449   defsubr (&Sencode_big5_char);
10450   defsubr (&Sset_terminal_coding_system_internal);
10451   defsubr (&Sset_safe_terminal_coding_system_internal);
10452   defsubr (&Sterminal_coding_system);
10453   defsubr (&Sset_keyboard_coding_system_internal);
10454   defsubr (&Skeyboard_coding_system);
10455   defsubr (&Sfind_operation_coding_system);
10456   defsubr (&Sset_coding_system_priority);
10457   defsubr (&Sdefine_coding_system_internal);
10458   defsubr (&Sdefine_coding_system_alias);
10459   defsubr (&Scoding_system_put);
10460   defsubr (&Scoding_system_base);
10461   defsubr (&Scoding_system_plist);
10462   defsubr (&Scoding_system_aliases);
10463   defsubr (&Scoding_system_eol_type);
10464   defsubr (&Scoding_system_priority_list);
10465
10466   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10467                doc: /* List of coding systems.
10468
10469 Do not alter the value of this variable manually.  This variable should be
10470 updated by the functions `define-coding-system' and
10471 `define-coding-system-alias'.  */);
10472   Vcoding_system_list = Qnil;
10473
10474   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10475                doc: /* Alist of coding system names.
10476 Each element is one element list of coding system name.
10477 This variable is given to `completing-read' as COLLECTION argument.
10478
10479 Do not alter the value of this variable manually.  This variable should be
10480 updated by the functions `make-coding-system' and
10481 `define-coding-system-alias'.  */);
10482   Vcoding_system_alist = Qnil;
10483
10484   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10485                doc: /* List of coding-categories (symbols) ordered by priority.
10486
10487 On detecting a coding system, Emacs tries code detection algorithms
10488 associated with each coding-category one by one in this order.  When
10489 one algorithm agrees with a byte sequence of source text, the coding
10490 system bound to the corresponding coding-category is selected.
10491
10492 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10493   {
10494     int i;
10495
10496     Vcoding_category_list = Qnil;
10497     for (i = coding_category_max - 1; i >= 0; i--)
10498       Vcoding_category_list
10499         = Fcons (AREF (Vcoding_category_table, i),
10500                  Vcoding_category_list);
10501   }
10502
10503   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10504                doc: /* Specify the coding system for read operations.
10505 It is useful to bind this variable with `let', but do not set it globally.
10506 If the value is a coding system, it is used for decoding on read operation.
10507 If not, an appropriate element is used from one of the coding system alists.
10508 There are three such tables: `file-coding-system-alist',
10509 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10510   Vcoding_system_for_read = Qnil;
10511
10512   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10513                doc: /* Specify the coding system for write operations.
10514 Programs bind this variable with `let', but you should not set it globally.
10515 If the value is a coding system, it is used for encoding of output,
10516 when writing it to a file and when sending it to a file or subprocess.
10517
10518 If this does not specify a coding system, an appropriate element
10519 is used from one of the coding system alists.
10520 There are three such tables: `file-coding-system-alist',
10521 `process-coding-system-alist', and `network-coding-system-alist'.
10522 For output to files, if the above procedure does not specify a coding system,
10523 the value of `buffer-file-coding-system' is used.  */);
10524   Vcoding_system_for_write = Qnil;
10525
10526   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10527                doc: /*
10528 Coding system used in the latest file or process I/O.  */);
10529   Vlast_coding_system_used = Qnil;
10530
10531   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10532                doc: /*
10533 Error status of the last code conversion.
10534
10535 When an error was detected in the last code conversion, this variable
10536 is set to one of the following symbols.
10537   `insufficient-source'
10538   `inconsistent-eol'
10539   `invalid-source'
10540   `interrupted'
10541   `insufficient-memory'
10542 When no error was detected, the value doesn't change.  So, to check
10543 the error status of a code conversion by this variable, you must
10544 explicitly set this variable to nil before performing code
10545 conversion.  */);
10546   Vlast_code_conversion_error = Qnil;
10547
10548   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10549                doc: /*
10550 *Non-nil means always inhibit code conversion of end-of-line format.
10551 See info node `Coding Systems' and info node `Text and Binary' concerning
10552 such conversion.  */);
10553   inhibit_eol_conversion = 0;
10554
10555   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10556                doc: /*
10557 Non-nil means process buffer inherits coding system of process output.
10558 Bind it to t if the process output is to be treated as if it were a file
10559 read from some filesystem.  */);
10560   inherit_process_coding_system = 0;
10561
10562   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10563                doc: /*
10564 Alist to decide a coding system to use for a file I/O operation.
10565 The format is ((PATTERN . VAL) ...),
10566 where PATTERN is a regular expression matching a file name,
10567 VAL is a coding system, a cons of coding systems, or a function symbol.
10568 If VAL is a coding system, it is used for both decoding and encoding
10569 the file contents.
10570 If VAL is a cons of coding systems, the car part is used for decoding,
10571 and the cdr part is used for encoding.
10572 If VAL is a function symbol, the function must return a coding system
10573 or a cons of coding systems which are used as above.  The function is
10574 called with an argument that is a list of the arguments with which
10575 `find-operation-coding-system' was called.  If the function can't decide
10576 a coding system, it can return `undecided' so that the normal
10577 code-detection is performed.
10578
10579 See also the function `find-operation-coding-system'
10580 and the variable `auto-coding-alist'.  */);
10581   Vfile_coding_system_alist = Qnil;
10582
10583   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10584                doc: /*
10585 Alist to decide a coding system to use for a process I/O operation.
10586 The format is ((PATTERN . VAL) ...),
10587 where PATTERN is a regular expression matching a program name,
10588 VAL is a coding system, a cons of coding systems, or a function symbol.
10589 If VAL is a coding system, it is used for both decoding what received
10590 from the program and encoding what sent to the program.
10591 If VAL is a cons of coding systems, the car part is used for decoding,
10592 and the cdr part is used for encoding.
10593 If VAL is a function symbol, the function must return a coding system
10594 or a cons of coding systems which are used as above.
10595
10596 See also the function `find-operation-coding-system'.  */);
10597   Vprocess_coding_system_alist = Qnil;
10598
10599   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10600                doc: /*
10601 Alist to decide a coding system to use for a network I/O operation.
10602 The format is ((PATTERN . VAL) ...),
10603 where PATTERN is a regular expression matching a network service name
10604 or is a port number to connect to,
10605 VAL is a coding system, a cons of coding systems, or a function symbol.
10606 If VAL is a coding system, it is used for both decoding what received
10607 from the network stream and encoding what sent to the network stream.
10608 If VAL is a cons of coding systems, the car part is used for decoding,
10609 and the cdr part is used for encoding.
10610 If VAL is a function symbol, the function must return a coding system
10611 or a cons of coding systems which are used as above.
10612
10613 See also the function `find-operation-coding-system'.  */);
10614   Vnetwork_coding_system_alist = Qnil;
10615
10616   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10617                doc: /* Coding system to use with system messages.
10618 Also used for decoding keyboard input on X Window system.  */);
10619   Vlocale_coding_system = Qnil;
10620
10621   /* The eol mnemonics are reset in startup.el system-dependently.  */
10622   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10623                doc: /*
10624 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10625   eol_mnemonic_unix = build_pure_c_string (":");
10626
10627   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10628                doc: /*
10629 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10630   eol_mnemonic_dos = build_pure_c_string ("\\");
10631
10632   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10633                doc: /*
10634 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10635   eol_mnemonic_mac = build_pure_c_string ("/");
10636
10637   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10638                doc: /*
10639 *String displayed in mode line when end-of-line format is not yet determined.  */);
10640   eol_mnemonic_undecided = build_pure_c_string (":");
10641
10642   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10643                doc: /*
10644 *Non-nil enables character translation while encoding and decoding.  */);
10645   Venable_character_translation = Qt;
10646
10647   DEFVAR_LISP ("standard-translation-table-for-decode",
10648                Vstandard_translation_table_for_decode,
10649                doc: /* Table for translating characters while decoding.  */);
10650   Vstandard_translation_table_for_decode = Qnil;
10651
10652   DEFVAR_LISP ("standard-translation-table-for-encode",
10653                Vstandard_translation_table_for_encode,
10654                doc: /* Table for translating characters while encoding.  */);
10655   Vstandard_translation_table_for_encode = Qnil;
10656
10657   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10658                doc: /* Alist of charsets vs revision numbers.
10659 While encoding, if a charset (car part of an element) is found,
10660 designate it with the escape sequence identifying revision (cdr part
10661 of the element).  */);
10662   Vcharset_revision_table = Qnil;
10663
10664   DEFVAR_LISP ("default-process-coding-system",
10665                Vdefault_process_coding_system,
10666                doc: /* Cons of coding systems used for process I/O by default.
10667 The car part is used for decoding a process output,
10668 the cdr part is used for encoding a text to be sent to a process.  */);
10669   Vdefault_process_coding_system = Qnil;
10670
10671   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10672                doc: /*
10673 Table of extra Latin codes in the range 128..159 (inclusive).
10674 This is a vector of length 256.
10675 If Nth element is non-nil, the existence of code N in a file
10676 \(or output of subprocess) doesn't prevent it to be detected as
10677 a coding system of ISO 2022 variant which has a flag
10678 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10679 or reading output of a subprocess.
10680 Only 128th through 159th elements have a meaning.  */);
10681   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10682
10683   DEFVAR_LISP ("select-safe-coding-system-function",
10684                Vselect_safe_coding_system_function,
10685                doc: /*
10686 Function to call to select safe coding system for encoding a text.
10687
10688 If set, this function is called to force a user to select a proper
10689 coding system which can encode the text in the case that a default
10690 coding system used in each operation can't encode the text.  The
10691 function should take care that the buffer is not modified while
10692 the coding system is being selected.
10693
10694 The default value is `select-safe-coding-system' (which see).  */);
10695   Vselect_safe_coding_system_function = Qnil;
10696
10697   DEFVAR_BOOL ("coding-system-require-warning",
10698                coding_system_require_warning,
10699                doc: /* Internal use only.
10700 If non-nil, on writing a file, `select-safe-coding-system-function' is
10701 called even if `coding-system-for-write' is non-nil.  The command
10702 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10703   coding_system_require_warning = 0;
10704
10705
10706   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10707                inhibit_iso_escape_detection,
10708                doc: /*
10709 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10710
10711 When Emacs reads text, it tries to detect how the text is encoded.
10712 This code detection is sensitive to escape sequences.  If Emacs sees
10713 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10714 of the ISO2022 encodings, and decodes text by the corresponding coding
10715 system (e.g. `iso-2022-7bit').
10716
10717 However, there may be a case that you want to read escape sequences in
10718 a file as is.  In such a case, you can set this variable to non-nil.
10719 Then the code detection will ignore any escape sequences, and no text is
10720 detected as encoded in some ISO-2022 encoding.  The result is that all
10721 escape sequences become visible in a buffer.
10722
10723 The default value is nil, and it is strongly recommended not to change
10724 it.  That is because many Emacs Lisp source files that contain
10725 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10726 in Emacs's distribution, and they won't be decoded correctly on
10727 reading if you suppress escape sequence detection.
10728
10729 The other way to read escape sequences in a file without decoding is
10730 to explicitly specify some coding system that doesn't use ISO-2022
10731 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
10732   inhibit_iso_escape_detection = 0;
10733
10734   DEFVAR_BOOL ("inhibit-null-byte-detection",
10735                inhibit_null_byte_detection,
10736                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10737 By default, Emacs treats it as binary data, and does not attempt to
10738 decode it.  The effect is as if you specified `no-conversion' for
10739 reading that text.
10740
10741 Set this to non-nil when a regular text happens to include null bytes.
10742 Examples are Index nodes of Info files and null-byte delimited output
10743 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10744 decode text as usual.  */);
10745   inhibit_null_byte_detection = 0;
10746
10747   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10748                doc: /* Char table for translating self-inserting characters.
10749 This is applied to the result of input methods, not their input.
10750 See also `keyboard-translate-table'.
10751
10752 Use of this variable for character code unification was rendered
10753 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10754 internal character representation.  */);
10755     Vtranslation_table_for_input = Qnil;
10756
10757   {
10758     Lisp_Object args[coding_arg_max];
10759     Lisp_Object plist[16];
10760     int i;
10761
10762     for (i = 0; i < coding_arg_max; i++)
10763       args[i] = Qnil;
10764
10765     plist[0] = intern_c_string (":name");
10766     plist[1] = args[coding_arg_name] = Qno_conversion;
10767     plist[2] = intern_c_string (":mnemonic");
10768     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10769     plist[4] = intern_c_string (":coding-type");
10770     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10771     plist[6] = intern_c_string (":ascii-compatible-p");
10772     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10773     plist[8] = intern_c_string (":default-char");
10774     plist[9] = args[coding_arg_default_char] = make_number (0);
10775     plist[10] = intern_c_string (":for-unibyte");
10776     plist[11] = args[coding_arg_for_unibyte] = Qt;
10777     plist[12] = intern_c_string (":docstring");
10778     plist[13] = build_pure_c_string ("Do no conversion.\n\
10779 \n\
10780 When you visit a file with this coding, the file is read into a\n\
10781 unibyte buffer as is, thus each byte of a file is treated as a\n\
10782 character.");
10783     plist[14] = intern_c_string (":eol-type");
10784     plist[15] = args[coding_arg_eol_type] = Qunix;
10785     args[coding_arg_plist] = Flist (16, plist);
10786     Fdefine_coding_system_internal (coding_arg_max, args);
10787
10788     plist[1] = args[coding_arg_name] = Qundecided;
10789     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10790     plist[5] = args[coding_arg_coding_type] = Qundecided;
10791     /* This is already set.
10792        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10793     plist[8] = intern_c_string (":charset-list");
10794     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10795     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10796     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10797     plist[15] = args[coding_arg_eol_type] = Qnil;
10798     args[coding_arg_plist] = Flist (16, plist);
10799     Fdefine_coding_system_internal (coding_arg_max, args);
10800   }
10801
10802   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10803
10804   {
10805     int i;
10806
10807     for (i = 0; i < coding_category_max; i++)
10808       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10809   }
10810 #if defined (DOS_NT)
10811   system_eol_type = Qdos;
10812 #else
10813   system_eol_type = Qunix;
10814 #endif
10815   staticpro (&system_eol_type);
10816 }
10817
10818 /* Return strerror's text for ERROR_NUMBER, obtained right after
10819    synchronize_system_messages_locale runs.  If Vlocale_coding_system
10820    is non-nil, return the data of the Lisp string made from that text by
10821    code_convert_string_norecord.  NOTE(review): confirm its lifetime.  */
10822 char *
10823 emacs_strerror (int error_number)
10824 {
10825   Lisp_Object converted;
10826   char *message;
10827
10828   synchronize_system_messages_locale ();
10829   message = strerror (error_number);
10830   if (NILP (Vlocale_coding_system))
10831     return message;
10832   converted = code_convert_string_norecord (build_string (message),
10833                                             Vlocale_coding_system, 0);
10834   return SSDATA (converted);
10835 }
10836
10837 #endif /* emacs */