src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2015 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How a text end-of-line is encoded depends on the operating system.
 123   For instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is a two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;        /* Set once a byte strongly suggests XXX.  */
 164   ...;                  /* E.g. `c' and `src_base' used by ONE_MORE_BYTE.  */
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return false;
 180
 181  no_more_source:
 182   /* The source was exhausted without meeting an invalid byte.  */
 183   detect_info->found |= found;
 184   return true;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size.
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source text, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it, storing the result at *CHARBUF.  */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that the source area and the destination area
 255   overlap, which means that we can produce encoded text until it
 256   reaches the head of the not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;       /* Next character to encode.  */
 265   int *charbuf_end = charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST and PRODUCED_CHARS.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding systems emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* NOTE(review): stale comment -- it announced three variables handing
 355    coding systems between Lisp and C, but only one is defined here.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372 /* Integer at index REG of CODING's coding_attr_iso_initial attribute.  */
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378 /* safe_charsets[CHARSET_ID] of CODING; -1 if it is 255 or ID > max id.  */
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386 /* Accessors for the members of CODING->spec.iso_2022.  */
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* Every 1-byte code of ISO2022 is classified into one of the
 417    following classes.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 4 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overrides
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 497
 498 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 499
 500 /* A character to be produced on output if encoding of the original
 501    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
 504 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 505 #define CODING_UTF_8_BOM(coding)        \
 506   ((coding)->spec.utf_8_bom)
 507
 508 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 509 #define CODING_UTF_16_BOM(coding)       \
 510   ((coding)->spec.utf_16.bom)
 511
 512 #define CODING_UTF_16_ENDIAN(coding)    \
 513   ((coding)->spec.utf_16.endian)
 514
 515 #define CODING_UTF_16_SURROGATE(coding) \
 516   ((coding)->spec.utf_16.surrogate)
 517
 518
 519 /* CCL section: CCL slots of the attributes found via CODING->id.  */
 520 #define CODING_CCL_DECODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 522 #define CODING_CCL_ENCODER(coding)      \
 523   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 524 #define CODING_CCL_VALIDS(coding)                                          \
 525   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
 527 /* Index for each coding category in `coding_categories'.  */
 528
 529 enum coding_category
 530   {
 531     coding_category_iso_7,
 532     coding_category_iso_7_tight,
 533     coding_category_iso_8_1,
 534     coding_category_iso_8_2,
 535     coding_category_iso_7_else,
 536     coding_category_iso_8_else,
 537     coding_category_utf_8_auto,
 538     coding_category_utf_8_nosig,
 539     coding_category_utf_8_sig,
 540     coding_category_utf_16_auto,
 541     coding_category_utf_16_be,
 542     coding_category_utf_16_le,
 543     coding_category_utf_16_be_nosig,
 544     coding_category_utf_16_le_nosig,
 545     coding_category_charset,
 546     coding_category_sjis,
 547     coding_category_big5,
 548     coding_category_ccl,
 549     coding_category_emacs_mule,
 550     /* All above are targets of code detection.  */
 551     coding_category_raw_text,
 552     coding_category_undecided,
 553     coding_category_max         /* Count; sizes the per-category arrays.  */
 554   };
 555
 556 /* Definitions of flag bits used in detect_coding_XXXX.  */
 557 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 558 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 559 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 560 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 561 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 562 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 563 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 564 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 565 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 566 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 567 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 568 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 569 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 570 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 571 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 572 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 573 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 574 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 575 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 576 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 577
 578 /* This value is returned if detect_coding_mask () finds nothing other
 579    than ASCII characters.  It does not include CATEGORY_MASK_RAW_TEXT.  */
 580 #define CATEGORY_MASK_ANY               \
 581   (CATEGORY_MASK_ISO_7                  \
 582    | CATEGORY_MASK_ISO_7_TIGHT          \
 583    | CATEGORY_MASK_ISO_8_1              \
 584    | CATEGORY_MASK_ISO_8_2              \
 585    | CATEGORY_MASK_ISO_7_ELSE           \
 586    | CATEGORY_MASK_ISO_8_ELSE           \
 587    | CATEGORY_MASK_UTF_8_AUTO           \
 588    | CATEGORY_MASK_UTF_8_NOSIG          \
 589    | CATEGORY_MASK_UTF_8_SIG            \
 590    | CATEGORY_MASK_UTF_16_AUTO          \
 591    | CATEGORY_MASK_UTF_16_BE            \
 592    | CATEGORY_MASK_UTF_16_LE            \
 593    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 594    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 595    | CATEGORY_MASK_CHARSET              \
 596    | CATEGORY_MASK_SJIS                 \
 597    | CATEGORY_MASK_BIG5                 \
 598    | CATEGORY_MASK_CCL                  \
 599    | CATEGORY_MASK_EMACS_MULE)
 600
 601
 602 #define CATEGORY_MASK_ISO_7BIT \
 603   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 604
 605 #define CATEGORY_MASK_ISO_8BIT \
 606   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 607
 608 #define CATEGORY_MASK_ISO_ELSE \
 609   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO_ESCAPE        \
 612   (CATEGORY_MASK_ISO_7                  \
 613    | CATEGORY_MASK_ISO_7_TIGHT          \
 614    | CATEGORY_MASK_ISO_7_ELSE           \
 615    | CATEGORY_MASK_ISO_8_ELSE)
 616
 617 #define CATEGORY_MASK_ISO       \
 618   (  CATEGORY_MASK_ISO_7BIT     \
 619      | CATEGORY_MASK_ISO_8BIT   \
 620      | CATEGORY_MASK_ISO_ELSE)
 621
 622 #define CATEGORY_MASK_UTF_16            \
 623   (CATEGORY_MASK_UTF_16_AUTO            \
 624    | CATEGORY_MASK_UTF_16_BE            \
 625    | CATEGORY_MASK_UTF_16_LE            \
 626    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 627    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 628
 629 #define CATEGORY_MASK_UTF_8     \
 630   (CATEGORY_MASK_UTF_8_AUTO     \
 631    | CATEGORY_MASK_UTF_8_NOSIG  \
 632    | CATEGORY_MASK_UTF_8_SIG)
 633
 634 /* Table of coding categories (Lisp symbols).  This variable is for
 635    internal use only.  */
 636 static Lisp_Object Vcoding_category_table;
 637
 638 /* Table of coding-categories ordered by priority.  */
 639 static enum coding_category coding_priorities[coding_category_max];
 640
 641 /* Nth element is a coding context for the coding system bound to the
 642    Nth coding category.  */
 643 static struct coding_system coding_categories[coding_category_max];
 644
 645 /* Map FLAG to an int: nil => -1, t => 1, any other value => 0.  */
 646
 647 static int
 648 encode_inhibit_flag (Lisp_Object flag)
 649 {
 650   return EQ (flag, Qt) ? 1 : NILP (flag) ? -1 : 0;
 651 }
 652
 653 /* Decide whether a flag encoded as ENCODED_FLAG counts as set: 1 is an
 654    unconditional yes, -1 an unconditional no, and 0 defers to VAR.  */
 655
 656 static bool
 657 inhibit_flag (int encoded_flag, bool var)
 658 {
 659   return encoded_flag > 0 || (encoded_flag == 0 && var);
 660 }
 661 /* Set ATTRS from CODING's id, and CHARSET_LIST from ATTRS.  */
 662 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 663   do {                                                  \
 664     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 665     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 666   } while (0)
 667 /* Run CHECK_NATNUM on the car of X and store the checked value back.  */
 668 static void
 669 CHECK_NATNUM_CAR (Lisp_Object x)
 670 {
 671   Lisp_Object car = XCAR (x);
 672   CHECK_NATNUM (car);
 673   XSETCAR (x, car);
 674 }
 675 /* Run CHECK_NATNUM on the cdr of X and store the checked value back.  */
 676 static void
 677 CHECK_NATNUM_CDR (Lisp_Object x)
 678 {
 679   Lisp_Object cdr = XCDR (x);
 680   CHECK_NATNUM (cdr);
 681   XSETCDR (x, cdr);
 682 }
 683
 684 /* True if CODING->dst_object is a buffer or a string.  */
 685
 686 static bool
 687 growable_destination (struct coding_system *coding)
 688 {
 689   return BUFFERP (coding->dst_object) || STRINGP (coding->dst_object);
 690 }
 691
 692
 693 /* Safely get one byte from the source text pointed by SRC which ends
 694    at SRC_END, set C to that byte, and increment CONSUMED_CHARS.  If
 695    the source is exhausted, jump to 'no_more_source' (recording
 696    CODING_RESULT_INSUFFICIENT_SRC first if SRC_BASE < SRC).  If
 697    MULTIBYTEP and a multibyte character is found at SRC, set C to the
 698    negative value of the character code.  The caller should declare:
 699         src, src_end, src_base, multibytep, consumed_chars, coding */
 700
 701 #define ONE_MORE_BYTE(c)                                \
 702   do {                                                  \
 703     if (src == src_end)         /* source exhausted */  \
 704       {                                                 \
 705         if (src_base < src)                             \
 706           record_conversion_result                      \
 707             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 708         goto no_more_source;                            \
 709       }                                                 \
 710     c = *src++;                                         \
 711     if (multibytep && (c & 0x80))                       \
 712       {                                                 \
 713         if ((c & 0xFE) == 0xC0) /* lead 0xC0 or 0xC1 */ \
 714           c = ((c & 1) << 6) | *src++;                  \
 715         else                                            \
 716           {                                             \
 717             src--;              /* back to lead byte */ \
 718             c = - string_char (src, &src, NULL);        \
 719             record_conversion_result                    \
 720               (coding, CODING_RESULT_INVALID_SRC);      \
 721           }                                             \
 722       }                                                 \
 723     consumed_chars++;                                   \
 724   } while (0)
 725
 726 /* Safely get two bytes from the source text pointed by SRC which ends
 727    at SRC_END, and set C1 and C2 to those bytes while skipping the
 728    heading multibyte characters.  If there are not enough bytes in the
 729    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 730    a multibyte character is found for C2, set C2 to -1 (SRC is then
 731    left just past that character's first byte).  The caller should
 732    declare and set these variables appropriately in advance:
 733         src, src_end, multibytep
 734    It is intended that this macro is used in detect_coding_utf_16.  */
 735
 736 #define TWO_MORE_BYTES(c1, c2)                          \
 737   do {                                                  \
 738     do {                                                \
 739       if (src == src_end)                               \
 740         goto no_more_source;                            \
 741       c1 = *src++;                                      \
 742       if (multibytep && (c1 & 0x80))                    \
 743         {                                               \
 744           if ((c1 & 0xFE) == 0xC0)                      \
 745             c1 = ((c1 & 1) << 6) | *src++;              \
 746           else                                          \
 747             {                                           \
 748               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 749               c1 = -1;          /* skipped; retry */    \
 750             }                                           \
 751         }                                               \
 752     } while (c1 < 0);                                   \
 753     if (src == src_end)                                 \
 754       goto no_more_source;                              \
 755     c2 = *src++;                                        \
 756     if (multibytep && (c2 & 0x80))                      \
 757       {                                                 \
 758         if ((c2 & 0xFE) == 0xC0)                        \
 759           c2 = ((c2 & 1) << 6) | *src++;                \
 760         else                                            \
 761           c2 = -1;                                      \
 762       }                                                 \
 763   } while (0)
 764
 765
 766 /* Store a byte C in the place pointed by DST and increment DST to the
 767    next free point, and increment PRODUCED_CHARS.  The caller should
 768    assure that C is 0..127, and declare and set the variables `dst'
 769    and `produced_chars' appropriately in advance.
 770 */
 771
 772
 773 #define EMIT_ONE_ASCII_BYTE(c)  \
 774   do {                          \
 775     produced_chars++;           \
 776     *dst++ = (c);               \
 777   } while (0)
 778
 779
 780 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 781
 782 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 783   do {                                  \
 784     produced_chars += 2;                \
 785     *dst++ = (c1), *dst++ = (c2);       \
 786   } while (0)
 787
 788
 789 /* Store a byte C in the place pointed by DST and increment DST to the
 790    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 791    store in an appropriate multibyte form.  The caller should
 792    declare and set the variables `dst' and `multibytep' appropriately
 793    in advance.  */
 794
 795 #define EMIT_ONE_BYTE(c)                \
 796   do {                                  \
 797     produced_chars++;                   \
 798     if (multibytep)                     \
 799       {                                 \
 800         unsigned ch = (c);              \
 801         if (ch >= 0x80)                 \
 802           ch = BYTE8_TO_CHAR (ch);      \
 803         CHAR_STRING_ADVANCE (ch, dst);  \
 804       }                                 \
 805     else                                \
 806       *dst++ = (c);                     \
 807   } while (0)
 808
 809
 810 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 811
 812 #define EMIT_TWO_BYTES(c1, c2)          \
 813   do {                                  \
 814     produced_chars += 2;                \
 815     if (multibytep)                     \
 816       {                                 \
 817         unsigned ch;                    \
 818                                         \
 819         ch = (c1);                      \
 820         if (ch >= 0x80)                 \
 821           ch = BYTE8_TO_CHAR (ch);      \
 822         CHAR_STRING_ADVANCE (ch, dst);  \
 823         ch = (c2);                      \
 824         if (ch >= 0x80)                 \
 825           ch = BYTE8_TO_CHAR (ch);      \
 826         CHAR_STRING_ADVANCE (ch, dst);  \
 827       }                                 \
 828     else                                \
 829       {                                 \
 830         *dst++ = (c1);                  \
 831         *dst++ = (c2);                  \
 832       }                                 \
 833   } while (0)
 834
 835
 836 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 837   do {                                  \
 838     EMIT_ONE_BYTE (c1);                 \
 839     EMIT_TWO_BYTES (c2, c3);            \
 840   } while (0)
 841
 842
 843 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 844   do {                                          \
 845     EMIT_TWO_BYTES (c1, c2);                    \
 846     EMIT_TWO_BYTES (c3, c4);                    \
 847   } while (0)
 848
 849
 850 static void
 851 record_conversion_result (struct coding_system *coding,
 852                           enum coding_result_code result)
 853 {
 854   coding->result = result;
 855   switch (result)
 856     {
 857     case CODING_RESULT_INSUFFICIENT_SRC:
 858       Vlast_code_conversion_error = Qinsufficient_source;
 859       break;
 860     case CODING_RESULT_INVALID_SRC:
 861       Vlast_code_conversion_error = Qinvalid_source;
 862       break;
 863     case CODING_RESULT_INTERRUPT:
 864       Vlast_code_conversion_error = Qinterrupted;
 865       break;
 866     case CODING_RESULT_INSUFFICIENT_DST:
 867       /* Don't record this error in Vlast_code_conversion_error
 868          because it happens just temporarily and is resolved when the
 869          whole conversion is finished.  */
 870       break;
 871     case CODING_RESULT_SUCCESS:
 872       break;
 873     default:
 874       Vlast_code_conversion_error = intern ("Unknown error");
 875     }
 876 }
 877
 878 /* These wrapper macros are used to preserve validity of pointers into
 879    buffer text across calls to decode_char, encode_char, etc, which
 880    could cause relocation of buffers if it loads a charset map,
 881    because loading a charset map allocates large structures.  */
 882
 883 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 884   do {                                                                       \
 885     ptrdiff_t offset;                                                        \
 886                                                                              \
 887     charset_map_loaded = 0;                                                  \
 888     c = DECODE_CHAR (charset, code);                                         \
 889     if (charset_map_loaded                                                   \
 890         && (offset = coding_change_source (coding)))                         \
 891       {                                                                      \
 892         src += offset;                                                       \
 893         src_base += offset;                                                  \
 894         src_end += offset;                                                   \
 895       }                                                                      \
 896   } while (0)
 897
 898 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 899   do {                                                                  \
 900     ptrdiff_t offset;                                                   \
 901                                                                         \
 902     charset_map_loaded = 0;                                             \
 903     code = ENCODE_CHAR (charset, c);                                    \
 904     if (charset_map_loaded                                              \
 905         && (offset = coding_change_destination (coding)))               \
 906       {                                                                 \
 907         dst += offset;                                                  \
 908         dst_end += offset;                                              \
 909       }                                                                 \
 910   } while (0)
 911
 912 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 913   do {                                                                  \
 914     ptrdiff_t offset;                                                   \
 915                                                                         \
 916     charset_map_loaded = 0;                                             \
 917     charset = char_charset (c, charset_list, code_return);              \
 918     if (charset_map_loaded                                              \
 919         && (offset = coding_change_destination (coding)))               \
 920       {                                                                 \
 921         dst += offset;                                                  \
 922         dst_end += offset;                                              \
 923       }                                                                 \
 924   } while (0)
 925
 926 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 927   do {                                                                  \
 928     ptrdiff_t offset;                                                   \
 929                                                                         \
 930     charset_map_loaded = 0;                                             \
 931     result = CHAR_CHARSET_P (c, charset);                               \
 932     if (charset_map_loaded                                              \
 933         && (offset = coding_change_destination (coding)))               \
 934       {                                                                 \
 935         dst += offset;                                                  \
 936         dst_end += offset;                                              \
 937       }                                                                 \
 938   } while (0)
 939
 940
 941 /* If there are NOT at least BYTES length of room at dst (the test is
 942    dst + BYTES >= dst_end, so exactly BYTES of room also triggers it),
 943    allocate memory for coding->destination through alloc_destination
 944    and update dst and dst_end.  The amount requested is BYTES plus
        the number of elements between charbuf and charbuf_end.  We don't
        have to take care of coding->source which will be relocated.  It
        is handled by calling coding_set_source in encode_coding.  The
        caller must have `coding', `dst', `dst_end', `charbuf' and
        `charbuf_end' in scope.  */
 945
 946 #define ASSURE_DESTINATION(bytes)                               \
 947   do {                                                          \
 948     if (dst + (bytes) >= dst_end)                               \
 949       {                                                         \
 950         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 951                                                                 \
 952         dst = alloc_destination (coding, more_bytes, dst);      \
 953         dst_end = coding->destination + coding->dst_bytes;      \
 954       }                                                         \
 955   } while (0)
 956
 957
 958 /* Store multibyte form of the character C in P, and advance P to the
 959    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 960    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 961    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 962
 963 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 964
 965 /* Return the character code of character whose multibyte form is at
 966    P, and advance P to the end of the multibyte form.  This used to be
 967    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 968    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 969
 970 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 971
 972 /* Set coding->source from coding->src_object.  */
 973
 974 static void
 975 coding_set_source (struct coding_system *coding)
 976 {
 977   if (BUFFERP (coding->src_object))
 978     {
 979       struct buffer *buf = XBUFFER (coding->src_object);
 980
 981       if (coding->src_pos < 0)
 982         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 983       else
 984         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 985     }
 986   else if (STRINGP (coding->src_object))
 987     {
 988       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 989     }
 990   else
 991     {
 992       /* Otherwise, the source is C string and is never relocated
 993          automatically.  Thus we don't have to update anything.  */
 994     }
 995 }
 996
 997
 998 /* Set coding->source from coding->src_object, and return how many
 999    bytes coding->source was changed.  */
1000
1001 static ptrdiff_t
1002 coding_change_source (struct coding_system *coding)
1003 {
1004   const unsigned char *orig = coding->source;
1005   coding_set_source (coding);
1006   return coding->source - orig;
1007 }
1008
1009
1010 /* Set coding->destination from coding->dst_object.  */
1011
1012 static void
1013 coding_set_destination (struct coding_system *coding)
1014 {
1015   if (BUFFERP (coding->dst_object))
1016     {
1017       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1018         {
1019           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1020           coding->dst_bytes = (GAP_END_ADDR
1021                                - (coding->src_bytes - coding->consumed)
1022                                - coding->destination);
1023         }
1024       else
1025         {
1026           /* We are sure that coding->dst_pos_byte is before the gap
1027              of the buffer. */
1028           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1029                                  + coding->dst_pos_byte - BEG_BYTE);
1030           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1031                                - coding->destination);
1032         }
1033     }
1034   else
1035     {
1036       /* Otherwise, the destination is C string and is never relocated
1037          automatically.  Thus we don't have to update anything.  */
1038     }
1039 }
1040
1041
1042 /* Set coding->destination from coding->dst_object, and return how
1043    many bytes coding->destination was changed.  */
1044
1045 static ptrdiff_t
1046 coding_change_destination (struct coding_system *coding)
1047 {
1048   const unsigned char *orig = coding->destination;
1049   coding_set_destination (coding);
1050   return coding->destination - orig;
1051 }
1052
1053
1054 static void
1055 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1056 {
1057   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1058     string_overflow ();
1059   coding->destination = xrealloc (coding->destination,
1060                                   coding->dst_bytes + bytes);
1061   coding->dst_bytes += bytes;
1062 }
1063
     /* Make BYTES more bytes of gap available for CODING's buffer
        destination.  GAP_HEAD_USED is the number of bytes at the head of
        the gap that already hold produced data.  When source and
        destination are the same object, the gap is first hidden (its
        size set to zero, with the text end positions extended over it)
        so that make_gap preserves its contents, and the bookkeeping is
        restored afterwards.  Otherwise make_gap_1 is called on the
        destination buffer.
        NOTE(review): the same-object branch uses GPT, GAP_SIZE, Z and ZV
        without naming a buffer; presumably they refer to the current
        buffer -- confirm against the callers.  */
1064 static void
1065 coding_alloc_by_making_gap (struct coding_system *coding,
1066                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1067 {
1068   if (EQ (coding->src_object, coding->dst_object))
1069     {
1070       /* The gap may contain the produced data at the head and not-yet
1071          consumed data at the tail.  To preserve those data, we at
1072          first make the gap size to zero, then increase the gap
1073          size.  */
1074       ptrdiff_t add = GAP_SIZE;
1075
           /* Step the gap start over the produced data, then fold the
              whole old gap (ADD bytes) into the text.  */
1076       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1077       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1078       make_gap (bytes);
           /* Undo the two adjustments above, in reverse order.  */
1079       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1080       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1081     }
1082   else
1083     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1084 }
1085
1086
1087 static unsigned char *
1088 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1089                    unsigned char *dst)
1090 {
1091   ptrdiff_t offset = dst - coding->destination;
1092
1093   if (BUFFERP (coding->dst_object))
1094     {
1095       struct buffer *buf = XBUFFER (coding->dst_object);
1096
1097       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1098     }
1099   else
1100     coding_alloc_by_realloc (coding, nbytes);
1101   coding_set_destination (coding);
1102   dst = coding->destination + offset;
1103   return dst;
1104 }
1105
1106 /** Macros for annotations.  */
1107
1108 /* An annotation data is stored in the array coding->charbuf in this
1109    format:
1110      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1111    LENGTH is the number of elements in the annotation.
1112    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1113    NCHARS is the number of characters in the text annotated.
1114
1115    The format of the following elements depends on ANNOTATION_MASK.
1116
1117    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1118    follow:
1119      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1120
1121    NBYTES is the number of bytes specified in the header part of
1122    old-style emacs-mule encoding, or 0 for the other kind of
1123    composition.
1124
1125    METHOD is one of enum composition_method.
1126
1127    Optional COMPOSITION-COMPONENTS are characters and composition
1128    rules.
1129
1130    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1131    follows.
1132
1133    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1134    recover from an invalid annotation, and should be skipped by
1135    produce_annotation.  */
1136
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Append the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and set coding->annotated.  BUF is
   evaluated three times, so it must be an lvalue without side
   effects.  A variable `coding' must be in scope.

   The expansion deliberately does not end in a semicolon: the caller
   supplies it, so that a call forms exactly one statement and can be
   used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1147
/* Append a composition annotation at BUF: the common header (length
   5, CODING_ANNOTATE_COMPOSITION_MASK, NCHARS) followed by NBYTES and
   METHOD.  BUF is advanced past the five stored elements.  BUF is
   evaluated several times, so it must be an lvalue without side
   effects.  Arguments are parenthesized in the expansion, matching
   ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1154
1155
/* Append a charset annotation at BUF: the common header (length 4,
   CODING_ANNOTATE_CHARSET_MASK, NCHARS) followed by the charset ID.
   BUF is advanced past the four stored elements.  BUF is evaluated
   several times, so it must be an lvalue without side effects.
   Arguments are parenthesized in the expansion, matching
   ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1161
1162
/* Bit flags accumulated in coding->eol_seen, one bit per end-of-line
   form; EOL_SEEN_NONE is the empty set.  */

#define EOL_SEEN_NONE   0
#define EOL_SEEN_LF     (1 << 0)
#define EOL_SEEN_CR     (1 << 1)
#define EOL_SEEN_CRLF   (1 << 2)
1169
1170 \f
1171 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1172
1173
1174
1175 \f
1176 /*** 3. UTF-8 ***/
1177
1178 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1179    Return true if a text is encoded in UTF-8.  */
1180
     /* Classify the octet C by its high-order bits:
          UTF_8_1_OCTET_P          0xxxxxxx  (C below 0x80)
          UTF_8_EXTRA_OCTET_P      10xxxxxx  (a trailing octet)
          UTF_8_2_OCTET_LEADING_P  110xxxxx
          UTF_8_3_OCTET_LEADING_P  1110xxxx
          UTF_8_4_OCTET_LEADING_P  11110xxx
          UTF_8_5_OCTET_LEADING_P  111110xx
        Each macro evaluates C once.  UTF_8_1_OCTET_P is a plain
        comparison, so a negative signed C satisfies it as well.  */
1181 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1182 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1183 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1184 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1185 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1186 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1187
     /* The three octets of the UTF-8 byte order mark (signature), in
        order.  The detector and decoder compare the head of the source
        against them; the encoder emits them.  */
1188 #define UTF_8_BOM_1 0xEF
1189 #define UTF_8_BOM_2 0xBB
1190 #define UTF_8_BOM_3 0xBF
1191
1192 /* Unlike the other detect_coding_XXX, this function counts the number
1193    of characters and checks the EOL format.
        Return 0, with CATEGORY_MASK_UTF_8 added to DETECT_INFO->rejected,
        when the source holds a malformed sequence or, in the last block,
        ends in the middle of a sequence.  Otherwise update
        DETECT_INFO->found / ->rejected, record the counts in
        CODING->detected_utf8_bytes and CODING->detected_utf8_chars, and
        return 1.
        NOTE(review): ONE_MORE_BYTE is defined outside this excerpt; it
        presumably jumps to `no_more_source' at the end of the source and
        yields a negative value for a non-byte multibyte character --
        confirm.  */
1194
1195 static bool
1196 detect_coding_utf_8 (struct coding_system *coding,
1197                      struct coding_detection_info *detect_info)
1198 {
1199   const unsigned char *src = coding->source, *src_base;
1200   const unsigned char *src_end = coding->source + coding->src_bytes;
1201   bool multibytep = coding->src_multibyte;
1202   ptrdiff_t consumed_chars = 0;
1203   bool bom_found = 0;
1204   ptrdiff_t nchars = coding->head_ascii;
1205   int eol_seen = coding->eol_seen;
1206
1207   detect_info->checked |= CATEGORY_MASK_UTF_8;
1208   /* A coding system of this category is always ASCII compatible.  */
       /* So skip the first coding->head_ascii bytes; they are already
          counted in NCHARS.  */
1209   src += nchars;
1210
       /* Note the strict `<': the BOM is recognized only when at least
          one more byte follows it.  */
1211   if (src == coding->source     /* BOM should be at the head.  */
1212       && src + 3 < src_end      /* BOM is 3-byte long.  */
1213       && src[0] == UTF_8_BOM_1
1214       && src[1] == UTF_8_BOM_2
1215       && src[2] == UTF_8_BOM_3)
1216     {
1217       bom_found = 1;
1218       src += 3;
1219       nchars++;
1220     }
1221
       /* Validate one sequence per iteration.  SRC_BASE marks the start
          of the sequence being examined.  Leaving the loop with `break'
          means the sequence is malformed.  */
1222   while (1)
1223     {
1224       int c, c1, c2, c3, c4;
1225
1226       src_base = src;
1227       ONE_MORE_BYTE (c);
1228       if (c < 0 || UTF_8_1_OCTET_P (c))
1229         {
1230           nchars++;
1231           if (c == '\r')
1232             {
1233               if (src < src_end && *src == '\n')
1234                 {
                       /* CR LF counts as two characters.  */
1235                   eol_seen |= EOL_SEEN_CRLF;
1236                   src++;
1237                   nchars++;
1238                 }
1239               else
1240                 eol_seen |= EOL_SEEN_CR;
1241             }
1242           else if (c == '\n')
1243             eol_seen |= EOL_SEEN_LF;
1244           continue;
1245         }
           /* C is not a one-octet character: require one valid trailing
              octet per remaining position, accepting the sequence as
              soon as its length matches the lead octet.  */
1246       ONE_MORE_BYTE (c1);
1247       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1248         break;
1249       if (UTF_8_2_OCTET_LEADING_P (c))
1250         {
1251           nchars++;
1252           continue;
1253         }
1254       ONE_MORE_BYTE (c2);
1255       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1256         break;
1257       if (UTF_8_3_OCTET_LEADING_P (c))
1258         {
1259           nchars++;
1260           continue;
1261         }
1262       ONE_MORE_BYTE (c3);
1263       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1264         break;
1265       if (UTF_8_4_OCTET_LEADING_P (c))
1266         {
1267           nchars++;
1268           continue;
1269         }
1270       ONE_MORE_BYTE (c4);
1271       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1272         break;
1273       if (UTF_8_5_OCTET_LEADING_P (c))
1274         {
1275           nchars++;
1276           continue;
1277         }
1278       break;
1279     }
1280   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1281   return 0;
1282
1283  no_more_source:
       /* SRC_BASE < SRC means the source ended inside a sequence; that
          is an error only when no further block will follow.  */
1284   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1285     {
1286       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1287       return 0;
1288     }
1289   if (bom_found)
1290     {
1291       /* A leading EF BB BF doesn't necessarily mean a BOM, so the
              no-signature category is kept as found too.  */
1292       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1293     }
1294   else
1295     {
1296       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1297       if (nchars < src_end - coding->source)
1298         /* The found characters are less than source bytes, which
1299            means that we found a valid non-ASCII characters.  */
1300         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1301     }
       /* Only the bytes before SRC_BASE are reported as validated.
          NOTE(review): EOL_SEEN is accumulated above but never stored
          back into CODING within this function -- confirm that this is
          intended.  */
1302   coding->detected_utf8_bytes = src_base - coding->source;
1303   coding->detected_utf8_chars = nchars;
1304   return 1;
1305 }
1306
1307
     /* Decode UTF-8 source bytes of CODING, starting at
        CODING->source + CODING->consumed, into character codes appended
        to CODING->charbuf.  On return CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated; only
        input up to the start of the last incompletely handled sequence
        (SRC_BASE) is counted as consumed.  A byte that does not start a
        valid sequence is stored as itself when ASCII and as
        BYTE8_TO_CHAR of it otherwise.
        NOTE(review): ONE_MORE_BYTE is defined outside this excerpt; it
        presumably jumps to `no_more_source' at the end of the source,
        counts in `consumed_chars', and yields a negative value for a
        non-byte multibyte character -- confirm.  */
1308 static void
1309 decode_coding_utf_8 (struct coding_system *coding)
1310 {
1311   const unsigned char *src = coding->source + coding->consumed;
1312   const unsigned char *src_end = coding->source + coding->src_bytes;
1313   const unsigned char *src_base;
1314   int *charbuf = coding->charbuf + coding->charbuf_used;
1315   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1316   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1317   bool multibytep = coding->src_multibyte;
1318   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1319   bool eol_dos
1320     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The byte read ahead after a CR when EOL_DOS is set, or -1 when
          nothing is pending.  */
1321   int byte_after_cr = -1;
1322
       /* Unless the coding system is known to have no BOM, look for one
          at the head.  If the first three bytes are not exactly the
          BOM, rewind SRC so that they are decoded as ordinary text.  */
1323   if (bom != utf_without_bom)
1324     {
1325       int c1, c2, c3;
1326
1327       src_base = src;
1328       ONE_MORE_BYTE (c1);
1329       if (! UTF_8_3_OCTET_LEADING_P (c1))
1330         src = src_base;
1331       else
1332         {
1333           ONE_MORE_BYTE (c2);
1334           if (! UTF_8_EXTRA_OCTET_P (c2))
1335             src = src_base;
1336           else
1337             {
1338               ONE_MORE_BYTE (c3);
1339               if (! UTF_8_EXTRA_OCTET_P (c3))
1340                 src = src_base;
1341               else
1342                 {
1343                   if ((c1 != UTF_8_BOM_1)
1344                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1345                     src = src_base;
1346                   else
1347                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1348                 }
1349             }
1350         }
1351     }
       /* Whatever was found, the BOM is not looked for again; this makes
          the assignment in the innermost `else' above redundant.  */
1352   CODING_UTF_8_BOM (coding) = utf_without_bom;
1353
1354   while (1)
1355     {
1356       int c, c1, c2, c3, c4, c5;
1357
           /* Remember where this character starts, so that an invalid or
              truncated sequence can be undone.  */
1358       src_base = src;
1359       consumed_chars_base = consumed_chars;
1360
1361       if (charbuf >= charbuf_end)
1362         {
               /* A byte read ahead after CR has not been decoded yet;
                  leave it unconsumed.  */
1363           if (byte_after_cr >= 0)
1364             src_base--;
1365           break;
1366         }
1367
1368       /* In the simple case, rapidly handle ordinary characters */
           /* That is: copy bytes without the high bit straight into
              CHARBUF, four per iteration, while more than six bytes of
              source and six elements of CHARBUF remain.  */
1369       if (multibytep && ! eol_dos
1370           && charbuf < charbuf_end - 6 && src < src_end - 6)
1371         {
1372           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1373             {
1374               c1 = *src;
1375               if (c1 & 0x80)
1376                 break;
1377               src++;
1378               consumed_chars++;
1379               *charbuf++ = c1;
1380
1381               c1 = *src;
1382               if (c1 & 0x80)
1383                 break;
1384               src++;
1385               consumed_chars++;
1386               *charbuf++ = c1;
1387
1388               c1 = *src;
1389               if (c1 & 0x80)
1390                 break;
1391               src++;
1392               consumed_chars++;
1393               *charbuf++ = c1;
1394
1395               c1 = *src;
1396               if (c1 & 0x80)
1397                 break;
1398               src++;
1399               consumed_chars++;
1400               *charbuf++ = c1;
1401             }
1402           /* If we handled at least one character, restart the main loop.  */
1403           if (src != src_base)
1404             continue;
1405         }
1406
           /* Take the byte read ahead after a CR first, if there is
              one.  */
1407       if (byte_after_cr >= 0)
1408         c1 = byte_after_cr, byte_after_cr = -1;
1409       else
1410         ONE_MORE_BYTE (c1);
1411       if (c1 < 0)
1412         {
               /* A negative value is stored negated.  */
1413           c = - c1;
1414         }
1415       else if (UTF_8_1_OCTET_P (c1))
1416         {
               /* With DOS end-of-line conversion, read one byte ahead
                  after a CR.  */
1417           if (eol_dos && c1 == '\r')
1418             ONE_MORE_BYTE (byte_after_cr);
1419           c = c1;
1420         }
1421       else
1422         {
               /* Multi-octet sequence: read trailing octets one by one
                  until the length announced by C1 is reached.  */
1423           ONE_MORE_BYTE (c2);
1424           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1425             goto invalid_code;
1426           if (UTF_8_2_OCTET_LEADING_P (c1))
1427             {
1428               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1429               /* Reject overlong sequences here and below.  Encoders
1430                  producing them are incorrect, they can be misleading,
1431                  and they mess up read/write invariance.  */
1432               if (c < 128)
1433                 goto invalid_code;
1434             }
1435           else
1436             {
1437               ONE_MORE_BYTE (c3);
1438               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1439                 goto invalid_code;
1440               if (UTF_8_3_OCTET_LEADING_P (c1))
1441                 {
1442                   c = (((c1 & 0xF) << 12)
1443                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1444                   if (c < 0x800
1445                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1446                     goto invalid_code;
1447                 }
1448               else
1449                 {
1450                   ONE_MORE_BYTE (c4);
1451                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1452                     goto invalid_code;
1453                   if (UTF_8_4_OCTET_LEADING_P (c1))
1454                     {
1455                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1456                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1457                     if (c < 0x10000)
1458                       goto invalid_code;
1459                     }
1460                   else
1461                     {
1462                       ONE_MORE_BYTE (c5);
1463                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1464                         goto invalid_code;
1465                       if (UTF_8_5_OCTET_LEADING_P (c1))
1466                         {
1467                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1468                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1469                                | (c5 & 0x3F));
                               /* Besides overlong forms, reject codes
                                  above MAX_CHAR.  */
1470                           if ((c > MAX_CHAR) || (c < 0x200000))
1471                             goto invalid_code;
1472                         }
1473                       else
1474                         goto invalid_code;
1475                     }
1476                 }
1477             }
1478         }
1479
1480       *charbuf++ = c;
1481       continue;
1482
1483     invalid_code:
           /* Rewind to the start of the offending sequence and store only
              its first byte; the bytes after it are examined again by
              the next iteration.  */
1484       src = src_base;
1485       consumed_chars = consumed_chars_base;
1486       ONE_MORE_BYTE (c);
1487       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
1488     }
1489
1490  no_more_source:
       /* Report progress only up to the start of the last sequence that
          was not completely handled.  */
1491   coding->consumed_char += consumed_chars_base;
1492   coding->consumed = src_base - coding->source;
1493   coding->charbuf_used = charbuf - coding->charbuf;
1494 }
1495
1496
     /* Encode the CODING->charbuf_used character codes held in
        CODING->charbuf as UTF-8, appending the result to the destination
        at CODING->destination + CODING->produced.  A byte order mark is
        written first when the coding system asks for one.  Characters
        for which CHAR_BYTE8_P holds are written as the single byte
        CHAR_TO_BYTE8 gives.  Records CODING_RESULT_SUCCESS, updates
        CODING->produced and CODING->produced_char, and always returns
        0.  */
1497 static bool
1498 encode_coding_utf_8 (struct coding_system *coding)
1499 {
1500   bool multibytep = coding->dst_multibyte;
1501   int *charbuf = coding->charbuf;
1502   int *charbuf_end = charbuf + coding->charbuf_used;
1503   unsigned char *dst = coding->destination + coding->produced;
1504   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1505   ptrdiff_t produced_chars = 0;
1506   int c;
1507
       /* Emit the BOM once, then clear the flag so that it is not
          emitted again for this coding system.  */
1508   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1509     {
1510       ASSURE_DESTINATION (3);
1511       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1512       CODING_UTF_8_BOM (coding) = utf_without_bom;
1513     }
1514
1515   if (multibytep)
1516     {
           /* Multibyte destination: every output byte goes through
              EMIT_ONE_BYTE, which stores it in multibyte form and counts
              it as one produced character.
              NOTE(review): the doubled safe room presumably allows for
              each byte of a sequence taking two bytes in that form --
              confirm.  */
1517       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1518
1519       while (charbuf < charbuf_end)
1520         {
1521           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1522
1523           ASSURE_DESTINATION (safe_room);
1524           c = *charbuf++;
1525           if (CHAR_BYTE8_P (c))
1526             {
1527               c = CHAR_TO_BYTE8 (c);
1528               EMIT_ONE_BYTE (c);
1529             }
1530           else
1531             {
                   /* Build the byte sequence in STR, then emit it byte
                      by byte.  */
1532               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1533               for (p = str; p < pend; p++)
1534                 EMIT_ONE_BYTE (*p);
1535             }
1536         }
1537     }
1538   else
1539     {
           /* Unibyte destination: write the bytes directly.  */
1540       int safe_room = MAX_MULTIBYTE_LENGTH;
1541
1542       while (charbuf < charbuf_end)
1543         {
1544           ASSURE_DESTINATION (safe_room);
1545           c = *charbuf++;
1546           if (CHAR_BYTE8_P (c))
1547             *dst++ = CHAR_TO_BYTE8 (c);
1548           else
1549             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1550         }
           /* Here the produced count is the number of bytes written by
              this call, including any BOM emitted above.  */
1551       produced_chars = dst - (coding->destination + coding->produced);
1552     }
1553   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1554   coding->produced_char += produced_chars;
1555   coding->produced = dst - coding->destination;
1556   return 0;
1557 }
1558
1559
1560 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1561    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1562
     /* Nonzero if bits 10..15 of VAL are those of a high surrogate, i.e.
        the low 16 bits of VAL lie in 0xD800..0xDBFF.  Bits above bit 15
        are not examined.  */
1563 #define UTF_16_HIGH_SURROGATE_P(val) \
1564   (((val) & 0xFC00) == 0xD800)
1565
     /* Likewise for a low surrogate, 0xDC00..0xDFFF.  */
1566 #define UTF_16_LOW_SURROGATE_P(val) \
1567   (((val) & 0xFC00) == 0xDC00)
1568
1569
     /* Check whether the source of CODING may be UTF-16 and update
        DETECT_INFO->checked, ->found and ->rejected accordingly.
        Return 1 when the first two bytes are a byte order mark or when
        fewer than two bytes are available; return 0 in every other
        case.  Without a byte order mark, the with-signature categories
        are rejected and the no-signature ones are judged by how many
        distinct byte values occur at even and at odd positions.  */
1570 static bool
1571 detect_coding_utf_16 (struct coding_system *coding,
1572                       struct coding_detection_info *detect_info)
1573 {
1574   const unsigned char *src = coding->source;
1575   const unsigned char *src_end = coding->source + coding->src_bytes;
1576   bool multibytep = coding->src_multibyte;
1577   int c1, c2;
1578
1579   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* A final block with an odd coding->src_chars is rejected
          outright.  */
1580   if (coding->mode & CODING_MODE_LAST_BLOCK
1581       && (coding->src_chars & 1))
1582     {
1583       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1584       return 0;
1585     }
1586
       /* TWO_MORE_BYTES jumps to `no_more_source' when two bytes are not
          available.  */
1587   TWO_MORE_BYTES (c1, c2);
1588   if ((c1 == 0xFF) && (c2 == 0xFE))
1589     {
           /* Bytes FF FE: little-endian with signature.  */
1590       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1591                              | CATEGORY_MASK_UTF_16_AUTO);
1592       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1593                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1594                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1595     }
1596   else if ((c1 == 0xFE) && (c2 == 0xFF))
1597     {
           /* Bytes FE FF: big-endian with signature.  */
1598       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1599                              | CATEGORY_MASK_UTF_16_AUTO);
1600       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1601                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1602                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1603     }
1604   else if (c2 < 0)
1605     {
           /* The second unit is not a byte value.  */
1606       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1607       return 0;
1608     }
1609   else
1610     {
1611       /* We check the dispersion of Eth and Oth bytes where E is even and
1612          O is odd.  If both are high, we assume binary data.*/
           /* E[b] / O[b] are set once byte value b has been seen at an
              even / odd position; E_NUM and O_NUM count the distinct
              values seen so far, starting with the pair just read.  */
1613       unsigned char e[256], o[256];
1614       unsigned e_num = 1, o_num = 1;
1615
1616       memset (e, 0, 256);
1617       memset (o, 0, 256);
1618       e[c1] = 1;
1619       o[c2] = 1;
1620
1621       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1622                                 |CATEGORY_MASK_UTF_16_BE
1623                                 | CATEGORY_MASK_UTF_16_LE);
1624
           /* Scan pairs until every UTF-16 category is rejected, the
              source ends (jump to `no_more_source'), or a non-byte
              second unit is met.  */
1625       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1626              != CATEGORY_MASK_UTF_16)
1627         {
1628           TWO_MORE_BYTES (c1, c2);
1629           if (c2 < 0)
1630             break;
1631           if (! e[c1])
1632             {
1633               e[c1] = 1;
1634               e_num++;
                   /* 128 or more distinct even-position bytes rule out
                      big-endian without signature.  */
1635               if (e_num >= 128)
1636                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1637             }
1638           if (! o[c2])
1639             {
1640               o[c2] = 1;
1641               o_num++;
                   /* 128 or more distinct odd-position bytes rule out
                      little-endian without signature.  */
1642               if (o_num >= 128)
1643                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1644             }
1645         }
1646       return 0;
1647     }
1648
1649  no_more_source:
1650   return 1;
1651 }
1652
     /* Decode UTF-16 bytes from CODING->source, starting at offset
        CODING->consumed, into code points appended to CODING->charbuf
        at index CODING->charbuf_used.

        When the BOM setting is utf_with_bom, the first two bytes are
        dropped if they form the byte-order mark for the configured
        endianness; otherwise they are re-read as ordinary data.  In
        that case and for utf_detect_bom the setting is then switched to
        utf_without_bom.

        A high surrogate waiting for its partner is kept in
        CODING_UTF_16_SURROGATE (coding), so it survives across calls.
        If the next unit is not a low surrogate, the pending surrogate is
        flushed as its two byte values and the state is either replaced
        (next unit is another high surrogate) or cleared.

        On exit CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated; bytes read since the start of
        the unfinished iteration are not counted as consumed.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
        uses the locals src, src_end, multibytep and consumed_chars and
        jumps to `no_more_source' when the input is exhausted -- confirm
        against its definition.  */
1653 static void
1654 decode_coding_utf_16 (struct coding_system *coding)
1655 {
1656   const unsigned char *src = coding->source + coding->consumed;
1657   const unsigned char *src_end = coding->source + coding->src_bytes;
1658   const unsigned char *src_base;
1659   int *charbuf = coding->charbuf + coding->charbuf_used;
1660   /* We may produce at most 3 chars in one loop.  */
1661   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1662   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1663   bool multibytep = coding->src_multibyte;
1664   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1665   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1666   int surrogate = CODING_UTF_16_SURROGATE (coding);
1667   bool eol_dos
1668     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Two bytes read ahead after a CR when EOL_DOS is set; -1 means
          no lookahead is pending.  */
1669   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1670
1671   if (bom == utf_with_bom)
1672     {
1673       int c, c1, c2;
1674
1675       src_base = src;
1676       ONE_MORE_BYTE (c1);
1677       ONE_MORE_BYTE (c2);
1678       c = (c1 << 8) | c2;
1679
1680       if (endian == utf_16_big_endian
1681           ? c != 0xFEFF : c != 0xFFFE)
1682         {
1683           /* The first two bytes are not BOM.  Treat them as bytes
1684              for a normal character.  */
1685           src = src_base;
1686         }
1687       CODING_UTF_16_BOM (coding) = utf_without_bom;
1688     }
1689   else if (bom == utf_detect_bom)
1690     {
1691       /* We have already tried to detect BOM and failed in
1692          detect_coding.  */
1693       CODING_UTF_16_BOM (coding) = utf_without_bom;
1694     }
1695
1696   while (1)
1697     {
1698       int c, c1, c2;
1699
1700       src_base = src;
1701       consumed_chars_base = consumed_chars;
1702
1703       if (charbuf >= charbuf_end)
1704         {
               /* The two lookahead bytes were already taken from SRC;
                  give them back so that they are read again next time.  */
1705           if (byte_after_cr1 >= 0)
1706             src_base -= 2;
1707           break;
1708         }
1709
1710       if (byte_after_cr1 >= 0)
1711         c1 = byte_after_cr1, byte_after_cr1 = -1;
1712       else
1713         ONE_MORE_BYTE (c1);
1714       if (c1 < 0)
1715         {
1716           *charbuf++ = -c1;
1717           continue;
1718         }
1719       if (byte_after_cr2 >= 0)
1720         c2 = byte_after_cr2, byte_after_cr2 = -1;
1721       else
1722         ONE_MORE_BYTE (c2);
1723       if (c2 < 0)
1724         {
1725           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1726           *charbuf++ = -c2;
1727           continue;
1728         }
1729       c = (endian == utf_16_big_endian
1730            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1731
1732       if (surrogate)
1733         {
1734           if (! UTF_16_LOW_SURROGATE_P (c))
1735             {
                   /* Unpaired high surrogate: flush its two bytes in
                      source order.  */
1736               if (endian == utf_16_big_endian)
1737                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1738               else
1739                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1740               *charbuf++ = c1;
1741               *charbuf++ = c2;
1742               if (UTF_16_HIGH_SURROGATE_P (c))
1743                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1744               else
                     {
                       /* The unpaired surrogate has been flushed above;
                          clear it so that it is not emitted again before
                          every following unit or wrongly combined with
                          a later low surrogate.  */
                       CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1745                   *charbuf++ = c;
                     }
1746             }
1747           else
1748             {
1749               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1750               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1751               *charbuf++ = 0x10000 + c;
1752             }
1753         }
1754       else
1755         {
1756           if (UTF_16_HIGH_SURROGATE_P (c))
1757             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1758           else
1759             {
1760               if (eol_dos && c == '\r')
1761                 {
                       /* Read the unit after the CR now.  If the source
                          ends here the CR itself stays unconsumed,
                          because SRC_BASE still points at it.  */
1762                   ONE_MORE_BYTE (byte_after_cr1);
1763                   ONE_MORE_BYTE (byte_after_cr2);
1764                 }
1765               *charbuf++ = c;
1766             }
1767         }
1768     }
1769
1770  no_more_source:
1771   coding->consumed_char += consumed_chars_base;
1772   coding->consumed = src_base - coding->source;
1773   coding->charbuf_used = charbuf - coding->charbuf;
1774 }
1775
     /* Encode the code points in CODING->charbuf (CODING->charbuf_used
        elements) as UTF-16 and append the bytes to CODING->destination
        at offset CODING->produced.

        Unless the BOM setting is utf_without_bom, a byte-order mark for
        the configured endianness is written first and the setting is
        switched to utf_without_bom.  Code points above MAX_UNICODE_CHAR
        are replaced by CODING->default_char.  Values below 0x10000 are
        written as one 16-bit unit, larger ones as a surrogate pair.

        Records CODING_RESULT_SUCCESS, updates CODING->produced and
        CODING->produced_char, and returns 0.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are
        defined elsewhere; they presumably use the locals dst, dst_end,
        multibytep and produced_chars, and ASSURE_DESTINATION presumably
        guarantees SAFE_ROOM free bytes or leaves the function --
        confirm against their definitions.  */
1776 static bool
1777 encode_coding_utf_16 (struct coding_system *coding)
1778 {
1779   bool multibytep = coding->dst_multibyte;
1780   int *charbuf = coding->charbuf;
1781   int *charbuf_end = charbuf + coding->charbuf_used;
1782   unsigned char *dst = coding->destination + coding->produced;
1783   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1784   int safe_room = 8;
1785   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1786   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1787   ptrdiff_t produced_chars = 0;
1788   int c;
1789
1790   if (bom != utf_without_bom)
1791     {
1792       ASSURE_DESTINATION (safe_room);
1793       if (big_endian)
1794         EMIT_TWO_BYTES (0xFE, 0xFF);
1795       else
1796         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Make sure the mark is written only once.  */
1797       CODING_UTF_16_BOM (coding) = utf_without_bom;
1798     }
1799
1800   while (charbuf < charbuf_end)
1801     {
1802       ASSURE_DESTINATION (safe_room);
1803       c = *charbuf++;
1804       if (c > MAX_UNICODE_CHAR)
1805         c = coding->default_char;
1806
1807       if (c < 0x10000)
1808         {
1809           if (big_endian)
1810             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1811           else
1812             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1813         }
1814       else
1815         {
1816           int c1, c2;
1817
               /* Split into a high surrogate C1 (upper 10 bits) and a
                  low surrogate C2 (lower 10 bits).  */
1818           c -= 0x10000;
1819           c1 = (c >> 10) + 0xD800;
1820           c2 = (c & 0x3FF) + 0xDC00;
1821           if (big_endian)
1822             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1823           else
1824             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1825         }
1826     }
1827   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1828   coding->produced = dst - coding->destination;
1829   coding->produced_char += produced_chars;
1830   return 0;
1831 }
1832
1833 \f
1834 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1835
1836 /* Emacs' internal format for representation of multiple character
1837    sets is a kind of multi-byte encoding, i.e. characters are
1838    represented by variable-length sequences of one-byte codes.
1839
1840    ASCII characters and control characters (e.g. `tab', `newline') are
1841    represented by one-byte sequences which are their ASCII codes, in
1842    the range 0x00 through 0x7F.
1843
1844    8-bit characters of the range 0x80..0x9F are represented by
1845    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1846    code + 0x20).
1847
1848    8-bit characters of the range 0xA0..0xFF are represented by
1849    one-byte sequences which are their 8-bit code.
1850
1851    The other characters are represented by a sequence of `base
1852    leading-code', optional `extended leading-code', and one or two
1853    `position-code's.  The length of the sequence is determined by the
1854    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1855    whereas extended leading-code and position-code take the range 0xA0
1856    through 0xFF.  See `charset.h' for more details about leading-code
1857    and position-code.
1858
1859    --- CODE RANGE of Emacs' internal format ---
1860    character set        range
1861    -------------        -----
1862    ascii                0x00..0x7F
1863    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1864    eight-bit-graphic    0xA0..0xFF
1865    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1866    ---------------------------------------------
1867
1868    As this is the internal character representation, the format is
1869    usually not used externally (i.e. in a file or in a data sent to a
1870    process).  But, it is possible to have a text externally in this
1871    format (i.e. by encoding by the coding system `emacs-mule').
1872
1873    In that case, a sequence of one-byte codes has a slightly different
1874    form.
1875
1876    At first, all characters in eight-bit-control are represented by
1877    one-byte sequences which are their 8-bit code.
1878
1879    Next, character composition data are represented by the byte
1880    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1881    where,
1882         METHOD is 0xF2 plus one of composition method (enum
1883         composition_method),
1884
1885         BYTES is 0xA0 plus a byte length of this composition data,
1886
1887         CHARS is 0xA0 plus a number of characters composed by this
1888         data,
1889
1890         COMPONENTs are characters of multibyte form or composition
1891         rules encoded by two-byte of ASCII codes.
1892
1893    In addition, for backward compatibility, the following formats are
1894    also recognized as composition data on decoding.
1895
1896    0x80 MSEQ ...
1897    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1898
1899    Here,
1900         MSEQ is a multibyte form but in these special format:
1901           ASCII: 0xA0 ASCII_CODE+0x80,
1902           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1903         RULE is a one byte code of the range 0xA0..0xF0 that
1904         represents a composition rule.
1905   */
1906
     /* Table indexed by a byte value.  The code in this file reads it
        with a leading byte as index and treats the entry as the total
        number of bytes of the emacs-mule sequence starting with that
        byte (emacs_mule_char handles the values 1 through 4 and aborts
        on anything else).  The table is filled in elsewhere.  */
1907 char emacs_mule_bytes[256];
1908
1909
1910 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1911    Return true if a text is encoded in 'emacs-mule'.
        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked,
        and the first CODING->head_ascii bytes of the source are skipped.
        The category is rejected (and 0 returned) when
        - a run started by 0x80 is not long enough (see below),
        - an ESC, SI or SO byte is seen,
        - a leading byte is not followed by the number of bytes >= 0xA0
          that emacs_mule_bytes announces, or
        - the source ends inside a sequence while CODING->mode has
          CODING_MODE_LAST_BLOCK set.
        Otherwise, when the source is exhausted, the category is added to
        DETECT_INFO->found if at least one multibyte sequence or
        composition was seen, and 1 is returned.
        NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
        uses the locals src, src_end, multibytep and consumed_chars and
        jumps to `no_more_source' at the end of the source -- confirm
        against its definition.  */
1912
1913 static bool
1914 detect_coding_emacs_mule (struct coding_system *coding,
1915                           struct coding_detection_info *detect_info)
1916 {
1917   const unsigned char *src = coding->source, *src_base;
1918   const unsigned char *src_end = coding->source + coding->src_bytes;
1919   bool multibytep = coding->src_multibyte;
1920   ptrdiff_t consumed_chars = 0;
1921   int c;
1922   int found = 0;
1923
1924   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1925   /* A coding system of this category is always ASCII compatible.  */
1926   src += coding->head_ascii;
1927
1928   while (1)
1929     {
           /* Remember where this sequence starts so that a truncated
              sequence can be recognized at `no_more_source'.  */
1930       src_base = src;
1931       ONE_MORE_BYTE (c);
1932       if (c < 0)
1933         continue;
1934       if (c == 0x80)
1935         {
1936           /* Perhaps the start of composite character.  We simply skip
1937              it because analyzing it is too heavy for detecting.  But,
1938              at least, we check that the composite character
1939              consists of more than 4 bytes.  */
1940           const unsigned char *src_start;
1941
1942         repeat:
1943           src_start = src;
1944           do
1945             {
1946               ONE_MORE_BYTE (c);
1947             }
1948           while (c >= 0xA0);
1949
               /* The distance includes the byte that ended the run.  */
1950           if (src - src_start <= 4)
1951             break;
1952           found = CATEGORY_MASK_EMACS_MULE;
1953           if (c == 0x80)
1954             goto repeat;
1955         }
1956
           /* C is now either the byte read at the top of the loop or the
              byte that ended a composition run.  */
1957       if (c < 0x80)
1958         {
1959           if (c < 0x20
1960               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1961             break;
1962         }
1963       else
1964         {
1965           int more_bytes = emacs_mule_bytes[c] - 1;
1966
1967           while (more_bytes > 0)
1968             {
1969               ONE_MORE_BYTE (c);
1970               if (c < 0xA0)
1971                 {
1972                   src--;        /* Unread the last byte.  */
1973                   break;
1974                 }
1975               more_bytes--;
1976             }
1977           if (more_bytes != 0)
1978             break;
1979           found = CATEGORY_MASK_EMACS_MULE;
1980         }
1981     }
       /* Every `break' out of the loop above means invalid data.  */
1982   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1983   return 0;
1984
1985  no_more_source:
1986   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1987     {
1988       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1989       return 0;
1990     }
1991   detect_info->found |= found;
1992   return 1;
1993 }
1994
1995
1996 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1997    character.  If CMP_STATUS indicates that we must expect MSEQ or
1998    RULE described above, decode it and return the negative value of
1999    the decoded character or rule.  If an invalid byte is found, return
2000    -1.  If SRC is too short, return -2.
        On success *NBYTES is set to the number of source bytes used and
        *NCHARS to the final value of the local consumed_chars counter.
        If ID is non-null, *ID receives the charset id of the decoded
        character; it is not set when a RULE byte is returned.  None of
        the output arguments is written when -1 or -2 is returned.
        NOTE(review): ONE_MORE_BYTE and CODING_DECODE_CHAR are defined
        elsewhere; ONE_MORE_BYTE presumably uses the locals src, src_end,
        multibytep and consumed_chars and jumps to `no_more_source' at
        the end of the source -- confirm against their definitions.  */
2001
2002 static int
2003 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2004                  int *nbytes, int *nchars, int *id,
2005                  struct composition_status *cmp_status)
2006 {
2007   const unsigned char *src_end = coding->source + coding->src_bytes;
2008   const unsigned char *src_base = src;
2009   bool multibytep = coding->src_multibyte;
2010   int charset_ID;
2011   unsigned code;
2012   int c;
2013   ptrdiff_t consumed_chars = 0;
2014   bool mseq_found = 0;
2015
2016   ONE_MORE_BYTE (c);
2017   if (c < 0)
2018     {
2019       c = -c;
2020       charset_ID = emacs_mule_charset[0];
2021     }
2022   else
2023     {
2024       if (c >= 0xA0)
2025         {
               /* A first byte >= 0xA0 is only accepted inside an
                  old-form composition.  */
2026           if (cmp_status->state != COMPOSING_NO
2027               && cmp_status->old_form)
2028             {
2029               if (cmp_status->state == COMPOSING_CHAR)
2030                 {
                       /* MSEQ: undo the offset to recover the real
                          leading byte or ASCII code.  */
2031                   if (c == 0xA0)
2032                     {
2033                       ONE_MORE_BYTE (c);
2034                       c -= 0x80;
2035                       if (c < 0)
2036                         goto invalid_code;
2037                     }
2038                   else
2039                     c -= 0x20;
2040                   mseq_found = 1;
2041                 }
2042               else
2043                 {
                       /* RULE: hand the raw byte back, negated.  */
2044                   *nbytes = src - src_base;
2045                   *nchars = consumed_chars;
2046                   return -c;
2047                 }
2048             }
2049           else
2050             goto invalid_code;
2051         }
2052
           /* Dispatch on the sequence length announced for the leading
              byte.  Every trailing byte must be >= 0xA0 and contributes
              its low 7 bits to CODE.  */
2053       switch (emacs_mule_bytes[c])
2054         {
2055         case 2:
2056           if ((charset_ID = emacs_mule_charset[c]) < 0)
2057             goto invalid_code;
2058           ONE_MORE_BYTE (c);
2059           if (c < 0xA0)
2060             goto invalid_code;
2061           code = c & 0x7F;
2062           break;
2063
2064         case 3:
2065           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2066               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2067             {
                   /* The charset is selected by the second byte.  */
2068               ONE_MORE_BYTE (c);
2069               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2070                 goto invalid_code;
2071               ONE_MORE_BYTE (c);
2072               if (c < 0xA0)
2073                 goto invalid_code;
2074               code = c & 0x7F;
2075             }
2076           else
2077             {
2078               if ((charset_ID = emacs_mule_charset[c]) < 0)
2079                 goto invalid_code;
2080               ONE_MORE_BYTE (c);
2081               if (c < 0xA0)
2082                 goto invalid_code;
2083               code = (c & 0x7F) << 8;
2084               ONE_MORE_BYTE (c);
2085               if (c < 0xA0)
2086                 goto invalid_code;
2087               code |= c & 0x7F;
2088             }
2089           break;
2090
2091         case 4:
               /* The charset is selected by the second byte.  */
2092           ONE_MORE_BYTE (c);
2093           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2094             goto invalid_code;
2095           ONE_MORE_BYTE (c);
2096           if (c < 0xA0)
2097             goto invalid_code;
2098           code = (c & 0x7F) << 8;
2099           ONE_MORE_BYTE (c);
2100           if (c < 0xA0)
2101             goto invalid_code;
2102           code |= c & 0x7F;
2103           break;
2104
2105         case 1:
2106           code = c;
2107           charset_ID = ASCII_CHAR_P (code) ? charset_ascii : charset_eight_bit;
2108           break;
2109
2110         default:
2111           emacs_abort ();
2112         }
2113       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2114                           CHARSET_FROM_ID (charset_ID), code, c);
2115       if (c < 0)
2116         goto invalid_code;
2117     }
2118   *nbytes = src - src_base;
2119   *nchars = consumed_chars;
2120   if (id)
2121     *id = charset_ID;
2122   return (mseq_found ? -c : c);
2123
2124  no_more_source:
2125   return -2;
2126
2127  invalid_code:
2128   return -1;
2129 }
2130
2131
2132 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2133
2134 /* Handle these composition sequence ('|': the end of header elements,
2135    BYTES and CHARS >= 0xA0):
2136
2137    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2138    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2139    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2140
2141    and these old form:
2142
2143    (4) relative composition: 0x80 | MSEQ ... MSEQ
2144    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2145
2146    When the starter 0x80 and the following header elements are found,
2147    this annotation header is produced.
2148
2149         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2150
2151    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2152    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2153
2154    Then, upon reading the following elements, these codes are produced
2155    until the composition end is found:
2156
2157    (1) CHAR ... CHAR
2158    (2) ALT ... ALT CHAR ... CHAR
2159    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2160    (4) CHAR ... CHAR
2161    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2162
2163    When the composition end is found, LENGTH and NCHARS in the
2164    annotation header is updated as below:
2165
2166    (1) LENGTH: unchanged, NCHARS: unchanged
2167    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2168    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2169    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2170    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2171
2172    If an error is found while composing, the annotation header is
2173    changed to the original composition header (plus filler -1s) as
2174    below:
2175
2176    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2177    (5)          [ 0x80 0xFF -1 -1 -1 ]
2178
2179    and the sequence [ -2 DECODED-RULE ] is changed to the original
2180    byte sequence as below:
2181         o the original byte sequence is B: [ B -1 ]
2182         o the original byte sequence is B1 B2: [ B1 B2 ]
2183
2184    Most of the routines are implemented by macros because many
2185    variables and labels in the caller decode_coding_emacs_mule must be
2186    accessible, and they are usually called just once (thus doesn't
2187    increase the size of compiled object).  */
2188
2189 /* Decode a composition rule represented by C as a component of
2190    composition sequence of Emacs 20 style.  Set RULE to the decoded
2191    rule.
        C is modified in place: 0xA0 is subtracted from it, and the
        result must lie in 0..80, otherwise control jumps to the label
        `invalid_code' of the expanding function.  The result is split
        into GREF = C / 9 and NREF = C % 9; a value of 4 in either is
        replaced by 10 before COMPOSITION_ENCODE_RULE is applied.  */
2192
2193 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2194   do {                                                  \
2195     int gref, nref;                                     \
2196                                                         \
2197     c -= 0xA0;                                          \
2198     if (c < 0 || c >= 81)                               \
2199       goto invalid_code;                                \
2200     gref = c / 9, nref = c % 9;                         \
2201     if (gref == 4) gref = 10;                           \
2202     if (nref == 4) nref = 10;                           \
2203     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2204   } while (0)
2205
2206
2207 /* Decode a composition rule represented by C and the following byte
2208    at SRC as a component of composition sequence of Emacs 21 style.
2209    Set RULE to the decoded rule.
        GREF is C - 0x20 and NREF is the next byte - 0x20; each must lie
        in 0..80, otherwise control jumps to the label `invalid_code' of
        the expanding function.  C is overwritten with the second byte,
        which is fetched with ONE_MORE_BYTE.  */
2210
2211 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2212   do {                                                  \
2213     int gref, nref;                                     \
2214                                                         \
2215     gref = c - 0x20;                                    \
2216     if (gref < 0 || gref >= 81)                         \
2217       goto invalid_code;                                \
2218     ONE_MORE_BYTE (c);                                  \
2219     nref = c - 0x20;                                    \
2220     if (nref < 0 || nref >= 81)                         \
2221       goto invalid_code;                                \
2222     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2223   } while (0)
2224
2225
2226 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2227    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2228    byte length of this composition information, CHARS is the number of
2229    characters composed by this composition.
        On entry the variable `c' of the expanding function already holds
        the METHOD byte; the macro itself reads the BYTES and CHARS bytes
        with ONE_MORE_BYTE.  Control jumps to `invalid_code' when the
        BYTES byte is negative, when BYTES - 0xA0 is below 3 (or differs
        from 4 for COMPOSITION_RELATIVE), or when CHARS - 0xA0 is not in
        1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise `cmp_status' is
        initialized (new form, state COMPOSING_CHAR for relative
        compositions and COMPOSING_COMPONENT_CHAR for the others, ncomps
        set to BYTES - 0xA0 - 4) and ADD_COMPOSITION_DATA is invoked on
        `charbuf'.  */
2230
2231 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2232   do {                                                                  \
2233     enum composition_method method = c - 0xF2;                          \
2234     int nbytes, nchars;                                                 \
2235                                                                         \
2236     ONE_MORE_BYTE (c);                                                  \
2237     if (c < 0)                                                          \
2238       goto invalid_code;                                                \
2239     nbytes = c - 0xA0;                                                  \
2240     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2241       goto invalid_code;                                                \
2242     ONE_MORE_BYTE (c);                                                  \
2243     nchars = c - 0xA0;                                                  \
2244     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2245       goto invalid_code;                                                \
2246     cmp_status->old_form = 0;                                           \
2247     cmp_status->method = method;                                        \
2248     if (method == COMPOSITION_RELATIVE)                                 \
2249       cmp_status->state = COMPOSING_CHAR;                               \
2250     else                                                                \
2251       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2252     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2253     cmp_status->nchars = nchars;                                        \
2254     cmp_status->ncomps = nbytes - 4;                                    \
2255     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2256   } while (0)
2257
2258
2259 /* Start of Emacs 20 style format for relative composition.
        Marks `cmp_status' as an old-form COMPOSITION_RELATIVE
        composition in state COMPOSING_CHAR with zero nchars and ncomps,
        sets its length to MAX_ANNOTATION_LENGTH, and invokes
        ADD_COMPOSITION_DATA on `charbuf' with both counts zero.  Reads
        no input.  */
2260
2261 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2262   do {                                                          \
2263     cmp_status->old_form = 1;                                   \
2264     cmp_status->method = COMPOSITION_RELATIVE;                  \
2265     cmp_status->state = COMPOSING_CHAR;                         \
2266     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2267     cmp_status->nchars = cmp_status->ncomps = 0;                \
2268     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2269   } while (0)
2270
2271
2272 /* Start of Emacs 20 style format for rule-base composition.
        Same as the relative variant except that the method recorded in
        `cmp_status' and passed to ADD_COMPOSITION_DATA is
        COMPOSITION_WITH_RULE.  Reads no input.  */
2273
2274 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2275   do {                                                          \
2276     cmp_status->old_form = 1;                                   \
2277     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2278     cmp_status->state = COMPOSING_CHAR;                         \
2279     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2280     cmp_status->nchars = cmp_status->ncomps = 0;                \
2281     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2282   } while (0)
2283
2284
     /* Read the byte that follows a composition starter and begin the
        matching kind of composition:
        - a byte in 0xF2 + COMPOSITION_RELATIVE ..
          0xF2 + COMPOSITION_WITH_RULE_ALTCHARS starts the Emacs 21 form;
        - a byte in 0xA0..0xBF starts an Emacs 20 relative composition,
          and SRC is rewound so that the byte is read again as the first
          component;
        - 0xFF starts an Emacs 20 rule-base composition.
        A negative byte or any other value jumps to `invalid_code'.  The
        macro uses the variables `c' and `src' of the expanding
        function.  */
2285 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2286   do {                                                  \
2287     const unsigned char *current_src = src;             \
2288                                                         \
2289     ONE_MORE_BYTE (c);                                  \
2290     if (c < 0)                                          \
2291       goto invalid_code;                                \
2292     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2293         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2294       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2295     else if (c < 0xA0)                                  \
2296       goto invalid_code;                                \
2297     else if (c < 0xC0)                                  \
2298       {                                                 \
2299         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2300         /* Re-read C as a composition component.  */    \
2301         src = current_src;                              \
2302       }                                                 \
2303     else if (c == 0xFF)                                 \
2304       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2305     else                                                \
2306       goto invalid_code;                                \
2307   } while (0)
2308
     /* Finish a successfully parsed composition.  The annotation header
        is addressed relative to the current `charbuf' pointer at offset
        -cmp_status->length.  For an old-form composition the element at
        header offset 2 is set to cmp_status->nchars; for a new-form
        composition whose method is greater than COMPOSITION_RELATIVE the
        first header element is set to (element at offset 2) minus
        cmp_status->length.  The state is then reset to COMPOSING_NO.
        NOTE(review): this assumes cmp_status->length counts the charbuf
        elements written since the header started -- confirm where the
        field is maintained.  */
2309 #define EMACS_MULE_COMPOSITION_END()                            \
2310   do {                                                          \
2311     int idx = - cmp_status->length;                             \
2312                                                                 \
2313     if (cmp_status->old_form)                                   \
2314       charbuf[idx + 2] = cmp_status->nchars;                    \
2315     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2316       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2317     cmp_status->state = COMPOSING_NO;                           \
2318   } while (0)
2319
2320
     /* Close the composition described by CMP_STATUS whose data ends just
        before CHARBUF; the annotation header is addressed at offset
        -CMP_STATUS->length from CHARBUF.

        If the composition is old-form and has collected at least one
        character, it is kept: the header element at offset 2 is set to
        the character count, and a dangling rule of a rule-base
        composition (state COMPOSING_CHAR) is turned back into its
        original byte followed by -1.

        Otherwise the header is overwritten in place.  For
        COMPOSITION_WITH_RULE the first four elements become 0x80 and
        0xFF (as eight-bit characters), -3 and 0; for other methods the
        five elements become 0x80, 0xF2 + method, the BYTES byte, the
        CHARS byte (all as eight-bit characters) and -1.
        NOTE(review): the overview comment earlier in this file shows
        three -1 fillers for the rule-base case, while the code stores
        -3 and 0 and leaves the fifth element untouched.

        Sets CMP_STATUS->state to COMPOSING_NO and returns 0, 1 or 4;
        EMACS_MULE_MAYBE_FINISH_COMPOSITION adds the result to its
        character offset.  */
2321 static int
2322 emacs_mule_finish_composition (int *charbuf,
2323                                struct composition_status *cmp_status)
2324 {
2325   int idx = - cmp_status->length;
2326   int new_chars;
2327
2328   if (cmp_status->old_form && cmp_status->nchars > 0)
2329     {
2330       charbuf[idx + 2] = cmp_status->nchars;
2331       new_chars = 0;
2332       if (cmp_status->method == COMPOSITION_WITH_RULE
2333           && cmp_status->state == COMPOSING_CHAR)
2334         {
2335           /* The last rule was invalid.  */
2336           int rule = charbuf[-1] + 0xA0;
2337
2338           charbuf[-2] = BYTE8_TO_CHAR (rule);
2339           charbuf[-1] = -1;
2340           new_chars = 1;
2341         }
2342     }
2343   else
2344     {
2345       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2346
2347       if (cmp_status->method == COMPOSITION_WITH_RULE)
2348         {
2349           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2350           charbuf[idx++] = -3;
2351           charbuf[idx++] = 0;
2352           new_chars = 1;
2353         }
2354       else
2355         {
               /* IDX now addresses header offset 1, so these read the
                  elements at header offsets 2 and 3 before the stores
                  below overwrite them.  */
2356           int nchars = charbuf[idx + 1] + 0xA0;
2357           int nbytes = charbuf[idx + 2] + 0xA0;
2358
2359           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2360           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2361           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2362           charbuf[idx++] = -1;
2363           new_chars = 4;
2364         }
2365     }
2366   cmp_status->state = COMPOSING_NO;
2367   return new_chars;
2368 }
2369
     /* If a composition is in progress, close it with
        emacs_mule_finish_composition and add the returned count to
        `char_offset'.  Uses the variables `cmp_status', `charbuf' and
        `char_offset' of the expanding function.  */
2370 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2371   do {                                                                    \
2372     if (cmp_status->state != COMPOSING_NO)                                \
2373       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2374   } while (0)
2375
2376
2377 static void
2378 decode_coding_emacs_mule (struct coding_system *coding)
2379 {
2380   const unsigned char *src = coding->source + coding->consumed;
2381   const unsigned char *src_end = coding->source + coding->src_bytes;
2382   const unsigned char *src_base;
2383   int *charbuf = coding->charbuf + coding->charbuf_used;
2384   /* We may produce two annotations (charset and composition) in one
2385      loop and one more charset annotation at the end.  */
2386   int *charbuf_end
2387     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2388       /* We can produce up to 2 characters in a loop.  */
2389       - 1;
2390   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2391   bool multibytep = coding->src_multibyte;
2392   ptrdiff_t char_offset = coding->produced_char;
2393   ptrdiff_t last_offset = char_offset;
2394   int last_id = charset_ascii;
2395   bool eol_dos
2396     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2397   int byte_after_cr = -1;
2398   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2399
2400   if (cmp_status->state != COMPOSING_NO)
2401     {
2402       int i;
2403
2404       if (charbuf_end - charbuf < cmp_status->length)
2405         emacs_abort ();
2406       for (i = 0; i < cmp_status->length; i++)
2407         *charbuf++ = cmp_status->carryover[i];
2408       coding->annotated = 1;
2409     }
2410
2411   while (1)
2412     {
2413       int c, id IF_LINT (= 0);
2414
2415       src_base = src;
2416       consumed_chars_base = consumed_chars;
2417
2418       if (charbuf >= charbuf_end)
2419         {
2420           if (byte_after_cr >= 0)
2421             src_base--;
2422           break;
2423         }
2424
2425       if (byte_after_cr >= 0)
2426         c = byte_after_cr, byte_after_cr = -1;
2427       else
2428         ONE_MORE_BYTE (c);
2429
2430       if (c < 0 || c == 0x80)
2431         {
2432           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2433           if (c < 0)
2434             {
2435               *charbuf++ = -c;
2436               char_offset++;
2437             }
2438           else
2439             DECODE_EMACS_MULE_COMPOSITION_START ();
2440           continue;
2441         }
2442
2443       if (c < 0x80)
2444         {
2445           if (eol_dos && c == '\r')
2446             ONE_MORE_BYTE (byte_after_cr);
2447           id = charset_ascii;
2448           if (cmp_status->state != COMPOSING_NO)
2449             {
2450               if (cmp_status->old_form)
2451                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2452               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2453                 cmp_status->ncomps--;
2454             }
2455         }
2456       else
2457         {
2458           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2459           /* emacs_mule_char can load a charset map from a file, which
2460              allocates a large structure and might cause buffer text
2461              to be relocated as result.  Thus, we need to remember the
2462              original pointer to buffer text, and fix up all related
2463              pointers after the call.  */
2464           const unsigned char *orig = coding->source;
2465           ptrdiff_t offset;
2466
2467           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2468                                cmp_status);
2469           offset = coding->source - orig;
2470           if (offset)
2471             {
2472               src += offset;
2473               src_base += offset;
2474               src_end += offset;
2475             }
2476           if (c < 0)
2477             {
2478               if (c == -1)
2479                 goto invalid_code;
2480               if (c == -2)
2481                 break;
2482             }
2483           src = src_base + nbytes;
2484           consumed_chars = consumed_chars_base + nchars;
2485           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2486             cmp_status->ncomps -= nchars;
2487         }
2488
2489       /* Now if C >= 0, we found a normally encoded character, if C <
2490          0, we found an old-style composition component character or
2491          rule.  */
2492
2493       if (cmp_status->state == COMPOSING_NO)
2494         {
2495           if (last_id != id)
2496             {
2497               if (last_id != charset_ascii)
2498                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2499                                   last_id);
2500               last_id = id;
2501               last_offset = char_offset;
2502             }
2503           *charbuf++ = c;
2504           char_offset++;
2505         }
2506       else if (cmp_status->state == COMPOSING_CHAR)
2507         {
2508           if (cmp_status->old_form)
2509             {
2510               if (c >= 0)
2511                 {
2512                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2513                   *charbuf++ = c;
2514                   char_offset++;
2515                 }
2516               else
2517                 {
2518                   *charbuf++ = -c;
2519                   cmp_status->nchars++;
2520                   cmp_status->length++;
2521                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2522                     EMACS_MULE_COMPOSITION_END ();
2523                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2524                     cmp_status->state = COMPOSING_RULE;
2525                 }
2526             }
2527           else
2528             {
2529               *charbuf++ = c;
2530               cmp_status->length++;
2531               cmp_status->nchars--;
2532               if (cmp_status->nchars == 0)
2533                 EMACS_MULE_COMPOSITION_END ();
2534             }
2535         }
2536       else if (cmp_status->state == COMPOSING_RULE)
2537         {
2538           int rule;
2539
2540           if (c >= 0)
2541             {
2542               EMACS_MULE_COMPOSITION_END ();
2543               *charbuf++ = c;
2544               char_offset++;
2545             }
2546           else
2547             {
2548               c = -c;
2549               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2550               if (rule < 0)
2551                 goto invalid_code;
2552               *charbuf++ = -2;
2553               *charbuf++ = rule;
2554               cmp_status->length += 2;
2555               cmp_status->state = COMPOSING_CHAR;
2556             }
2557         }
2558       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2559         {
2560           *charbuf++ = c;
2561           cmp_status->length++;
2562           if (cmp_status->ncomps == 0)
2563             cmp_status->state = COMPOSING_CHAR;
2564           else if (cmp_status->ncomps > 0)
2565             {
2566               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2567                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2568             }
2569           else
2570             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2571         }
2572       else                      /* COMPOSING_COMPONENT_RULE */
2573         {
2574           int rule;
2575
2576           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2577           if (rule < 0)
2578             goto invalid_code;
2579           *charbuf++ = -2;
2580           *charbuf++ = rule;
2581           cmp_status->length += 2;
2582           cmp_status->ncomps--;
2583           if (cmp_status->ncomps > 0)
2584             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2585           else
2586             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2587         }
2588       continue;
2589
2590     invalid_code:
2591       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2592       src = src_base;
2593       consumed_chars = consumed_chars_base;
2594       ONE_MORE_BYTE (c);
2595       *charbuf++ = ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
2596       char_offset++;
2597     }
2598
2599  no_more_source:
2600   if (cmp_status->state != COMPOSING_NO)
2601     {
2602       if (coding->mode & CODING_MODE_LAST_BLOCK)
2603         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2604       else
2605         {
2606           int i;
2607
2608           charbuf -= cmp_status->length;
2609           for (i = 0; i < cmp_status->length; i++)
2610             cmp_status->carryover[i] = charbuf[i];
2611         }
2612     }
2613   if (last_id != charset_ascii)
2614     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2615   coding->consumed_char += consumed_chars_base;
2616   coding->consumed = src_base - coding->source;
2617   coding->charbuf_used = charbuf - coding->charbuf;
2618 }
2619
2620
/* Store in CODES (an array of at least two bytes) the emacs-mule
   leading byte(s) for the charset whose emacs-mule id is ID.

   If ID is below 0xA0, ID itself is the only leading byte and
   CODES[1] is set to 0 to mean "no second leading byte".  Otherwise
   CODES[0] is one of the prefix bytes 0x9A..0x9D, selected by the
   range ID falls in, and CODES[1] is ID.

   ID and CODES are evaluated more than once, so they must not have
   side effects.  The expansion is a single statement with no trailing
   semicolon, so it can be used safely as the body of an unbraced
   `if ... else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2634
2635
2636 static bool
2637 encode_coding_emacs_mule (struct coding_system *coding)
2638 {
2639   bool multibytep = coding->dst_multibyte;
2640   int *charbuf = coding->charbuf;
2641   int *charbuf_end = charbuf + coding->charbuf_used;
2642   unsigned char *dst = coding->destination + coding->produced;
2643   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2644   int safe_room = 8;
2645   ptrdiff_t produced_chars = 0;
2646   Lisp_Object attrs, charset_list;
2647   int c;
2648   int preferred_charset_id = -1;
2649
2650   CODING_GET_INFO (coding, attrs, charset_list);
2651   if (! EQ (charset_list, Vemacs_mule_charset_list))
2652     {
2653       charset_list = Vemacs_mule_charset_list;
2654       ASET (attrs, coding_attr_charset_list, charset_list);
2655     }
2656
2657   while (charbuf < charbuf_end)
2658     {
2659       ASSURE_DESTINATION (safe_room);
2660       c = *charbuf++;
2661
2662       if (c < 0)
2663         {
2664           /* Handle an annotation.  */
2665           switch (*charbuf)
2666             {
2667             case CODING_ANNOTATE_COMPOSITION_MASK:
2668               /* Not yet implemented.  */
2669               break;
2670             case CODING_ANNOTATE_CHARSET_MASK:
2671               preferred_charset_id = charbuf[3];
2672               if (preferred_charset_id >= 0
2673                   && NILP (Fmemq (make_number (preferred_charset_id),
2674                                   charset_list)))
2675                 preferred_charset_id = -1;
2676               break;
2677             default:
2678               emacs_abort ();
2679             }
2680           charbuf += -c - 1;
2681           continue;
2682         }
2683
2684       if (ASCII_CHAR_P (c))
2685         EMIT_ONE_ASCII_BYTE (c);
2686       else if (CHAR_BYTE8_P (c))
2687         {
2688           c = CHAR_TO_BYTE8 (c);
2689           EMIT_ONE_BYTE (c);
2690         }
2691       else
2692         {
2693           struct charset *charset;
2694           unsigned code;
2695           int dimension;
2696           int emacs_mule_id;
2697           unsigned char leading_codes[2];
2698
2699           if (preferred_charset_id >= 0)
2700             {
2701               bool result;
2702
2703               charset = CHARSET_FROM_ID (preferred_charset_id);
2704               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2705               if (result)
2706                 code = ENCODE_CHAR (charset, c);
2707               else
2708                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2709                                      &code, charset);
2710             }
2711           else
2712             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2713                                  &code, charset);
2714           if (! charset)
2715             {
2716               c = coding->default_char;
2717               if (ASCII_CHAR_P (c))
2718                 {
2719                   EMIT_ONE_ASCII_BYTE (c);
2720                   continue;
2721                 }
2722               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2723                                    &code, charset);
2724             }
2725           dimension = CHARSET_DIMENSION (charset);
2726           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2727           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2728           EMIT_ONE_BYTE (leading_codes[0]);
2729           if (leading_codes[1])
2730             EMIT_ONE_BYTE (leading_codes[1]);
2731           if (dimension == 1)
2732             EMIT_ONE_BYTE (code | 0x80);
2733           else
2734             {
2735               code |= 0x8080;
2736               EMIT_ONE_BYTE (code >> 8);
2737               EMIT_ONE_BYTE (code & 0xFF);
2738             }
2739         }
2740     }
2741   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2742   coding->produced_char += produced_chars;
2743   coding->produced = dst - coding->destination;
2744   return 0;
2745 }
2746
2747 \f
2748 /*** 7. ISO2022 handlers ***/
2749
2750 /* The following note describes the coding system ISO2022 briefly.
2751    Since the intention of this note is to help understand the
2752    functions in this file, some parts are NOT ACCURATE or are OVERLY
2753    SIMPLIFIED.  For thorough understanding, please refer to the
2754    original document of ISO2022.  This is equivalent to the standard
2755    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2756
2757    ISO2022 provides many mechanisms to encode several character sets
2758    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2759    is encoded using bytes less than 128.  This may make the encoded
2760    text a little bit longer, but the text passes more easily through
2761    several types of gateway, some of which strip off the MSB (Most
2762    Significant Bit).
2763
2764    There are two kinds of character sets: control character sets and
2765    graphic character sets.  The former contain control characters such
2766    as `newline' and `escape' to provide control functions (control
2767    functions are also provided by escape sequences).  The latter
2768    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2769    two control character sets and many graphic character sets.
2770
2771    Graphic character sets are classified into one of the following
2772    four classes, according to the number of bytes (DIMENSION) and
2773    number of characters in one dimension (CHARS) of the set:
2774    - DIMENSION1_CHARS94
2775    - DIMENSION1_CHARS96
2776    - DIMENSION2_CHARS94
2777    - DIMENSION2_CHARS96
2778
2779    In addition, each character set is assigned an identification tag,
2780    unique for each set, called the "final character" (denoted as <F>
2781    hereafter).  The <F> of each character set is decided by ECMA(*)
2782    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2783    (0x30..0x3F are for private use only).
2784
2785    Note (*): ECMA = European Computer Manufacturers Association
2786
2787    Here are examples of graphic character sets [NAME(<F>)]:
2788         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2789         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2790         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2791         o DIMENSION2_CHARS96 -- none for the moment
2792
2793    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2794         C0 [0x00..0x1F] -- control character plane 0
2795         GL [0x20..0x7F] -- graphic character plane 0
2796         C1 [0x80..0x9F] -- control character plane 1
2797         GR [0xA0..0xFF] -- graphic character plane 1
2798
2799    A control character set is directly designated and invoked to C0 or
2800    C1 by an escape sequence.  The most common case is that:
2801    - ISO646's  control character set is designated/invoked to C0, and
2802    - ISO6429's control character set is designated/invoked to C1,
2803    and usually these designations/invocations are omitted in encoded
2804    text.  In a 7-bit environment, only C0 can be used, and a control
2805    character for C1 is encoded by an appropriate escape sequence to
2806    fit into the environment.  All control characters for C1 are
2807    defined to have corresponding escape sequences.
2808
2809    A graphic character set is at first designated to one of four
2810    graphic registers (G0 through G3), then these graphic registers are
2811    invoked to GL or GR.  These designations and invocations can be
2812    done independently.  The most common case is that G0 is invoked to
2813    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2814    these invocations and designations are omitted in encoded text.
2815    In a 7-bit environment, only GL can be used.
2816
2817    When a graphic character set of CHARS94 is invoked to GL, codes
2818    0x20 and 0x7F of the GL area work as control characters SPACE and
2819    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2820    be used.
2821
2822    There are two ways of invocation: locking-shift and single-shift.
2823    With locking-shift, the invocation lasts until the next different
2824    invocation, whereas with single-shift, the invocation affects the
2825    following character only and doesn't affect the locking-shift
2826    state.  Invocations are done by the following control characters or
2827    escape sequences:
2828
2829    ----------------------------------------------------------------------
2830    abbrev  function                  cntrl escape seq   description
2831    ----------------------------------------------------------------------
2832    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2833    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2834    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2835    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2836    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2837    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2838    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2839    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2840    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2841    ----------------------------------------------------------------------
2842    (*) These are not used by any known coding system.
2843
2844    Control characters for these functions are defined by macros
2845    ISO_CODE_XXX in `coding.h'.
2846
2847    Designations are done by the following escape sequences:
2848    ----------------------------------------------------------------------
2849    escape sequence      description
2850    ----------------------------------------------------------------------
2851    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2852    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2853    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2854    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2855    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2856    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2857    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2858    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2859    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2860    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2861    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2862    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2863    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2864    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2865    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2866    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2867    ----------------------------------------------------------------------
2868
2869    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2870    of dimension 1, chars 94, and final character <F>, etc...
2871
2872    Note (*): Although these designations are not allowed in ISO2022,
2873    Emacs accepts them on decoding, and produces them on encoding
2874    CHARS96 character sets in a coding system which is characterized as
2875    7-bit environment, non-locking-shift, and non-single-shift.
2876
2877    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2878    '(' must be omitted.  We refer to this as "short-form" hereafter.
2879
2880    Now you may notice that there are a lot of ways of encoding the
2881    same multilingual text in ISO2022.  Actually, there exist many
2882    coding systems such as Compound Text (used in X11's inter-client
2883    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2884    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2885    localized platforms), and all of these are variants of ISO2022.
2886
2887    In addition to the above, Emacs handles two more kinds of escape
2888    sequences: ISO6429's direction specification and Emacs' private
2889    sequence for specifying character composition.
2890
2891    ISO6429's direction specification takes the following form:
2892         o CSI ']'      -- end of the current direction
2893         o CSI '0' ']'  -- end of the current direction
2894         o CSI '1' ']'  -- start of left-to-right text
2895         o CSI '2' ']'  -- start of right-to-left text
2896    The control character CSI (0x9B: control sequence introducer) is
2897    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2898
2899    Character composition specification takes the following form:
2900         o ESC '0' -- start relative composition
2901         o ESC '1' -- end composition
2902         o ESC '2' -- start rule-base composition (*)
2903         o ESC '3' -- start relative composition with alternate chars  (**)
2904         o ESC '4' -- start rule-base composition with alternate chars  (**)
2905   Since these are not standard escape sequences of any ISO standard,
2906   the use of them with these meanings is restricted to Emacs only.
2907
2908   (*) This form is used only in Emacs 20.7 and older versions,
2909   but newer versions can safely decode it.
2910   (**) This form is used only in Emacs 21.1 and newer versions,
2911   and older versions can't decode it.
2912
2913   Here's a list of example usages of these composition escape
2914   sequences (categorized by `enum composition_method').
2915
2916   COMPOSITION_RELATIVE:
2917         ESC 0 CHAR [ CHAR ] ESC 1
2918   COMPOSITION_WITH_RULE:
2919         ESC 2 CHAR [ RULE CHAR ] ESC 1
2920   COMPOSITION_WITH_ALTCHARS:
2921         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2922   COMPOSITION_WITH_RULE_ALTCHARS:
2923         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2924
/* Table of 256 `enum iso_code_class_type' values, presumably one per
   possible byte value and indexed by the byte itself.  It has no
   initializer here, so it starts out zeroed and must be filled in
   elsewhere before use.  */
static enum iso_code_class_type iso_code_class[256];
2926
/* Evaluate to 1 if the charset whose id is ID is marked safe in
   CODING, else 0.  A charset is not safe if ID is greater than
   CODING->max_charset_id or if its entry in CODING->safe_charsets is
   255.  The table is not consulted for an ID above the maximum.  ID
   and CODING are evaluated more than once.  */
#define SAFE_CHARSET_P(coding, id)                      \
  (! ((id) > (coding)->max_charset_id                   \
      || (coding)->safe_charsets[id] == 255))
2930
2931 static void
2932 setup_iso_safe_charsets (Lisp_Object attrs)
2933 {
2934   Lisp_Object charset_list, safe_charsets;
2935   Lisp_Object request;
2936   Lisp_Object reg_usage;
2937   Lisp_Object tail;
2938   EMACS_INT reg94, reg96;
2939   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2940   int max_charset_id;
2941
2942   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2943   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2944       && ! EQ (charset_list, Viso_2022_charset_list))
2945     {
2946       charset_list = Viso_2022_charset_list;
2947       ASET (attrs, coding_attr_charset_list, charset_list);
2948       ASET (attrs, coding_attr_safe_charsets, Qnil);
2949     }
2950
2951   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2952     return;
2953
2954   max_charset_id = 0;
2955   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2956     {
2957       int id = XINT (XCAR (tail));
2958       if (max_charset_id < id)
2959         max_charset_id = id;
2960     }
2961
2962   safe_charsets = make_uninit_string (max_charset_id + 1);
2963   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2964   request = AREF (attrs, coding_attr_iso_request);
2965   reg_usage = AREF (attrs, coding_attr_iso_usage);
2966   reg94 = XINT (XCAR (reg_usage));
2967   reg96 = XINT (XCDR (reg_usage));
2968
2969   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2970     {
2971       Lisp_Object id;
2972       Lisp_Object reg;
2973       struct charset *charset;
2974
2975       id = XCAR (tail);
2976       charset = CHARSET_FROM_ID (XINT (id));
2977       reg = Fcdr (Fassq (id, request));
2978       if (! NILP (reg))
2979         SSET (safe_charsets, XINT (id), XINT (reg));
2980       else if (charset->iso_chars_96)
2981         {
2982           if (reg96 < 4)
2983             SSET (safe_charsets, XINT (id), reg96);
2984         }
2985       else
2986         {
2987           if (reg94 < 4)
2988             SSET (safe_charsets, XINT (id), reg94);
2989         }
2990     }
2991   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2992 }
2993
2994
2995 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2996    Return true if a text is encoded in one of ISO-2022 based coding
2997    systems.  */
2998
2999 static bool
3000 detect_coding_iso_2022 (struct coding_system *coding,
3001                         struct coding_detection_info *detect_info)
3002 {
3003   const unsigned char *src = coding->source, *src_base = src;
3004   const unsigned char *src_end = coding->source + coding->src_bytes;
3005   bool multibytep = coding->src_multibyte;
3006   bool single_shifting = 0;
3007   int id;
3008   int c, c1;
3009   ptrdiff_t consumed_chars = 0;
3010   int i;
3011   int rejected = 0;
3012   int found = 0;
3013   int composition_count = -1;
3014
3015   detect_info->checked |= CATEGORY_MASK_ISO;
3016
3017   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3018     {
3019       struct coding_system *this = &(coding_categories[i]);
3020       Lisp_Object attrs, val;
3021
3022       if (this->id < 0)
3023         continue;
3024       attrs = CODING_ID_ATTRS (this->id);
3025       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3026           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3027         setup_iso_safe_charsets (attrs);
3028       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3029       this->max_charset_id = SCHARS (val) - 1;
3030       this->safe_charsets = SDATA (val);
3031     }
3032
3033   /* A coding system of this category is always ASCII compatible.  */
3034   src += coding->head_ascii;
3035
3036   while (rejected != CATEGORY_MASK_ISO)
3037     {
3038       src_base = src;
3039       ONE_MORE_BYTE (c);
3040       switch (c)
3041         {
3042         case ISO_CODE_ESC:
3043           if (inhibit_iso_escape_detection)
3044             break;
3045           single_shifting = 0;
3046           ONE_MORE_BYTE (c);
3047           if (c == 'N' || c == 'O')
3048             {
3049               /* ESC <Fe> for SS2 or SS3.  */
3050               single_shifting = 1;
3051               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3052             }
3053           else if (c == '1')
3054             {
3055               /* End of composition.  */
3056               if (composition_count < 0
3057                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3058                 /* Invalid */
3059                 break;
3060               composition_count = -1;
3061               found |= CATEGORY_MASK_ISO;
3062             }
3063           else if (c >= '0' && c <= '4')
3064             {
3065               /* ESC <Fp> for start/end composition.  */
3066               composition_count = 0;
3067             }
3068           else
3069             {
3070               if (c >= '(' && c <= '/')
3071                 {
3072                   /* Designation sequence for a charset of dimension 1.  */
3073                   ONE_MORE_BYTE (c1);
3074                   if (c1 < ' ' || c1 >= 0x80
3075                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3076                     {
3077                       /* Invalid designation sequence.  Just ignore.  */
3078                       if (c1 >= 0x80)
3079                         rejected |= (CATEGORY_MASK_ISO_7BIT
3080                                      | CATEGORY_MASK_ISO_7_ELSE);
3081                       break;
3082                     }
3083                 }
3084               else if (c == '$')
3085                 {
3086                   /* Designation sequence for a charset of dimension 2.  */
3087                   ONE_MORE_BYTE (c);
3088                   if (c >= '@' && c <= 'B')
3089                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3090                     id = iso_charset_table[1][0][c];
3091                   else if (c >= '(' && c <= '/')
3092                     {
3093                       ONE_MORE_BYTE (c1);
3094                       if (c1 < ' ' || c1 >= 0x80
3095                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3096                         {
3097                           /* Invalid designation sequence.  Just ignore.  */
3098                           if (c1 >= 0x80)
3099                             rejected |= (CATEGORY_MASK_ISO_7BIT
3100                                          | CATEGORY_MASK_ISO_7_ELSE);
3101                           break;
3102                         }
3103                     }
3104                   else
3105                     {
3106                       /* Invalid designation sequence.  Just ignore it.  */
3107                       if (c >= 0x80)
3108                         rejected |= (CATEGORY_MASK_ISO_7BIT
3109                                      | CATEGORY_MASK_ISO_7_ELSE);
3110                       break;
3111                     }
3112                 }
3113               else
3114                 {
3115                   /* Invalid escape sequence.  Just ignore it.  */
3116                   if (c >= 0x80)
3117                     rejected |= (CATEGORY_MASK_ISO_7BIT
3118                                  | CATEGORY_MASK_ISO_7_ELSE);
3119                   break;
3120                 }
3121
3122               /* We found a valid designation sequence for CHARSET.  */
3123               rejected |= CATEGORY_MASK_ISO_8BIT;
3124               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3125                                   id))
3126                 found |= CATEGORY_MASK_ISO_7;
3127               else
3128                 rejected |= CATEGORY_MASK_ISO_7;
3129               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3130                                   id))
3131                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3132               else
3133                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3134               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3135                                   id))
3136                 found |= CATEGORY_MASK_ISO_7_ELSE;
3137               else
3138                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3139               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3140                                   id))
3141                 found |= CATEGORY_MASK_ISO_8_ELSE;
3142               else
3143                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3144             }
3145           break;
3146
3147         case ISO_CODE_SO:
3148         case ISO_CODE_SI:
3149           /* Locking shift out/in.  */
3150           if (inhibit_iso_escape_detection)
3151             break;
3152           single_shifting = 0;
3153           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3154           break;
3155
3156         case ISO_CODE_CSI:
3157           /* Control sequence introducer.  */
3158           single_shifting = 0;
3159           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3160           found |= CATEGORY_MASK_ISO_8_ELSE;
3161           goto check_extra_latin;
3162
3163         case ISO_CODE_SS2:
3164         case ISO_CODE_SS3:
3165           /* Single shift.   */
3166           if (inhibit_iso_escape_detection)
3167             break;
3168           single_shifting = 0;
3169           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3170           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3171               & CODING_ISO_FLAG_SINGLE_SHIFT)
3172             {
3173               found |= CATEGORY_MASK_ISO_8_1;
3174               single_shifting = 1;
3175             }
3176           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3177               & CODING_ISO_FLAG_SINGLE_SHIFT)
3178             {
3179               found |= CATEGORY_MASK_ISO_8_2;
3180               single_shifting = 1;
3181             }
3182           if (single_shifting)
3183             break;
3184           goto check_extra_latin;
3185
3186         default:
3187           if (c < 0)
3188             continue;
3189           if (c < 0x80)
3190             {
3191               if (composition_count >= 0)
3192                 composition_count++;
3193               single_shifting = 0;
3194               break;
3195             }
3196           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3197           if (c >= 0xA0)
3198             {
3199               found |= CATEGORY_MASK_ISO_8_1;
3200               /* Check the length of succeeding codes of the range
3201                  0xA0..0FF.  If the byte length is even, we include
3202                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3203                  only when we are not single shifting.  */
3204               if (! single_shifting
3205                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3206                 {
3207                   ptrdiff_t len = 1;
3208                   while (src < src_end)
3209                     {
3210                       src_base = src;
3211                       ONE_MORE_BYTE (c);
3212                       if (c < 0xA0)
3213                         {
3214                           src = src_base;
3215                           break;
3216                         }
3217                       len++;
3218                     }
3219
3220                   if (len & 1 && src < src_end)
3221                     {
3222                       rejected |= CATEGORY_MASK_ISO_8_2;
3223                       if (composition_count >= 0)
3224                         composition_count += len;
3225                     }
3226                   else
3227                     {
3228                       found |= CATEGORY_MASK_ISO_8_2;
3229                       if (composition_count >= 0)
3230                         composition_count += len / 2;
3231                     }
3232                 }
3233               break;
3234             }
3235         check_extra_latin:
3236           if (! VECTORP (Vlatin_extra_code_table)
3237               || NILP (AREF (Vlatin_extra_code_table, c)))
3238             {
3239               rejected = CATEGORY_MASK_ISO;
3240               break;
3241             }
3242           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3243               & CODING_ISO_FLAG_LATIN_EXTRA)
3244             found |= CATEGORY_MASK_ISO_8_1;
3245           else
3246             rejected |= CATEGORY_MASK_ISO_8_1;
3247           rejected |= CATEGORY_MASK_ISO_8_2;
3248           break;
3249         }
3250     }
3251   detect_info->rejected |= CATEGORY_MASK_ISO;
3252   return 0;
3253
3254  no_more_source:
3255   detect_info->rejected |= rejected;
3256   detect_info->found |= (found & ~rejected);
3257   return 1;
3258 }
3259
3260
3261 /* Set the designation of register REG in CODING (a variable of the expansion
3262    site) from FINAL, DIM and CHARS_96.  CHARS_96 is set to -1 if the escape sequence should be kept.  */
3263 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3264   do {                                                                  \
3265     int id, prev;                                                       \
3266     /* Reject a bad FINAL, a negative table entry, or an unsafe charset.  */ \
3267     if (final < '0' || final >= 128                                     \
3268         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3269         || !SAFE_CHARSET_P (coding, id))                                \
3270       {                                                                 \
3271         CODING_ISO_DESIGNATION (coding, reg) = -2; /* invalid mark */   \
3272         chars_96 = -1;                                                  \
3273         break;                  /* leaves the do-while of this macro */ \
3274       }                                                                 \
3275     prev = CODING_ISO_DESIGNATION (coding, reg); /* checked below */    \
3276     if (id == charset_jisx0201_roman)                                   \
3277       {                                                                 \
3278         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3279           id = charset_ascii;   /* flag substitutes ASCII */            \
3280       }                                                                 \
3281     else if (id == charset_jisx0208_1978)                               \
3282       {                                                                 \
3283         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3284           id = charset_jisx0208; /* flag substitutes JISX0208 */        \
3285       }                                                                 \
3286     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3287     /* If there was an invalid designation to REG previously, and this  \
3288        designation is ASCII to REG, we should keep this designation     \
3289        sequence.  */                                                    \
3290     if (prev == -2 && id == charset_ascii)                              \
3291       chars_96 = -1;                                                    \
3292   } while (0)
3293
3294
3295 /* Handle these composition sequences (ALT: alternate char):
3296
3297    (1) relative composition: ESC 0 CHAR ... ESC 1
3298    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3299    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3300    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3301
3302    When the start sequence (ESC 0/2/3/4) is found, this annotation
3303    header is produced.
3304
3305         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3306
3307    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3308    produced until the end sequence (ESC 1) is found:
3309
3310    (1) CHAR ... CHAR
3311    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3312    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3313    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3314
3315    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3316    annotation header are updated as below:
3317
3318    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3319    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3320    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3321    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3322
3323    If an error is found while composing, the annotation header is
3324    changed to:
3325
3326         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3327
3328    and the sequence [ -2 DECODED-RULE ] is changed to the original
3329    byte sequence as below:
3330         o the original byte sequence is B: [ B -1 ]
3331         o the original byte sequence is B1 B2: [ B1 B2 ]
3332    and the sequence [ -1 -1 ] is changed to the original byte
3333    sequence:
3334         [ ESC '0' ]
3335 */
3336
3337 /* Decode the composition rule that starts with byte C1 (a variable of
3338    the expansion site) and store its encoded form in RULE, which must be
3339    an assignable int.  If C1 - 32 is below 81 the rule is that single byte;
3340    otherwise one more byte is read.  If the rule is invalid, goto invalid_code.  */
3341 #define DECODE_COMPOSITION_RULE(rule)                                   \
3342   do {                                                                  \
3343     rule = c1 - 32;             /* rule bytes are offset by 32 */       \
3344     if (rule < 0)                                                       \
3345       goto invalid_code;                                                \
3346     if (rule < 81)              /* old format (before ver.21) */        \
3347       {                                                                 \
3348         int gref = (rule) / 9;  /* one byte packs two base-9 digits */  \
3349         int nref = (rule) % 9;                                          \
3350         if (gref == 4) gref = 10;                                       \
3351         if (nref == 4) nref = 10;                                       \
3352         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3353       }                                                                 \
3354     else                        /* new format (after ver.21) */         \
3355       {                                                                 \
3356         int b;                                                          \
3357         /* B is the second rule byte, also offset by 32.  */            \
3358         ONE_MORE_BYTE (b);      /* NOTE(review): may jump out; confirm */ \
3359         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3360           goto invalid_code;                                            \
3361         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3362         rule += 0x100;   /* Distinguish it from the old format.  */     \
3363       }                                                                 \
3364   } while (0)
3365 /* Turn the stored pair [ -2 RULE ] at charbuf[idx] back into rule byte(s).  */
3366 #define ENCODE_COMPOSITION_RULE(rule)                           \
3367   do {                                                          \
3368     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3369     /* charbuf, idx, new_chars belong to the expansion site.  */ \
3370     if (rule < 0x100)           /* old format */                \
3371       {                                                         \
3372         if (gref == 10) gref = 4;                               \
3373         if (nref == 10) nref = 4;                               \
3374         charbuf[idx] = 32 + gref * 9 + nref;                    \
3375         charbuf[idx + 1] = -1;  /* one byte only: [ B -1 ] */   \
3376         new_chars++;                                            \
3377       }                                                         \
3378     else                                /* new format */        \
3379       {                                                         \
3380         charbuf[idx] = 32 + 81 + gref;  /* [ B1 B2 ] */         \
3381         charbuf[idx + 1] = 32 + nref;                           \
3382         new_chars += 2;                                         \
3383       }                                                         \
3384   } while (0)
3385
3386 /* Finish the current composition as invalid.  CHARBUF points just past the
3387    cmp_status->length slots of the composition, which are rewritten in place.  */
3388 static int
3389 finish_composition (int *charbuf, struct composition_status *cmp_status)
3390 {
3391   int idx = - cmp_status->length;  /* negative index of the first header slot */
3392   int new_chars;
3393   /* Returns nchars plus the rule bytes and ESC '0' codes restored below.  */
3394   /* Recover the original ESC sequence in the five header slots.  */
3395   charbuf[idx++] = ISO_CODE_ESC;
3396   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3397                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3398                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3399                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3400                     : '4');
3401   charbuf[idx++] = -2;
3402   charbuf[idx++] = 0;
3403   charbuf[idx++] = -1;
3404   new_chars = cmp_status->nchars;
3405   if (cmp_status->method >= COMPOSITION_WITH_RULE)  /* others are not scanned */
3406     for (; idx < 0; idx++)
3407       {
3408         int elt = charbuf[idx];
3409
3410         if (elt == -2)  /* [ -2 RULE ] pair */
3411           {
3412             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);  /* uses charbuf, idx, new_chars */
3413             idx++;  /* with the loop's idx++, steps over both slots */
3414           }
3415         else if (elt == -1)  /* [ -1 -1 ] pair becomes ESC '0' */
3416           {
3417             charbuf[idx++] = ISO_CODE_ESC;
3418             charbuf[idx] = '0';
3419             new_chars += 2;
3420           }
3421       }
3422   cmp_status->state = COMPOSING_NO;
3423   return new_chars;
3424 }
3425
3426 /* If a composition is in progress, finish it as invalid and advance char_offset.  */
3427 #define MAYBE_FINISH_COMPOSITION()                              \
3428   do { /* cmp_status, charbuf, char_offset: expansion site */   \
3429     if (cmp_status->state != COMPOSING_NO)                      \
3430       char_offset += finish_composition (charbuf, cmp_status);  \
3431   } while (0)
3432
3433 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4;
3434    C1 is the byte that followed ESC.
3435    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3436    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3437    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3438    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3439    An ESC 0 that ends the component part of an ESC 3 / ESC 4 sequence only
3440    appends [ -1 -1 ].  Any other start first finishes a pending composition,
3441    then produces this annotation sequence:
3442    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3443    NOTE(review): five-slot layout taken from the composition format comment
3444    earlier in this file; confirm against ADD_COMPOSITION_DATA.  */
3445 #define DECODE_COMPOSITION_START(c1)                                       \
3446   do {                                                                     \
3447     if (c1 == '0'                                                          \
3448         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3449              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3450             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3451                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3452       {                                                                    \
3453         *charbuf++ = -1;        /* [ -1 -1 ] separates ALTs from CHARs */  \
3454         *charbuf++= -1;                                                    \
3455         cmp_status->state = COMPOSING_CHAR;                                \
3456         cmp_status->length += 2;                                           \
3457       }                                                                    \
3458     else                                                                   \
3459       {                                                                    \
3460         MAYBE_FINISH_COMPOSITION ();                                       \
3461         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3462                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3463                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3464                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3465         cmp_status->state       /* ESC 0/2: CHARs first; 3/4: components */ \
3466           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3467         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3468         cmp_status->length = MAX_ANNOTATION_LENGTH; /* presumably header size */ \
3469         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3470         coding->annotated = 1;                                             \
3471       }                                                                    \
3472   } while (0)
3473
3474
3475 /* Handle composition end sequence ESC 1: patch the annotation header and leave the
3476    composing state.  A composition that cannot end here is finished as invalid, then goto invalid_code.  */
3477 #define DECODE_COMPOSITION_END()                                        \
3478   do {                                                                  \
3479     if (cmp_status->nchars == 0 /* no CHAR stored yet */                \
3480         || ((cmp_status->state == COMPOSING_CHAR)                       \
3481             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3482       { /* or: WITH_RULE still in COMPOSING_CHAR, other methods not in it */ \
3483         MAYBE_FINISH_COMPOSITION ();                                    \
3484         goto invalid_code;                                              \
3485       }                                                                 \
3486     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3487       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2; /* slot holds -LENGTH */ \
3488     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3489       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3490     charbuf[- cmp_status->length + 2] = cmp_status->nchars; /* NCHARS slot */ \
3491     char_offset += cmp_status->nchars;                                  \
3492     cmp_status->state = COMPOSING_NO;                                   \
3493   } while (0)
3494
3495 /* Store a composition rule RULE in charbuf as the pair [ -2 RULE ], and
3496    update cmp_status: two more slots, and the state steps back by one.  */
3497 #define STORE_COMPOSITION_RULE(rule)    \
3498   do {                                  \
3499     *charbuf++ = -2;                    \
3500     *charbuf++ = rule;                  \
3501     cmp_status->length += 2;            \
3502     cmp_status->state--; /* presumably *_RULE -> *_CHAR by enum order; TODO confirm */ \
3503   } while (0)
3504
3505 /* Store a composed char or a component char C in charbuf, and update
3506    cmp_status: C counts toward nchars in state COMPOSING_CHAR and toward
3507    ncomps otherwise; where a rule is expected next, the state steps forward by one.  */
3508 #define STORE_COMPOSITION_CHAR(c)                                       \
3509   do {                                                                  \
3510     *charbuf++ = (c);                                                   \
3511     cmp_status->length++;                                               \
3512     if (cmp_status->state == COMPOSING_CHAR)                            \
3513       cmp_status->nchars++;                                             \
3514     else                                                                \
3515       cmp_status->ncomps++;                                             \
3516     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3517         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3518             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3519       cmp_status->state++; /* presumably *_CHAR -> *_RULE by enum order; TODO confirm */ \
3520   } while (0)
3521
3522
3523 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3524
3525 static void
3526 decode_coding_iso_2022 (struct coding_system *coding)
3527 {
3528   const unsigned char *src = coding->source + coding->consumed;
3529   const unsigned char *src_end = coding->source + coding->src_bytes;
3530   const unsigned char *src_base;
3531   int *charbuf = coding->charbuf + coding->charbuf_used;
3532   /* We may produce two annotations (charset and composition) in one
3533      loop and one more charset annotation at the end.  */
3534   int *charbuf_end
3535     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3536   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3537   bool multibytep = coding->src_multibyte;
3538   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3539   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3540   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3541   int charset_id_2, charset_id_3;
3542   struct charset *charset;
3543   int c;
3544   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3545   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3546   ptrdiff_t char_offset = coding->produced_char;
3547   ptrdiff_t last_offset = char_offset;
3548   int last_id = charset_ascii;
3549   bool eol_dos
3550     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3551   int byte_after_cr = -1;
3552   int i;
3553
3554   setup_iso_safe_charsets (attrs);
3555   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3556
3557   if (cmp_status->state != COMPOSING_NO)
3558     {
3559       if (charbuf_end - charbuf < cmp_status->length)
3560         emacs_abort ();
3561       for (i = 0; i < cmp_status->length; i++)
3562         *charbuf++ = cmp_status->carryover[i];
3563       coding->annotated = 1;
3564     }
3565
3566   while (1)
3567     {
3568       int c1, c2, c3;
3569
3570       src_base = src;
3571       consumed_chars_base = consumed_chars;
3572
3573       if (charbuf >= charbuf_end)
3574         {
3575           if (byte_after_cr >= 0)
3576             src_base--;
3577           break;
3578         }
3579
3580       if (byte_after_cr >= 0)
3581         c1 = byte_after_cr, byte_after_cr = -1;
3582       else
3583         ONE_MORE_BYTE (c1);
3584       if (c1 < 0)
3585         goto invalid_code;
3586
3587       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3588         {
3589           *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3590           char_offset++;
3591           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3592           continue;
3593         }
3594
3595       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3596         {
3597           if (c1 == ISO_CODE_ESC)
3598             {
3599               if (src + 1 >= src_end)
3600                 goto no_more_source;
3601               *charbuf++ = ISO_CODE_ESC;
3602               char_offset++;
3603               if (src[0] == '%' && src[1] == '@')
3604                 {
3605                   src += 2;
3606                   consumed_chars += 2;
3607                   char_offset += 2;
3608                   /* We are sure charbuf can contain two more chars. */
3609                   *charbuf++ = '%';
3610                   *charbuf++ = '@';
3611                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3612                 }
3613             }
3614           else
3615             {
3616               *charbuf++ = ASCII_CHAR_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3617               char_offset++;
3618             }
3619           continue;
3620         }
3621
3622       if ((cmp_status->state == COMPOSING_RULE
3623            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3624           && c1 != ISO_CODE_ESC)
3625         {
3626           int rule;
3627
3628           DECODE_COMPOSITION_RULE (rule);
3629           STORE_COMPOSITION_RULE (rule);
3630           continue;
3631         }
3632
3633       /* We produce at most one character.  */
3634       switch (iso_code_class [c1])
3635         {
3636         case ISO_0x20_or_0x7F:
3637           if (charset_id_0 < 0
3638               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3639             /* This is SPACE or DEL.  */
3640             charset = CHARSET_FROM_ID (charset_ascii);
3641           else
3642             charset = CHARSET_FROM_ID (charset_id_0);
3643           break;
3644
3645         case ISO_graphic_plane_0:
3646           if (charset_id_0 < 0)
3647             charset = CHARSET_FROM_ID (charset_ascii);
3648           else
3649             charset = CHARSET_FROM_ID (charset_id_0);
3650           break;
3651
3652         case ISO_0xA0_or_0xFF:
3653           if (charset_id_1 < 0
3654               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3655               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3656             goto invalid_code;
3657           /* This is a graphic character, we fall down ... */
3658
3659         case ISO_graphic_plane_1:
3660           if (charset_id_1 < 0)
3661             goto invalid_code;
3662           charset = CHARSET_FROM_ID (charset_id_1);
3663           break;
3664
3665         case ISO_control_0:
3666           if (eol_dos && c1 == '\r')
3667             ONE_MORE_BYTE (byte_after_cr);
3668           MAYBE_FINISH_COMPOSITION ();
3669           charset = CHARSET_FROM_ID (charset_ascii);
3670           break;
3671
3672         case ISO_control_1:
3673           goto invalid_code;
3674
3675         case ISO_shift_out:
3676           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3677               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3678             goto invalid_code;
3679           CODING_ISO_INVOCATION (coding, 0) = 1;
3680           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3681           continue;
3682
3683         case ISO_shift_in:
3684           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3685             goto invalid_code;
3686           CODING_ISO_INVOCATION (coding, 0) = 0;
3687           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3688           continue;
3689
3690         case ISO_single_shift_2_7:
3691           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3692             goto invalid_code;
3693         case ISO_single_shift_2:
3694           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3695             goto invalid_code;
3696           /* SS2 is handled as an escape sequence of ESC 'N' */
3697           c1 = 'N';
3698           goto label_escape_sequence;
3699
3700         case ISO_single_shift_3:
3701           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3702             goto invalid_code;
3703           /* SS2 is handled as an escape sequence of ESC 'O' */
3704           c1 = 'O';
3705           goto label_escape_sequence;
3706
3707         case ISO_control_sequence_introducer:
3708           /* CSI is handled as an escape sequence of ESC '[' ...  */
3709           c1 = '[';
3710           goto label_escape_sequence;
3711
3712         case ISO_escape:
3713           ONE_MORE_BYTE (c1);
3714         label_escape_sequence:
3715           /* Escape sequences handled here are invocation,
3716              designation, direction specification, and character
3717              composition specification.  */
3718           switch (c1)
3719             {
3720             case '&':           /* revision of following character set */
3721               ONE_MORE_BYTE (c1);
3722               if (!(c1 >= '@' && c1 <= '~'))
3723                 goto invalid_code;
3724               ONE_MORE_BYTE (c1);
3725               if (c1 != ISO_CODE_ESC)
3726                 goto invalid_code;
3727               ONE_MORE_BYTE (c1);
3728               goto label_escape_sequence;
3729
3730             case '$':           /* designation of 2-byte character set */
3731               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3732                 goto invalid_code;
3733               {
3734                 int reg, chars96;
3735
3736                 ONE_MORE_BYTE (c1);
3737                 if (c1 >= '@' && c1 <= 'B')
3738                   {     /* designation of JISX0208.1978, GB2312.1980,
3739                            or JISX0208.1980 */
3740                     reg = 0, chars96 = 0;
3741                   }
3742                 else if (c1 >= 0x28 && c1 <= 0x2B)
3743                   { /* designation of DIMENSION2_CHARS94 character set */
3744                     reg = c1 - 0x28, chars96 = 0;
3745                     ONE_MORE_BYTE (c1);
3746                   }
3747                 else if (c1 >= 0x2C && c1 <= 0x2F)
3748                   { /* designation of DIMENSION2_CHARS96 character set */
3749                     reg = c1 - 0x2C, chars96 = 1;
3750                     ONE_MORE_BYTE (c1);
3751                   }
3752                 else
3753                   goto invalid_code;
3754                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3755                 /* We must update these variables now.  */
3756                 if (reg == 0)
3757                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3758                 else if (reg == 1)
3759                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3760                 if (chars96 < 0)
3761                   goto invalid_code;
3762               }
3763               continue;
3764
3765             case 'n':           /* invocation of locking-shift-2 */
3766               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3767                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3768                 goto invalid_code;
3769               CODING_ISO_INVOCATION (coding, 0) = 2;
3770               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3771               continue;
3772
3773             case 'o':           /* invocation of locking-shift-3 */
3774               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3775                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3776                 goto invalid_code;
3777               CODING_ISO_INVOCATION (coding, 0) = 3;
3778               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3779               continue;
3780
3781             case 'N':           /* invocation of single-shift-2 */
3782               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3783                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3784                 goto invalid_code;
3785               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3786               if (charset_id_2 < 0)
3787                 charset = CHARSET_FROM_ID (charset_ascii);
3788               else
3789                 charset = CHARSET_FROM_ID (charset_id_2);
3790               ONE_MORE_BYTE (c1);
3791               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3792                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3793                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3794                           ? c1 >= 0x80 : c1 < 0x80)))
3795                 goto invalid_code;
3796               break;
3797
3798             case 'O':           /* invocation of single-shift-3 */
3799               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3800                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3801                 goto invalid_code;
3802               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3803               if (charset_id_3 < 0)
3804                 charset = CHARSET_FROM_ID (charset_ascii);
3805               else
3806                 charset = CHARSET_FROM_ID (charset_id_3);
3807               ONE_MORE_BYTE (c1);
3808               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3809                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3810                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3811                           ? c1 >= 0x80 : c1 < 0x80)))
3812                 goto invalid_code;
3813               break;
3814
3815             case '0': case '2': case '3': case '4': /* start composition */
3816               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3817                 goto invalid_code;
3818               if (last_id != charset_ascii)
3819                 {
3820                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3821                   last_id = charset_ascii;
3822                   last_offset = char_offset;
3823                 }
3824               DECODE_COMPOSITION_START (c1);
3825               continue;
3826
3827             case '1':           /* end composition */
3828               if (cmp_status->state == COMPOSING_NO)
3829                 goto invalid_code;
3830               DECODE_COMPOSITION_END ();
3831               continue;
3832
3833             case '[':           /* specification of direction */
3834               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3835                 goto invalid_code;
3836               /* For the moment, nested direction is not supported.
3837                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3838                  left-to-right, and nonzero means right-to-left.  */
3839               ONE_MORE_BYTE (c1);
3840               switch (c1)
3841                 {
3842                 case ']':       /* end of the current direction */
3843                   coding->mode &= ~CODING_MODE_DIRECTION;
3844
3845                 case '0':       /* end of the current direction */
3846                 case '1':       /* start of left-to-right direction */
3847                   ONE_MORE_BYTE (c1);
3848                   if (c1 == ']')
3849                     coding->mode &= ~CODING_MODE_DIRECTION;
3850                   else
3851                     goto invalid_code;
3852                   break;
3853
3854                 case '2':       /* start of right-to-left direction */
3855                   ONE_MORE_BYTE (c1);
3856                   if (c1 == ']')
3857                     coding->mode |= CODING_MODE_DIRECTION;
3858                   else
3859                     goto invalid_code;
3860                   break;
3861
3862                 default:
3863                   goto invalid_code;
3864                 }
3865               continue;
3866
3867             case '%':
3868               ONE_MORE_BYTE (c1);
3869               if (c1 == '/')
3870                 {
3871                   /* CTEXT extended segment:
3872                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3873                      We keep these bytes as is for the moment.
3874                      They may be decoded by post-read-conversion.  */
3875                   int dim, M, L;
3876                   int size;
3877
3878                   ONE_MORE_BYTE (dim);
3879                   if (dim < '0' || dim > '4')
3880                     goto invalid_code;
3881                   ONE_MORE_BYTE (M);
3882                   if (M < 128)
3883                     goto invalid_code;
3884                   ONE_MORE_BYTE (L);
3885                   if (L < 128)
3886                     goto invalid_code;
3887                   size = ((M - 128) * 128) + (L - 128);
3888                   if (charbuf + 6 > charbuf_end)
3889                     goto break_loop;
3890                   *charbuf++ = ISO_CODE_ESC;
3891                   *charbuf++ = '%';
3892                   *charbuf++ = '/';
3893                   *charbuf++ = dim;
3894                   *charbuf++ = BYTE8_TO_CHAR (M);
3895                   *charbuf++ = BYTE8_TO_CHAR (L);
3896                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3897                 }
3898               else if (c1 == 'G')
3899                 {
3900                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3901                      ESC % G --UTF-8-BYTES-- ESC % @
3902                      We keep these bytes as is for the moment.
3903                      They may be decoded by post-read-conversion.  */
3904                   if (charbuf + 3 > charbuf_end)
3905                     goto break_loop;
3906                   *charbuf++ = ISO_CODE_ESC;
3907                   *charbuf++ = '%';
3908                   *charbuf++ = 'G';
3909                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3910                 }
3911               else
3912                 goto invalid_code;
3913               continue;
3914               break;
3915
3916             default:
3917               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3918                 goto invalid_code;
3919               {
3920                 int reg, chars96;
3921
3922                 if (c1 >= 0x28 && c1 <= 0x2B)
3923                   { /* designation of DIMENSION1_CHARS94 character set */
3924                     reg = c1 - 0x28, chars96 = 0;
3925                     ONE_MORE_BYTE (c1);
3926                   }
3927                 else if (c1 >= 0x2C && c1 <= 0x2F)
3928                   { /* designation of DIMENSION1_CHARS96 character set */
3929                     reg = c1 - 0x2C, chars96 = 1;
3930                     ONE_MORE_BYTE (c1);
3931                   }
3932                 else
3933                   goto invalid_code;
3934                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3935                 /* We must update these variables now.  */
3936                 if (reg == 0)
3937                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3938                 else if (reg == 1)
3939                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3940                 if (chars96 < 0)
3941                   goto invalid_code;
3942               }
3943               continue;
3944             }
3945           break;
3946
3947         default:
3948           emacs_abort ();
3949         }
3950
3951       if (cmp_status->state == COMPOSING_NO
3952           && charset->id != charset_ascii
3953           && last_id != charset->id)
3954         {
3955           if (last_id != charset_ascii)
3956             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3957           last_id = charset->id;
3958           last_offset = char_offset;
3959         }
3960
3961       /* Now we know CHARSET and 1st position code C1 of a character.
3962          Produce a decoded character while getting 2nd and 3rd
3963          position codes C2, C3 if necessary.  */
3964       if (CHARSET_DIMENSION (charset) > 1)
3965         {
3966           ONE_MORE_BYTE (c2);
3967           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3968               || ((c1 & 0x80) != (c2 & 0x80)))
3969             /* C2 is not in a valid range.  */
3970             goto invalid_code;
3971           if (CHARSET_DIMENSION (charset) == 2)
3972             c1 = (c1 << 8) | c2;
3973           else
3974             {
3975               ONE_MORE_BYTE (c3);
3976               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3977                   || ((c1 & 0x80) != (c3 & 0x80)))
3978                 /* C3 is not in a valid range.  */
3979                 goto invalid_code;
3980               c1 = (c1 << 16) | (c2 << 8) | c2;
3981             }
3982         }
3983       c1 &= 0x7F7F7F;
3984       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3985       if (c < 0)
3986         {
3987           MAYBE_FINISH_COMPOSITION ();
3988           for (; src_base < src; src_base++, char_offset++)
3989             {
3990               if (ASCII_CHAR_P (*src_base))
3991                 *charbuf++ = *src_base;
3992               else
3993                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3994             }
3995         }
3996       else if (cmp_status->state == COMPOSING_NO)
3997         {
3998           *charbuf++ = c;
3999           char_offset++;
4000         }
4001       else if ((cmp_status->state == COMPOSING_CHAR
4002                 ? cmp_status->nchars
4003                 : cmp_status->ncomps)
4004                >= MAX_COMPOSITION_COMPONENTS)
4005         {
4006           /* Too long composition.  */
4007           MAYBE_FINISH_COMPOSITION ();
4008           *charbuf++ = c;
4009           char_offset++;
4010         }
4011       else
4012         STORE_COMPOSITION_CHAR (c);
4013       continue;
4014
4015     invalid_code:
4016       MAYBE_FINISH_COMPOSITION ();
4017       src = src_base;
4018       consumed_chars = consumed_chars_base;
4019       ONE_MORE_BYTE (c);
4020       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
4021       char_offset++;
4022       /* Reset the invocation and designation status to the safest
4023          one; i.e. designate ASCII to the graphic register 0, and
4024          invoke that register to the graphic plane 0.  This typically
4025          helps the case that an designation sequence for ASCII "ESC (
4026          B" is somehow broken (e.g. broken by a newline).  */
4027       CODING_ISO_INVOCATION (coding, 0) = 0;
4028       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4029       charset_id_0 = charset_ascii;
4030       continue;
4031
4032     break_loop:
4033       break;
4034     }
4035
4036  no_more_source:
4037   if (cmp_status->state != COMPOSING_NO)
4038     {
4039       if (coding->mode & CODING_MODE_LAST_BLOCK)
4040         MAYBE_FINISH_COMPOSITION ();
4041       else
4042         {
4043           charbuf -= cmp_status->length;
4044           for (i = 0; i < cmp_status->length; i++)
4045             cmp_status->carryover[i] = charbuf[i];
4046         }
4047     }
4048   else if (last_id != charset_ascii)
4049     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4050   coding->consumed_char += consumed_chars_base;
4051   coding->consumed = src_base - coding->source;
4052   coding->charbuf_used = charbuf - coding->charbuf;
4053 }
4054
4055
4056 /* ISO2022 encoding stuff.  */
4057
4058 /*
4059    It is not enough to say just "ISO2022" on encoding, we have to
4060    specify more details.  In Emacs, each coding system of ISO2022
4061    variant has the following specifications:
4062         1. Initial designation to G0 thru G3.
4063         2. Allows short-form designation?
4064         3. ASCII should be designated to G0 before control characters?
4065         4. ASCII should be designated to G0 at end of line?
4066         5. 7-bit environment or 8-bit environment?
4067         6. Use locking-shift?
4068         7. Use Single-shift?
4069    And the following two are only for Japanese:
4070         8. Use ASCII in place of JIS0201-1976-Roman?
4071         9. Use JISX0208-1983 in place of JISX0208-1978?
4072    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4073    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4074    details.
4075 */
4076
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, then record that designation in CODING.

   The sequence is, in order: an optional revision prefix
   "ESC & <'@' + revision>" (only when CODING has
   CODING_ISO_FLAG_REVISION set and the revision of CHARSET is not
   negative), ESC, '$' when the dimension of CHARSET is not 1, an
   intermediate byte selecting REG (one of "()*+" for a 94-char set,
   one of ",-./" for a 96-char set), and the <final-char> of CHARSET.
   For a multi-dimension 94-char set designated to register 0 whose
   <final-char> is '@', 'A', or 'B', the intermediate byte is left out
   (short form) unless CODING_ISO_FLAG_LONG_FORM is set.

   CHARSET, REG and CODING are each evaluated more than once.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *desig_inter_94 = "()*+";                                \
    const char *desig_inter_96 = ",-./";                                \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_revision                                                  \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    /* Optional revision announcement.  */                              \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        /* Multi-byte charset: '$', then maybe the register byte.  */   \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (desig_inter_96[reg]);                    \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || desig_final < '@' || desig_final > 'B')             \
          EMIT_ONE_ASCII_BYTE (desig_inter_94[reg]);                    \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* Single-byte charset: the register byte is always sent.  */   \
        int desig_inter = (CHARSET_ISO_CHARS_96 (charset)               \
                           ? desig_inter_96[reg]                        \
                           : desig_inter_94[reg]);                      \
        EMIT_ONE_ASCII_BYTE (desig_inter);                              \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4124
4125
/* The following two macros produce the code (a control character, or
   an escape sequence when CODING is flagged as 7-bit) for the ISO2022
   single-shift functions single-shift-2 and single-shift-3, and mark
   CODING as being in the middle of a single shift.  */

/* Single-shift-2: "ESC N" in a 7-bit environment, else the SS2 byte.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4138
4139
/* Single-shift-3: "ESC O" when CODING is flagged as 7-bit, else the
   SS3 byte.  CODING is then marked as being in a single shift.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4148
4149
/* The following four macros produce the code (a control character or
   an escape sequence) for the ISO2022 locking-shift functions
   shift-in, shift-out, locking-shift-2, and locking-shift-3.  Each
   one also records in CODING the number of the graphic register that
   is now invoked to graphic plane 0.  */

/* Shift-in: emit SI; graphic register 0 is now invoked to plane 0.  */
#define ENCODE_SHIFT_IN                                                 \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                \
      CODING_ISO_INVOCATION (coding, 0) = 0;                            \
    }                                                                   \
  while (0)
4159
4160
/* Shift-out: emit SO; graphic register 1 is now invoked to plane 0.  */
#define ENCODE_SHIFT_OUT                                                \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                \
      CODING_ISO_INVOCATION (coding, 0) = 1;                            \
    }                                                                   \
  while (0)
4166
4167
/* Locking-shift-2: emit "ESC n"; graphic register 2 is now invoked to
   plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do                                                                    \
    {                                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                         \
      CODING_ISO_INVOCATION (coding, 0) = 2;                            \
    }                                                                   \
  while (0)
4173
4174
/* Locking-shift-3: emit "ESC o" and record that graphic register 3 is
   now invoked to plane 0.  The final byte must be 'o'; "ESC n" is
   locking-shift-2, so emitting it here would make a reader invoke
   register 2 while CODING believes register 3 is in effect.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4180
4181
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, CHARSET is
   replaced by JISX0201-Roman.  C1 may be any integer expression; it
   is parenthesized at each use, but it may be evaluated more than
   once.

   The macro refers to `coding', `dst' and `produced_chars' of the
   expansion site by name.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code was just produced; this character        \
           completes it.  */                                            \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4224
4225
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is JISX0208, CHARSET is
   replaced by JISX0208-1978.  The macro refers to `coding', `dst' and
   `produced_chars' of the expansion site by name.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* A single-shift code was just produced: this character            \
       completes it.  */                                                \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 0.  */                       \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to graphic plane 1.  */                       \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is not invoked to any graphic plane yet.  Invoke it      \
       (designating it to some graphic register first if needed),       \
       then go around the loop again to produce the character.  */      \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4268
4269
/* Encode character C as a member of CHARSET.  The code point of C in
   CHARSET is obtained with CODING_ENCODE_CHAR; a CHARSET of dimension
   1 is emitted from that code as is, any other CHARSET from its
   second-lowest and lowest bytes.  CHARSET is evaluated more than
   once.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    unsigned iso_code;                                                     \
                                                                           \
    CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), iso_code);   \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), iso_code >> 8,           \
                                       iso_code & 0xFF);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);               \
  } while (0)
4280
4281
4282 /* Produce designation and invocation codes at a place pointed by DST
4283    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4284    Return new DST.  */
4285
4286 static unsigned char *
4287 encode_invocation_designation (struct charset *charset,
4288                                struct coding_system *coding,
4289                                unsigned char *dst, ptrdiff_t *p_nchars)
4290 {
4291   bool multibytep = coding->dst_multibyte;
4292   ptrdiff_t produced_chars = *p_nchars;
4293   int reg;                      /* graphic register number */
4294   int id = CHARSET_ID (charset);
4295
4296   /* At first, check designations.  */
4297   for (reg = 0; reg < 4; reg++)
4298     if (id == CODING_ISO_DESIGNATION (coding, reg))
4299       break;
4300
4301   if (reg >= 4)
4302     {
4303       /* CHARSET is not yet designated to any graphic registers.  */
4304       /* At first check the requested designation.  */
4305       reg = CODING_ISO_REQUEST (coding, id);
4306       if (reg < 0)
4307         /* Since CHARSET requests no special designation, designate it
4308            to graphic register 0.  */
4309         reg = 0;
4310
4311       ENCODE_DESIGNATION (charset, reg, coding);
4312     }
4313
4314   if (CODING_ISO_INVOCATION (coding, 0) != reg
4315       && CODING_ISO_INVOCATION (coding, 1) != reg)
4316     {
4317       /* Since the graphic register REG is not invoked to any graphic
4318          planes, invoke it to graphic plane 0.  */
4319       switch (reg)
4320         {
4321         case 0:                 /* graphic register 0 */
4322           ENCODE_SHIFT_IN;
4323           break;
4324
4325         case 1:                 /* graphic register 1 */
4326           ENCODE_SHIFT_OUT;
4327           break;
4328
4329         case 2:                 /* graphic register 2 */
4330           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4331             ENCODE_SINGLE_SHIFT_2;
4332           else
4333             ENCODE_LOCKING_SHIFT_2;
4334           break;
4335
4336         case 3:                 /* graphic register 3 */
4337           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4338             ENCODE_SINGLE_SHIFT_3;
4339           else
4340             ENCODE_LOCKING_SHIFT_3;
4341           break;
4342         }
4343     }
4344
4345   *p_nchars = produced_chars;
4346   return dst;
4347 }
4348
4349
/* Produce the codes that bring the graphic planes and registers back
   to their initial state: a shift-in if graphic plane 0 does not have
   register 0 invoked, then a designation for every register whose
   current designation differs from its (non-negative) initial one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
                                                                        \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        struct charset *reset_charset;                                  \
                                                                        \
        /* Nothing to do for a register with no initial charset, or     \
           one that still holds its initial charset.  */                \
        if (CODING_ISO_INITIAL (coding, reset_reg) < 0                  \
            || (CODING_ISO_DESIGNATION (coding, reset_reg)              \
                == CODING_ISO_INITIAL (coding, reset_reg)))             \
          continue;                                                     \
                                                                        \
        reset_charset                                                   \
          = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reset_reg));   \
        ENCODE_DESIGNATION (reset_charset, reset_reg, coding);          \
      }                                                                 \
  } while (0)
4368
4369
4370 /* Produce designation sequences of charsets in the line started from
4371    CHARBUF to a place pointed by DST, and return the number of
4372    produced bytes.  DST should not directly point a buffer text area
4373    which may be relocated by char_charset call.
4374
4375    If the current block ends before any end-of-line, we may fail to
4376    find all the necessary designations.  */
4377
4378 static ptrdiff_t
4379 encode_designation_at_bol (struct coding_system *coding,
4380                            int *charbuf, int *charbuf_end,
4381                            unsigned char *dst)
4382 {
4383   unsigned char *orig = dst;
4384   struct charset *charset;
4385   /* Table of charsets to be designated to each graphic register.  */
4386   int r[4];
4387   int c, found = 0, reg;
4388   ptrdiff_t produced_chars = 0;
4389   bool multibytep = coding->dst_multibyte;
4390   Lisp_Object attrs;
4391   Lisp_Object charset_list;
4392
4393   attrs = CODING_ID_ATTRS (coding->id);
4394   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4395   if (EQ (charset_list, Qiso_2022))
4396     charset_list = Viso_2022_charset_list;
4397
4398   for (reg = 0; reg < 4; reg++)
4399     r[reg] = -1;
4400
4401   while (charbuf < charbuf_end && found < 4)
4402     {
4403       int id;
4404
4405       c = *charbuf++;
4406       if (c == '\n')
4407         break;
4408       charset = char_charset (c, charset_list, NULL);
4409       id = CHARSET_ID (charset);
4410       reg = CODING_ISO_REQUEST (coding, id);
4411       if (reg >= 0 && r[reg] < 0)
4412         {
4413           found++;
4414           r[reg] = id;
4415         }
4416     }
4417
4418   if (found)
4419     {
4420       for (reg = 0; reg < 4; reg++)
4421         if (r[reg] >= 0
4422             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4423           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4424     }
4425
4426   return dst - orig;
4427 }
4428
4429 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4430
4431 static bool
4432 encode_coding_iso_2022 (struct coding_system *coding)
4433 {
4434   bool multibytep = coding->dst_multibyte;
4435   int *charbuf = coding->charbuf;
4436   int *charbuf_end = charbuf + coding->charbuf_used;
4437   unsigned char *dst = coding->destination + coding->produced;
4438   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4439   int safe_room = 16;
4440   bool bol_designation
4441     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4442        && CODING_ISO_BOL (coding));
4443   ptrdiff_t produced_chars = 0;
4444   Lisp_Object attrs, eol_type, charset_list;
4445   bool ascii_compatible;
4446   int c;
4447   int preferred_charset_id = -1;
4448
4449   CODING_GET_INFO (coding, attrs, charset_list);
4450   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4451   if (VECTORP (eol_type))
4452     eol_type = Qunix;
4453
4454   setup_iso_safe_charsets (attrs);
4455   /* Charset list may have been changed.  */
4456   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4457   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4458
4459   ascii_compatible
4460     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4461        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4462                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4463
4464   while (charbuf < charbuf_end)
4465     {
4466       ASSURE_DESTINATION (safe_room);
4467
4468       if (bol_designation)
4469         {
4470           /* We have to produce designation sequences if any now.  */
4471           unsigned char desig_buf[16];
4472           ptrdiff_t nbytes;
4473           ptrdiff_t offset;
4474
4475           charset_map_loaded = 0;
4476           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4477                                               desig_buf);
4478           if (charset_map_loaded
4479               && (offset = coding_change_destination (coding)))
4480             {
4481               dst += offset;
4482               dst_end += offset;
4483             }
4484           memcpy (dst, desig_buf, nbytes);
4485           dst += nbytes;
4486           /* We are sure that designation sequences are all ASCII bytes.  */
4487           produced_chars += nbytes;
4488           bol_designation = 0;
4489           ASSURE_DESTINATION (safe_room);
4490         }
4491
4492       c = *charbuf++;
4493
4494       if (c < 0)
4495         {
4496           /* Handle an annotation.  */
4497           switch (*charbuf)
4498             {
4499             case CODING_ANNOTATE_COMPOSITION_MASK:
4500               /* Not yet implemented.  */
4501               break;
4502             case CODING_ANNOTATE_CHARSET_MASK:
4503               preferred_charset_id = charbuf[2];
4504               if (preferred_charset_id >= 0
4505                   && NILP (Fmemq (make_number (preferred_charset_id),
4506                                   charset_list)))
4507                 preferred_charset_id = -1;
4508               break;
4509             default:
4510               emacs_abort ();
4511             }
4512           charbuf += -c - 1;
4513           continue;
4514         }
4515
4516       /* Now encode the character C.  */
4517       if (c < 0x20 || c == 0x7F)
4518         {
4519           if (c == '\n'
4520               || (c == '\r' && EQ (eol_type, Qmac)))
4521             {
4522               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4523                 ENCODE_RESET_PLANE_AND_REGISTER ();
4524               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4525                 {
4526                   int i;
4527
4528                   for (i = 0; i < 4; i++)
4529                     CODING_ISO_DESIGNATION (coding, i)
4530                       = CODING_ISO_INITIAL (coding, i);
4531                 }
4532               bol_designation = ((CODING_ISO_FLAGS (coding)
4533                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4534                                  != 0);
4535             }
4536           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4537             ENCODE_RESET_PLANE_AND_REGISTER ();
4538           EMIT_ONE_ASCII_BYTE (c);
4539         }
4540       else if (ASCII_CHAR_P (c))
4541         {
4542           if (ascii_compatible)
4543             EMIT_ONE_ASCII_BYTE (c);
4544           else
4545             {
4546               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4547               ENCODE_ISO_CHARACTER (charset, c);
4548             }
4549         }
4550       else if (CHAR_BYTE8_P (c))
4551         {
4552           c = CHAR_TO_BYTE8 (c);
4553           EMIT_ONE_BYTE (c);
4554         }
4555       else
4556         {
4557           struct charset *charset;
4558
4559           if (preferred_charset_id >= 0)
4560             {
4561               bool result;
4562
4563               charset = CHARSET_FROM_ID (preferred_charset_id);
4564               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4565               if (! result)
4566                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4567                                      NULL, charset);
4568             }
4569           else
4570             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4571                                  NULL, charset);
4572           if (!charset)
4573             {
4574               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4575                 {
4576                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4577                   charset = CHARSET_FROM_ID (charset_ascii);
4578                 }
4579               else
4580                 {
4581                   c = coding->default_char;
4582                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4583                                        charset_list, NULL, charset);
4584                 }
4585             }
4586           ENCODE_ISO_CHARACTER (charset, c);
4587         }
4588     }
4589
4590   if (coding->mode & CODING_MODE_LAST_BLOCK
4591       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4592     {
4593       ASSURE_DESTINATION (safe_room);
4594       ENCODE_RESET_PLANE_AND_REGISTER ();
4595     }
4596   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4597   CODING_ISO_BOL (coding) = bol_designation;
4598   coding->produced_char += produced_chars;
4599   coding->produced = dst - coding->destination;
4600   return 0;
4601 }
4602
4603 \f
4604 /*** 8. SJIS and BIG5 handlers ***/
4605
4606 /* Although SJIS and BIG5 are not ISO coding systems, they are used
4607    quite widely.  So, for the moment, Emacs supports them directly in
4608    C code.  But, in the future, they may be supported only by CCL.  */
4609
4610 /* SJIS is a coding system encoding three character sets: ASCII, the
4611    right half of JISX0201-Kana, and JISX0208.  An ASCII character is
4612    encoded as is.  A character of charset katakana-jisx0201 is encoded
4613    as "position-code + 0x80".  A character of charset japanese-jisx0208
4614    is encoded in two bytes, but the two position-codes are divided and
4615    shifted so that they fit in the ranges below.
4616
4617    --- CODE RANGE of SJIS ---
4618    (character set)      (range)
4619    ASCII                0x00 .. 0x7F
4620    KATAKANA-JISX0201    0xA0 .. 0xDF
4621    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4622             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4623    -------------------------------
4624
4625 */
4626
4627 /* BIG5 is a coding system encoding two character sets: ASCII and
4628    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4629    character set and is encoded in two bytes.
4630
4631    --- CODE RANGE of BIG5 ---
4632    (character set)      (range)
4633    ASCII                0x00 .. 0x7F
4634    Big5 (1st byte)      0xA1 .. 0xFE
4635         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4636    --------------------------
4637
4638   */
4639
4640 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4641    Return true if a text is encoded in SJIS.

        More precisely: scan the source bytes of CODING and record the
        verdict in DETECT_INFO.  CATEGORY_MASK_SJIS is always added to
        DETECT_INFO->checked.  If a byte that cannot start or continue an
        SJIS sequence is met, the mask is added to DETECT_INFO->rejected
        and false is returned.  If the source runs out first, true is
        returned and DETECT_INFO->found receives the mask when at least
        one non-ASCII SJIS sequence was seen.  The exception is a source
        that stops after some bytes of the current sequence were already
        read while CODING_MODE_LAST_BLOCK is set; that case is rejected
        and false is returned.  */
4642
4643 static bool
4644 detect_coding_sjis (struct coding_system *coding,
4645                     struct coding_detection_info *detect_info)
4646 {
4647   const unsigned char *src = coding->source, *src_base;
4648   const unsigned char *src_end = coding->source + coding->src_bytes;
4649   bool multibytep = coding->src_multibyte;
4650   ptrdiff_t consumed_chars = 0;
4651   int found = 0;
4652   int c;
4653   Lisp_Object attrs, charset_list;
4654   int max_first_byte_of_2_byte_code;
4655
4656   CODING_GET_INFO (coding, attrs, charset_list);
       /* A two-byte code may start with a byte up to 0xFC when the
          charset list has more than three entries; otherwise the upper
          limit is 0xEF.  */
4657   max_first_byte_of_2_byte_code
4658     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4659
4660   detect_info->checked |= CATEGORY_MASK_SJIS;
4661   /* A coding system of this category is always ASCII compatible.  */
4662   src += coding->head_ascii;
4663
4664   while (1)
4665     {
           /* Remember where the current sequence starts; the code at
              `no_more_source' compares it with SRC.  */
4666       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined outside this view.
              It is the only thing here that can reach `no_more_source',
              presumably when SRC reaches SRC_END -- confirm.  */
4667       ONE_MORE_BYTE (c);
           /* Values below 0x80 tell nothing about SJIS; skip them.  */
4668       if (c < 0x80)
4669         continue;
           /* First byte of a two-byte code: 0x81..0x9F or 0xE0..limit.  */
4670       if ((c >= 0x81 && c <= 0x9F)
4671           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4672         {
4673           ONE_MORE_BYTE (c);
               /* The second byte must be in 0x40..0xFC and not 0x7F.  */
4674           if (c < 0x40 || c == 0x7F || c > 0xFC)
4675             break;
4676           found = CATEGORY_MASK_SJIS;
4677         }
           /* A single byte in 0xA0..0xDF is accepted as well.  */
4678       else if (c >= 0xA0 && c < 0xE0)
4679         found = CATEGORY_MASK_SJIS;
4680       else
4681         break;
4682     }
       /* The loop is left with `break' only on an invalid byte.  */
4683   detect_info->rejected |= CATEGORY_MASK_SJIS;
4684   return 0;
4685
4686  no_more_source:
       /* SRC_BASE < SRC means part of a sequence was read before the
          source ran out; in the last block that is an error.  */
4687   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4688     {
4689       detect_info->rejected |= CATEGORY_MASK_SJIS;
4690       return 0;
4691     }
4692   detect_info->found |= found;
4693   return 1;
4694 }
4695
4696 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4697    Return true if a text is encoded in BIG5.

        More precisely: scan the source bytes of CODING and record the
        verdict in DETECT_INFO.  CATEGORY_MASK_BIG5 is always added to
        DETECT_INFO->checked.  A byte in 0x80..0xA0 in first-byte
        position adds the mask to DETECT_INFO->rejected and makes the
        function return false.  If the source runs out first, true is
        returned and DETECT_INFO->found receives the mask when at least
        one two-byte sequence was seen, unless the source stops after
        some bytes of the current sequence were read while
        CODING_MODE_LAST_BLOCK is set, which is rejected.

        NOTE(review): an invalid second byte returns false WITHOUT
        adding the mask to DETECT_INFO->rejected, unlike the parallel
        case in detect_coding_sjis.  Confirm whether this is
        intentional.  */
4698
4699 static bool
4700 detect_coding_big5 (struct coding_system *coding,
4701                     struct coding_detection_info *detect_info)
4702 {
4703   const unsigned char *src = coding->source, *src_base;
4704   const unsigned char *src_end = coding->source + coding->src_bytes;
4705   bool multibytep = coding->src_multibyte;
4706   ptrdiff_t consumed_chars = 0;
4707   int found = 0;
4708   int c;
4709
4710   detect_info->checked |= CATEGORY_MASK_BIG5;
4711   /* A coding system of this category is always ASCII compatible.  */
4712   src += coding->head_ascii;
4713
4714   while (1)
4715     {
           /* Remember where the current sequence starts; the code at
              `no_more_source' compares it with SRC.  */
4716       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined outside this view.
              It is the only thing here that can reach `no_more_source',
              presumably when SRC reaches SRC_END -- confirm.  */
4717       ONE_MORE_BYTE (c);
4718       if (c < 0x80)
4719         continue;
           /* Any value from 0xA1 up is taken as the first byte of a
              two-byte code.  */
4720       if (c >= 0xA1)
4721         {
4722           ONE_MORE_BYTE (c);
               /* The second byte may be neither below 0x40 nor in
                  0x7F..0xA0.  This exit does not touch
                  DETECT_INFO->rejected.  */
4723           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4724             return 0;
4725           found = CATEGORY_MASK_BIG5;
4726         }
4727       else
4728         break;
4729     }
4730   detect_info->rejected |= CATEGORY_MASK_BIG5;
4731   return 0;
4732
4733  no_more_source:
       /* SRC_BASE < SRC means part of a sequence was read before the
          source ran out; in the last block that is an error.  */
4734   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4735     {
4736       detect_info->rejected |= CATEGORY_MASK_BIG5;
4737       return 0;
4738     }
4739   detect_info->found |= found;
4740   return 1;
4741 }
4742
4743 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Decode the SJIS bytes of CODING, starting at CODING->source +
        CODING->consumed, into character codes appended to
        CODING->charbuf.  The charset list of CODING is read positionally
        as (ROMAN KANA KANJI [KANJI2]); the fourth entry is optional and
        enables first bytes 0xF0..0xFC.  A byte that does not start a
        valid sequence is stored on its own through BYTE8_TO_CHAR.  A run
        of characters of one non-ASCII charset is recorded with
        ADD_CHARSET_DATA.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  The consumed counts are taken
        from the start of the sequence being processed when the loop
        stopped, so a partly read sequence stays unconsumed.  */
4744
4745 static void
4746 decode_coding_sjis (struct coding_system *coding)
4747 {
4748   const unsigned char *src = coding->source + coding->consumed;
4749   const unsigned char *src_end = coding->source + coding->src_bytes;
4750   const unsigned char *src_base;
4751   int *charbuf = coding->charbuf + coding->charbuf_used;
4752   /* We may produce one charset annotation in one loop and one more at
4753      the end.  */
4754   int *charbuf_end
4755     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4756   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4757   bool multibytep = coding->src_multibyte;
4758   struct charset *charset_roman, *charset_kanji, *charset_kana;
4759   struct charset *charset_kanji2;
4760   Lisp_Object attrs, charset_list, val;
4761   ptrdiff_t char_offset = coding->produced_char;
4762   ptrdiff_t last_offset = char_offset;
4763   int last_id = charset_ascii;
4764   bool eol_dos
4765     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Holds the byte read ahead after a CR (DOS end-of-line only);
          -1 when there is none.  */
4766   int byte_after_cr = -1;
4767
4768   CODING_GET_INFO (coding, attrs, charset_list);
4769
       /* Take the charsets from the list in order; the fourth one may be
          absent.  */
4770   val = charset_list;
4771   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4772   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4773   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4774   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4775
4776   while (1)
4777     {
4778       int c, c1;
4779       struct charset *charset;
4780
           /* Checkpoint: everything before this point is fully decoded.  */
4781       src_base = src;
4782       consumed_chars_base = consumed_chars;
4783
4784       if (charbuf >= charbuf_end)
4785         {
               /* The byte read ahead after a CR has not been decoded yet;
                  step the checkpoint back so it is not counted as
                  consumed.  */
4786           if (byte_after_cr >= 0)
4787             src_base--;
4788           break;
4789         }
4790
           /* Use the byte read ahead after a CR, if any, before reading
              a new one.  NOTE(review): ONE_MORE_BYTE is defined outside
              this view; it presumably jumps to `no_more_source' when the
              source is exhausted -- confirm.  */
4791       if (byte_after_cr >= 0)
4792         c = byte_after_cr, byte_after_cr = -1;
4793       else
4794         ONE_MORE_BYTE (c);
4795       if (c < 0)
4796         goto invalid_code;
4797       if (c < 0x80)
4798         {
               /* With DOS end-of-line, read the byte that follows a CR
                  right away and keep it for the next iteration.  */
4799           if (eol_dos && c == '\r')
4800             ONE_MORE_BYTE (byte_after_cr);
4801           charset = charset_roman;
4802         }
4803       else if (c == 0x80 || c == 0xA0)
4804         goto invalid_code;
4805       else if (c >= 0xA1 && c <= 0xDF)
4806         {
4807           /* SJIS -> JISX0201-Kana */
4808           c &= 0x7F;
4809           charset = charset_kana;
4810         }
           /* Reached only for first bytes 0x81..0x9F and 0xE0..0xEF.  */
4811       else if (c <= 0xEF)
4812         {
4813           /* SJIS -> JISX0208 */
4814           ONE_MORE_BYTE (c1);
4815           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4816             goto invalid_code;
4817           c = (c << 8) | c1;
4818           SJIS_TO_JIS (c);
4819           charset = charset_kanji;
4820         }
           /* First bytes 0xF0..0xFC, valid only when a fourth charset
              was given.  */
4821       else if (c <= 0xFC && charset_kanji2)
4822         {
4823           /* SJIS -> JISX0213-2 */
4824           ONE_MORE_BYTE (c1);
4825           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4826             goto invalid_code;
4827           c = (c << 8) | c1;
4828           SJIS_TO_JIS2 (c);
4829           charset = charset_kanji2;
4830         }
4831       else
4832         goto invalid_code;
           /* A non-ASCII charset different from the last recorded one
              starts a new run: close the previous run (unless it was
              ASCII) and remember where the new one begins.  ASCII
              characters never end a run.  */
4833       if (charset->id != charset_ascii
4834           && last_id != charset->id)
4835         {
4836           if (last_id != charset_ascii)
4837             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4838           last_id = charset->id;
4839           last_offset = char_offset;
4840         }
           /* Turn the code point C of CHARSET into a character, stored
              back into C.  NOTE(review): the macro also receives the
              source pointers, presumably to adjust them if the source is
              relocated -- confirm.  */
4841       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4842       *charbuf++ = c;
4843       char_offset++;
4844       continue;
4845
4846     invalid_code:
           /* Go back to the checkpoint and emit just the first byte of
              the bad sequence; the following bytes are decoded afresh.
              A negative value is stored negated, any other value as an
              eight-bit character.  */
4847       src = src_base;
4848       consumed_chars = consumed_chars_base;
4849       ONE_MORE_BYTE (c);
4850       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4851       char_offset++;
4852     }
4853
4854  no_more_source:
       /* Close the charset run still open, then publish the progress.  */
4855   if (last_id != charset_ascii)
4856     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4857   coding->consumed_char += consumed_chars_base;
4858   coding->consumed = src_base - coding->source;
4859   coding->charbuf_used = charbuf - coding->charbuf;
4860 }
4861
     /* Decode the BIG5 bytes of CODING, starting at CODING->source +
        CODING->consumed, into character codes appended to
        CODING->charbuf.  The charset list of CODING is read positionally
        as (ROMAN BIG5).  A two-byte code needs a first byte in
        0xA1..0xFE and a second byte in 0x40..0x7E or 0xA1..0xFE.  A byte
        that does not start a valid sequence is stored on its own through
        BYTE8_TO_CHAR.  A run of characters of one non-ASCII charset is
        recorded with ADD_CHARSET_DATA.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used are updated.  The consumed counts are taken
        from the start of the sequence being processed when the loop
        stopped, so a partly read sequence stays unconsumed.  */
4862 static void
4863 decode_coding_big5 (struct coding_system *coding)
4864 {
4865   const unsigned char *src = coding->source + coding->consumed;
4866   const unsigned char *src_end = coding->source + coding->src_bytes;
4867   const unsigned char *src_base;
4868   int *charbuf = coding->charbuf + coding->charbuf_used;
4869   /* We may produce one charset annotation in one loop and one more at
4870      the end.  */
4871   int *charbuf_end
4872     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4873   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4874   bool multibytep = coding->src_multibyte;
4875   struct charset *charset_roman, *charset_big5;
4876   Lisp_Object attrs, charset_list, val;
4877   ptrdiff_t char_offset = coding->produced_char;
4878   ptrdiff_t last_offset = char_offset;
4879   int last_id = charset_ascii;
4880   bool eol_dos
4881     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Holds the byte read ahead after a CR (DOS end-of-line only);
          -1 when there is none.  */
4882   int byte_after_cr = -1;
4883
4884   CODING_GET_INFO (coding, attrs, charset_list);
       /* First list entry: charset for bytes below 0x80; second entry:
          charset for two-byte codes.  */
4885   val = charset_list;
4886   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4887   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4888
4889   while (1)
4890     {
4891       int c, c1;
4892       struct charset *charset;
4893
           /* Checkpoint: everything before this point is fully decoded.  */
4894       src_base = src;
4895       consumed_chars_base = consumed_chars;
4896
4897       if (charbuf >= charbuf_end)
4898         {
               /* The byte read ahead after a CR has not been decoded yet;
                  step the checkpoint back so it is not counted as
                  consumed.  */
4899           if (byte_after_cr >= 0)
4900             src_base--;
4901           break;
4902         }
4903
           /* Use the byte read ahead after a CR, if any, before reading
              a new one.  NOTE(review): ONE_MORE_BYTE is defined outside
              this view; it presumably jumps to `no_more_source' when the
              source is exhausted -- confirm.  */
4904       if (byte_after_cr >= 0)
4905         c = byte_after_cr, byte_after_cr = -1;
4906       else
4907         ONE_MORE_BYTE (c);
4908
4909       if (c < 0)
4910         goto invalid_code;
4911       if (c < 0x80)
4912         {
               /* With DOS end-of-line, read the byte that follows a CR
                  right away and keep it for the next iteration.  */
4913           if (eol_dos && c == '\r')
4914             ONE_MORE_BYTE (byte_after_cr);
4915           charset = charset_roman;
4916         }
4917       else
4918         {
4919           /* BIG5 -> Big5 */
4920           if (c < 0xA1 || c > 0xFE)
4921             goto invalid_code;
4922           ONE_MORE_BYTE (c1);
4923           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4924             goto invalid_code;
               /* The code point is the two bytes joined, first byte
                  high.  */
4925           c = c << 8 | c1;
4926           charset = charset_big5;
4927         }
           /* A non-ASCII charset different from the last recorded one
              starts a new run: close the previous run (unless it was
              ASCII) and remember where the new one begins.  */
4928       if (charset->id != charset_ascii
4929           && last_id != charset->id)
4930         {
4931           if (last_id != charset_ascii)
4932             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4933           last_id = charset->id;
4934           last_offset = char_offset;
4935         }
           /* Turn the code point C of CHARSET into a character, stored
              back into C.  */
4936       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4937       *charbuf++ = c;
4938       char_offset++;
4939       continue;
4940
4941     invalid_code:
           /* Go back to the checkpoint and emit just the first byte of
              the bad sequence; the following bytes are decoded afresh.  */
4942       src = src_base;
4943       consumed_chars = consumed_chars_base;
4944       ONE_MORE_BYTE (c);
4945       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4946       char_offset++;
4947     }
4948
4949  no_more_source:
       /* Close the charset run still open, then publish the progress.  */
4950   if (last_id != charset_ascii)
4951     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4952   coding->consumed_char += consumed_chars_base;
4953   coding->consumed = src_base - coding->source;
4954   coding->charbuf_used = charbuf - coding->charbuf;
4955 }
4956
4957 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4958    This function can encode charsets `ascii', `katakana-jisx0201',
4959    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4960    are sure that all these charsets are registered as official charset
4961    (i.e. do not have extended leading-codes).  Characters of other
4962    charsets are produced without any encoding.

        NOTE(review): the code below does not name any of these
        charsets.  It takes the kana, kanji and optional second kanji
        charsets from the second, third and fourth entries of the charset
        list of CODING; the Big5 names above look out of place for this
        function -- confirm.

        Encode the characters in CODING->charbuf into SJIS bytes appended
        to CODING->destination, then record CODING_RESULT_SUCCESS and
        update CODING->produced and CODING->produced_char.  The value
        returned after the loop is always false (0).  */
4963
4964 static bool
4965 encode_coding_sjis (struct coding_system *coding)
4966 {
4967   bool multibytep = coding->dst_multibyte;
4968   int *charbuf = coding->charbuf;
4969   int *charbuf_end = charbuf + coding->charbuf_used;
4970   unsigned char *dst = coding->destination + coding->produced;
4971   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each character.
          NOTE(review): presumably the number of destination bytes to
          keep available -- confirm against the macro.  */
4972   int safe_room = 4;
4973   ptrdiff_t produced_chars = 0;
4974   Lisp_Object attrs, charset_list, val;
4975   bool ascii_compatible;
4976   struct charset *charset_kanji, *charset_kana;
4977   struct charset *charset_kanji2;
4978   int c;
4979
4980   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first list entry; the next ones are kana, kanji and,
          optionally, a second kanji charset.  */
4981   val = XCDR (charset_list);
4982   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4983   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4984   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4985
4986   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4987
4988   while (charbuf < charbuf_end)
4989     {
4990       ASSURE_DESTINATION (safe_room);
4991       c = *charbuf++;
4992       /* Now encode the character C.  */
4993       if (ASCII_CHAR_P (c) && ascii_compatible)
4994         EMIT_ONE_ASCII_BYTE (c);
4995       else if (CHAR_BYTE8_P (c))
4996         {
               /* An eight-bit character is written back as its raw
                  byte.  */
4997           c = CHAR_TO_BYTE8 (c);
4998           EMIT_ONE_BYTE (c);
4999         }
5000       else
5001         {
5002           unsigned code;
5003           struct charset *charset;
               /* Look up a charset of the list that contains C, and C's
                  code point in it.  */
5004           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5005                                &code, charset);
5006
5007           if (!charset)
5008             {
                   /* C is in none of the charsets.  In safe-encoding mode
                      emit CODING_INHIBIT_CHARACTER_SUBSTITUTION via the
                      ASCII charset; otherwise encode the default
                      character of CODING instead.  */
5009               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5010                 {
5011                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5012                   charset = CHARSET_FROM_ID (charset_ascii);
5013                 }
5014               else
5015                 {
5016                   c = coding->default_char;
5017                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5018                                        charset_list, &code, charset);
5019                 }
5020             }
5021           if (code == CHARSET_INVALID_CODE (charset))
5022             emacs_abort ();
5023           if (charset == charset_kanji)
5024             {
                   /* Convert the code with JIS_TO_SJIS and write it high
                      byte first.  */
5025               int c1, c2;
5026               JIS_TO_SJIS (code);
5027               c1 = code >> 8, c2 = code & 0xFF;
5028               EMIT_TWO_BYTES (c1, c2);
5029             }
5030           else if (charset == charset_kana)
5031             EMIT_ONE_BYTE (code | 0x80);
5032           else if (charset_kanji2 && charset == charset_kanji2)
5033             {
5034               int c1, c2;
5035
                   /* Only codes whose high byte is 0x21, 0x23..0x25,
                      0x28, 0x2C..0x2F, or 0x6E and above are converted
                      with JIS_TO_SJIS2; any other code is reduced to a
                      single 7-bit byte.  */
5036               c1 = code >> 8;
5037               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5038                   || c1 == 0x28
5039                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5040                 {
5041                   JIS_TO_SJIS2 (code);
5042                   c1 = code >> 8, c2 = code & 0xFF;
5043                   EMIT_TWO_BYTES (c1, c2);
5044                 }
5045               else
5046                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5047             }
5048           else
                 /* Any other charset: emit the low seven bits of the
                    code as one byte.  */
5049             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5050         }
5051     }
5052   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5053   coding->produced_char += produced_chars;
5054   coding->produced = dst - coding->destination;
5055   return 0;
5056 }
5057
     /* Encode the characters in CODING->charbuf into BIG5 bytes appended
        to CODING->destination.  The two-byte charset is the second entry
        of the charset list of CODING.  Afterwards record
        CODING_RESULT_SUCCESS and update CODING->produced and
        CODING->produced_char.  The value returned after the loop is
        always false (0).  */
5058 static bool
5059 encode_coding_big5 (struct coding_system *coding)
5060 {
5061   bool multibytep = coding->dst_multibyte;
5062   int *charbuf = coding->charbuf;
5063   int *charbuf_end = charbuf + coding->charbuf_used;
5064   unsigned char *dst = coding->destination + coding->produced;
5065   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each character.
          NOTE(review): presumably the number of destination bytes to
          keep available -- confirm against the macro.  */
5066   int safe_room = 4;
5067   ptrdiff_t produced_chars = 0;
5068   Lisp_Object attrs, charset_list, val;
5069   bool ascii_compatible;
5070   struct charset *charset_big5;
5071   int c;
5072
5073   CODING_GET_INFO (coding, attrs, charset_list);
       /* Skip the first list entry; the second one is the two-byte
          charset.  */
5074   val = XCDR (charset_list);
5075   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5076   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5077
5078   while (charbuf < charbuf_end)
5079     {
5080       ASSURE_DESTINATION (safe_room);
5081       c = *charbuf++;
5082       /* Now encode the character C.  */
5083       if (ASCII_CHAR_P (c) && ascii_compatible)
5084         EMIT_ONE_ASCII_BYTE (c);
5085       else if (CHAR_BYTE8_P (c))
5086         {
               /* An eight-bit character is written back as its raw
                  byte.  */
5087           c = CHAR_TO_BYTE8 (c);
5088           EMIT_ONE_BYTE (c);
5089         }
5090       else
5091         {
5092           unsigned code;
5093           struct charset *charset;
               /* Look up a charset of the list that contains C, and C's
                  code point in it.  */
5094           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5095                                &code, charset);
5096
5097           if (! charset)
5098             {
                   /* C is in none of the charsets.  In safe-encoding mode
                      emit CODING_INHIBIT_CHARACTER_SUBSTITUTION via the
                      ASCII charset; otherwise encode the default
                      character of CODING instead.  */
5099               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5100                 {
5101                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5102                   charset = CHARSET_FROM_ID (charset_ascii);
5103                 }
5104               else
5105                 {
5106                   c = coding->default_char;
5107                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5108                                        charset_list, &code, charset);
5109                 }
5110             }
5111           if (code == CHARSET_INVALID_CODE (charset))
5112             emacs_abort ();
5113           if (charset == charset_big5)
5114             {
5115               int c1, c2;
5116
                   /* Write the code point as is, high byte first.  */
5117               c1 = code >> 8, c2 = code & 0xFF;
5118               EMIT_TWO_BYTES (c1, c2);
5119             }
5120           else
                 /* Any other charset: emit the low seven bits of the
                    code as one byte.  */
5121             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5122         }
5123     }
5124   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5125   coding->produced_char += produced_chars;
5126   coding->produced = dst - coding->destination;
5127   return 0;
5128 }
5129
5130 \f
5131 /*** 9. CCL handlers ***/
5132
5133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5134    Return true if a text is encoded in a coding system of which
5135    encoder/decoder are written in CCL program.

        The bytes scanned come from the CODING passed in, but the table
        of valid bytes and the ASCII-compatibility attribute are taken
        from the coding system currently registered for
        coding_category_ccl.  CATEGORY_MASK_CCL is always added to
        DETECT_INFO->checked.  A negative value or a byte whose table
        entry is zero adds the mask to DETECT_INFO->rejected and makes
        the function return false.  If the source runs out first, true is
        returned and DETECT_INFO->found receives the mask when some byte
        had a table entry greater than 1.  */
5136
5137 static bool
5138 detect_coding_ccl (struct coding_system *coding,
5139                    struct coding_detection_info *detect_info)
5140 {
5141   const unsigned char *src = coding->source, *src_base;
5142   const unsigned char *src_end = coding->source + coding->src_bytes;
5143   bool multibytep = coding->src_multibyte;
5144   ptrdiff_t consumed_chars = 0;
5145   int found = 0;
5146   unsigned char *valids;
5147   ptrdiff_t head_ascii = coding->head_ascii;
5148   Lisp_Object attrs;
5149
5150   detect_info->checked |= CATEGORY_MASK_CCL;
5151
       /* From here on CODING refers to the CCL category's coding system;
          everything needed from the caller's CODING was copied into
          locals above.  */
5152   coding = &coding_categories[coding_category_ccl];
5153   valids = CODING_CCL_VALIDS (coding);
5154   attrs = CODING_ID_ATTRS (coding->id);
       /* Skip the leading ASCII bytes only for an ASCII-compatible
          coding system.  */
5155   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5156     src += head_ascii;
5157
5158   while (1)
5159     {
5160       int c;
5161
5162       src_base = src;
           /* NOTE(review): ONE_MORE_BYTE is defined outside this view.
              It is the only thing here that can reach `no_more_source',
              presumably when SRC reaches SRC_END -- confirm.  */
5163       ONE_MORE_BYTE (c);
5164       if (c < 0 || ! valids[c])
5165         break;
           /* An entry above 1 counts as positive evidence.  */
5166       if ((valids[c] > 1))
5167         found = CATEGORY_MASK_CCL;
5168     }
5169   detect_info->rejected |= CATEGORY_MASK_CCL;
5170   return 0;
5171
5172  no_more_source:
5173   detect_info->found |= found;
5174   return 1;
5175 }
5176
     /* Decode the source of CODING, starting at CODING->source +
        CODING->consumed, by running the CCL program CODING->spec.ccl->ccl
        over it, and append the output to CODING->charbuf.  The source is
        fed to ccl_driver in chunks of at most 1024 elements: characters
        when the source is multibyte, bytes otherwise.  The final CCL
        status is translated into a conversion result, and
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated.  */
5177 static void
5178 decode_coding_ccl (struct coding_system *coding)
5179 {
5180   const unsigned char *src = coding->source + coding->consumed;
5181   const unsigned char *src_end = coding->source + coding->src_bytes;
5182   int *charbuf = coding->charbuf + coding->charbuf_used;
5183   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5184   ptrdiff_t consumed_chars = 0;
5185   bool multibytep = coding->src_multibyte;
5186   struct ccl_program *ccl = &coding->spec.ccl->ccl;
       /* One chunk of input for ccl_driver.  */
5187   int source_charbuf[1024];
       /* For a multibyte source: byte offset from SRC of each character
          in SOURCE_CHARBUF, plus one final entry for the end of the
          chunk.  */
5188   int source_byteidx[1025];
5189   Lisp_Object attrs, charset_list;
5190
5191   CODING_GET_INFO (coding, attrs, charset_list);
5192
5193   while (1)
5194     {
5195       const unsigned char *p = src;
5196       ptrdiff_t offset;
5197       int i = 0;
5198
           /* Fill the chunk, recording byte offsets when elements are
              multibyte characters.  */
5199       if (multibytep)
5200         {
5201           while (i < 1024 && p < src_end)
5202             {
5203               source_byteidx[i] = p - src;
5204               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5205             }
5206           source_byteidx[i] = p - src;
5207         }
5208       else
5209         while (i < 1024 && p < src_end)
5210           source_charbuf[i++] = *p++;
5211
           /* Tell the CCL program when this chunk ends the whole
              input.  */
5212       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5213         ccl->last_block = true;
5214       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5215       charset_map_loaded = 0;
5216       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5217                   charset_list);
           /* If a charset map was loaded during the run and
              coding_change_source reports a nonzero offset, shift all
              source pointers by it.  */
5218       if (charset_map_loaded
5219           && (offset = coding_change_source (coding)))
5220         {
5221           p += offset;
5222           src += offset;
5223           src_end += offset;
5224         }
5225       charbuf += ccl->produced;
           /* CCL->consumed counts chunk elements; for a multibyte source
              convert it to a byte count through the offset table.  */
5226       if (multibytep)
5227         src += source_byteidx[ccl->consumed];
5228       else
5229         src += ccl->consumed;
5230       consumed_chars += ccl->consumed;
           /* Go on only while more source remains and the program
              stopped for lack of input.  */
5231       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5232         break;
5233     }
5234
       /* Translate the CCL status into a conversion result.  */
5235   switch (ccl->status)
5236     {
5237     case CCL_STAT_SUSPEND_BY_SRC:
5238       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5239       break;
5240     case CCL_STAT_SUSPEND_BY_DST:
5241       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5242       break;
5243     case CCL_STAT_QUIT:
5244     case CCL_STAT_INVALID_CMD:
5245       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5246       break;
5247     default:
5248       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5249       break;
5250     }
5251   coding->consumed_char += consumed_chars;
5252   coding->consumed = src - coding->source;
5253   coding->charbuf_used = charbuf - coding->charbuf;
5254 }
5255
     /* Encode the characters in CODING->charbuf by running the CCL
        program CODING->spec.ccl->ccl over them, and append the low eight
        bits of each output element to CODING->destination.  ccl_driver
        is called repeatedly, each call producing at most 1024 elements,
        until the character buffer is used up or the program stops with
        CCL_STAT_QUIT or CCL_STAT_INVALID_CMD.  The final CCL status is
        translated into a conversion result, and CODING->produced and
        CODING->produced_char are updated.  The value returned after the
        loop is always false (0).  */
5256 static bool
5257 encode_coding_ccl (struct coding_system *coding)
5258 {
5259   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5260   bool multibytep = coding->dst_multibyte;
5261   int *charbuf = coding->charbuf;
5262   int *charbuf_end = charbuf + coding->charbuf_used;
5263   unsigned char *dst = coding->destination + coding->produced;
5264   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Output of one ccl_driver call.  */
5265   int destination_charbuf[1024];
5266   ptrdiff_t produced_chars = 0;
5267   int i;
5268   Lisp_Object attrs, charset_list;
5269
5270   CODING_GET_INFO (coding, attrs, charset_list);
       /* Tell the CCL program when the whole source has been consumed
          and this is the last block.  */
5271   if (coding->consumed_char == coding->src_chars
5272       && coding->mode & CODING_MODE_LAST_BLOCK)
5273     ccl->last_block = true;
5274
5275   do
5276     {
5277       ptrdiff_t offset;
5278
5279       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5280       charset_map_loaded = 0;
5281       ccl_driver (ccl, charbuf, destination_charbuf,
5282                   charbuf_end - charbuf, 1024, charset_list);
           /* If a charset map was loaded during the run and
              coding_change_destination reports a nonzero offset, shift
              DST by it.  */
5283       if (charset_map_loaded
5284           && (offset = coding_change_destination (coding)))
5285         dst += offset;
5286       if (multibytep)
5287         {
               /* Twice the element count is requested here.
                  NOTE(review): presumably because one byte may take two
                  bytes in a multibyte destination -- confirm against
                  EMIT_ONE_BYTE.  */
5288           ASSURE_DESTINATION (ccl->produced * 2);
5289           for (i = 0; i < ccl->produced; i++)
5290             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5291         }
5292       else
5293         {
               /* Unibyte destination: store the bytes directly and count
                  one produced character per byte.  */
5294           ASSURE_DESTINATION (ccl->produced);
5295           for (i = 0; i < ccl->produced; i++)
5296             *dst++ = destination_charbuf[i] & 0xFF;
5297           produced_chars += ccl->produced;
5298         }
5299       charbuf += ccl->consumed;
5300       if (ccl->status == CCL_STAT_QUIT
5301           || ccl->status == CCL_STAT_INVALID_CMD)
5302         break;
5303     }
5304   while (charbuf < charbuf_end);
5305
       /* Translate the CCL status into a conversion result.  */
5306   switch (ccl->status)
5307     {
5308     case CCL_STAT_SUSPEND_BY_SRC:
5309       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5310       break;
5311     case CCL_STAT_SUSPEND_BY_DST:
5312       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5313       break;
5314     case CCL_STAT_QUIT:
5315     case CCL_STAT_INVALID_CMD:
5316       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5317       break;
5318     default:
5319       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5320       break;
5321     }
5322
5323   coding->produced_char += produced_chars;
5324   coding->produced = dst - coding->destination;
5325   return 0;
5326 }
5327
5328 \f
5329 /*** 10, 11. no-conversion handlers ***/
5330
5331 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Raw text is not converted here: the whole source of CODING is
        marked as consumed and CODING->chars_at_source is set.
        NOTE(review): chars_at_source presumably tells the caller to take
        the characters straight from the source -- confirm.

        When DOS end-of-line conversion applies and the source ends with
        a carriage return, that final byte is left unconsumed and
        CODING_RESULT_INSUFFICIENT_SRC is recorded, presumably so that
        the CR can be examined together with the byte that follows it in
        later input.  In every other case, including an empty source,
        CODING_RESULT_SUCCESS is recorded.  */
5332
5333 static void
5334 decode_coding_raw_text (struct coding_system *coding)
5335 {
5336   bool eol_dos
5337     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5338
5339   coding->chars_at_source = 1;
5340   coding->consumed_char = coding->src_chars;
5341   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only when there is one; with an
          empty source, index src_bytes - 1 would be out of bounds.  */
5342   if (eol_dos
           && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5343     {
5344       coding->consumed_char--;
5345       coding->consumed--;
5346       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5347     }
5348   else
5349     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5350 }
5351
     /* Copy the contents of CODING->charbuf to CODING->destination
        without charset conversion.  Four cases are handled, depending on
        whether the destination and the source are multibyte:

        - multibyte destination, multibyte source: ASCII and eight-bit
          characters are emitted as single bytes; any other character is
          expanded with CHAR_STRING_ADVANCE and each resulting byte is
          emitted with EMIT_ONE_BYTE;
        - multibyte destination, unibyte source: every element is emitted
          with EMIT_ONE_BYTE;
        - unibyte destination, multibyte source: ASCII and eight-bit
          characters are stored as single bytes; any other character is
          stored in the form CHAR_STRING_ADVANCE produces;
        - unibyte destination, unibyte source: elements are copied one
          per byte.

        Afterwards record CODING_RESULT_SUCCESS and update
        CODING->produced and CODING->produced_char.  The value returned
        after the copy is always false (0).  */
5352 static bool
5353 encode_coding_raw_text (struct coding_system *coding)
5354 {
5355   bool multibytep = coding->dst_multibyte;
5356   int *charbuf = coding->charbuf;
5357   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5358   unsigned char *dst = coding->destination + coding->produced;
5359   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5360   ptrdiff_t produced_chars = 0;
5361   int c;
5362
5363   if (multibytep)
5364     {
5365       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5366
5367       if (coding->src_multibyte)
5368         while (charbuf < charbuf_end)
5369           {
5370             ASSURE_DESTINATION (safe_room);
5371             c = *charbuf++;
5372             if (ASCII_CHAR_P (c))
5373               EMIT_ONE_ASCII_BYTE (c);
5374             else if (CHAR_BYTE8_P (c))
5375               {
5376                 c = CHAR_TO_BYTE8 (c);
5377                 EMIT_ONE_BYTE (c);
5378               }
5379             else
5380               {
                     /* Expand C into STR, then emit the bytes of STR one
                        by one; P1 ends up just past the last byte.  */
5381                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5382
5383                 CHAR_STRING_ADVANCE (c, p1);
5384                 do
5385                   {
5386                     EMIT_ONE_BYTE (*p0);
5387                     p0++;
5388                   }
5389                 while (p0 < p1);
5390               }
5391           }
5392       else
5393         while (charbuf < charbuf_end)
5394           {
5395             ASSURE_DESTINATION (safe_room);
5396             c = *charbuf++;
5397             EMIT_ONE_BYTE (c);
5398           }
5399     }
5400   else
5401     {
5402       if (coding->src_multibyte)
5403         {
5404           int safe_room = MAX_MULTIBYTE_LENGTH;
5405
5406           while (charbuf < charbuf_end)
5407             {
5408               ASSURE_DESTINATION (safe_room);
5409               c = *charbuf++;
5410               if (ASCII_CHAR_P (c))
5411                 *dst++ = c;
5412               else if (CHAR_BYTE8_P (c))
5413                 *dst++ = CHAR_TO_BYTE8 (c);
5414               else
5415                 CHAR_STRING_ADVANCE (c, dst);
5416             }
5417         }
5418       else
5419         {
               /* Ask for room for the whole buffer at once; the copy
                  loop still stops at DST_END.  */
5420           ASSURE_DESTINATION (charbuf_end - charbuf);
5421           while (charbuf < charbuf_end && dst < dst_end)
5422             *dst++ = *charbuf++;
5423         }
           /* For a unibyte destination the produced character count is
              the number of bytes written by this call.  */
5424       produced_chars = dst - (coding->destination + coding->produced);
5425     }
5426   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5427   coding->produced_char += produced_chars;
5428   coding->produced = dst - coding->destination;
5429   return 0;
5430 }
5431
5432 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5433    Return true if a text is encoded in a charset-based coding system.

        The source bytes are taken from the CODING passed in; the
        validity table is taken from the coding_categories entry for
        coding_category_charset, to which CODING is rebound below.

        CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.
        When a byte or byte sequence is rejected, the mask is added to
        DETECT_INFO->rejected and 0 is returned.  Otherwise 1 is returned
        and the mask is added to DETECT_INFO->found only if at least one
        byte >= 0x80 passed the checks.  */
5434
5435 static bool
5436 detect_coding_charset (struct coding_system *coding,
5437                        struct coding_detection_info *detect_info)
5438 {
       /* MULTIBYTEP, CONSUMED_CHARS and SRC_BASE are not read by the
          visible code of this function; presumably the ONE_MORE_BYTE
          macro (defined elsewhere) uses them -- confirm.  */
5439   const unsigned char *src = coding->source, *src_base;
5440   const unsigned char *src_end = coding->source + coding->src_bytes;
5441   bool multibytep = coding->src_multibyte;
5442   ptrdiff_t consumed_chars = 0;
5443   Lisp_Object attrs, valids, name;
       /* Mask to be merged into DETECT_INFO->found on success.  */
5444   int found = 0;
5445   ptrdiff_t head_ascii = coding->head_ascii;
5446   bool check_latin_extra = 0;
5447
       /* Record that this category was examined, whatever the outcome.  */
5448   detect_info->checked |= CATEGORY_MASK_CHARSET;
5449
       /* From here on CODING is the coding_categories entry for the
          charset category, not the caller's coding system.  Everything
          needed from the caller's CODING was captured above.  */
5450   coding = &coding_categories[coding_category_charset];
5451   attrs = CODING_ID_ATTRS (coding->id);
5452   valids = AREF (attrs, coding_attr_charset_valids);
5453   name = CODING_ID_NAME (coding->id);
       /* Coding systems whose name starts with "iso-8859-" or
          "iso-latin-" get the extra check of bytes 0x80..0x9F against
          Vlatin_extra_code_table in the loop below.  */
5454   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5455                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5456       || strncmp (SSDATA (SYMBOL_NAME (name)),
5457                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5458     check_latin_extra = 1;
5459
       /* For an ASCII compatible coding system, skip the HEAD_ASCII
          leading bytes recorded in the caller's CODING.  This assumes
          head_ascii >= 0 here -- TODO confirm against callers.  */
5460   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5461     src += head_ascii;
5462
5463   while (1)
5464     {
5465       int c;
5466       Lisp_Object val;
5467       struct charset *charset;
5468       int dim, idx;
5469
5470       src_base = src;
           /* No goto to the `no_more_source' label is visible in this
              function, so ONE_MORE_BYTE presumably jumps there when the
              source is exhausted -- confirm.  */
5471       ONE_MORE_BYTE (c);
           /* A negative C is skipped without consulting VALIDS.  */
5472       if (c < 0)
5473         continue;
           /* VALIDS is indexed by the leading byte: nil rejects the byte,
              an integer is a single charset ID, and any other value is
              walked as a list of charset IDs.  */
5474       val = AREF (valids, c);
5475       if (NILP (val))
5476         break;
5477       if (c >= 0x80)
5478         {
               /* With CHECK_LATIN_EXTRA, a byte in 0x80..0x9F is accepted
                  only if Vlatin_extra_code_table is a vector with a
                  non-nil entry for it.  */
5479           if (c < 0xA0
5480               && check_latin_extra
5481               && (!VECTORP (Vlatin_extra_code_table)
5482                   || NILP (AREF (Vlatin_extra_code_table, c))))
5483             break;
5484           found = CATEGORY_MASK_CHARSET;
5485         }
5486       if (INTEGERP (val))
5487         {
               /* Single candidate charset: each of the remaining DIM - 1
                  bytes must lie between the two code_space entries
                  selected by its position.  */
5488           charset = CHARSET_FROM_ID (XFASTINT (val));
5489           dim = CHARSET_DIMENSION (charset);
5490           for (idx = 1; idx < dim; idx++)
5491             {
5492               if (src == src_end)
5493                 goto too_short;
5494               ONE_MORE_BYTE (c);
5495               if (c < charset->code_space[(dim - 1 - idx) * 4]
5496                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5497                 break;
5498             }
               /* The loop stopped early: a trailing byte was out of
                  range, so reject.  */
5499           if (idx < dim)
5500             break;
5501         }
5502       else
5503         {
               /* Several candidate charsets.  IDX is not reset between
                  candidates, so bytes already consumed for one candidate
                  are not read again for the next one.  */
5504           idx = 1;
5505           for (; CONSP (val); val = XCDR (val))
5506             {
5507               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5508               dim = CHARSET_DIMENSION (charset);
5509               while (idx < dim)
5510                 {
5511                   if (src == src_end)
5512                     goto too_short;
5513                   ONE_MORE_BYTE (c);
5514                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5515                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5516                     break;
5517                   idx++;
5518                 }
5519               if (idx == dim)
5520                 {
                       /* This candidate matched; mark success by making
                          VAL a non-cons.  */
5521                   val = Qnil;
5522                   break;
5523                 }
5524             }
               /* NOTE(review): the loop above ends either with VAL set to
                  Qnil or with VAL no longer a cons, so this test looks as
                  if it can never be true; i.e. exhausting the candidate
                  list does not reject the text.  Confirm the intent.  */
5525           if (CONSP (val))
5526             break;
5527         }
5528     }
      /* Reached by `break' from the loop (a rejected byte) and by
         `goto too_short' (a multi-byte code cut off by the end of the
         source).  */
5529  too_short:
5530   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5531   return 0;
5532
5533  no_more_source:
5534   detect_info->found |= found;
5535   return 1;
5536 }
5537
/* Decode the bytes of CODING->source, starting at CODING->consumed,
   into CODING->charbuf, starting at CODING->charbuf_used, using the
   `charset_valids' table found in the attributes of the coding system.

   Each decoded character is stored as one int.  When a sequence cannot
   be decoded, the input is rewound to the start of that sequence and a
   single unit is read again and stored on its own: as is when ASCII,
   through BYTE8_TO_CHAR otherwise, or as -C when ONE_MORE_BYTE
   yielded a negative C.  Whenever the charset of the decoded
   characters changes, ADD_CHARSET_DATA records the run just finished,
   unless that run was charset_ascii.

   On exit CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  */
5538 static void
5539 decode_coding_charset (struct coding_system *coding)
5540 {
5541   const unsigned char *src = coding->source + coding->consumed;
5542   const unsigned char *src_end = coding->source + coding->src_bytes;
5543   const unsigned char *src_base;
5544   int *charbuf = coding->charbuf + coding->charbuf_used;
5545   /* We may produce one charset annotation in one loop and one more at
5546      the end.  */
5547   int *charbuf_end
5548     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5549   ptrdiff_t consumed_chars = 0, consumed_chars_base;
       /* MULTIBYTEP is not read by the visible code; presumably the
          ONE_MORE_BYTE macro (defined elsewhere) uses it -- confirm.  */
5550   bool multibytep = coding->src_multibyte;
5551   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5552   Lisp_Object valids;
       /* CHAR_OFFSET counts characters produced so far; LAST_OFFSET and
          LAST_ID describe the start and the charset of the current run.  */
5553   ptrdiff_t char_offset = coding->produced_char;
5554   ptrdiff_t last_offset = char_offset;
5555   int last_id = charset_ascii;
       /* True when the EOL type is Qdos and EOL conversion is not
          inhibited.  */
5556   bool eol_dos
5557     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte read ahead after a '\r' when EOL_DOS, or -1 if none is
          pending.  */
5558   int byte_after_cr = -1;
5559
5560   valids = AREF (attrs, coding_attr_charset_valids);
5561
5562   while (1)
5563     {
5564       int c;
5565       Lisp_Object val;
5566       struct charset *charset;
5567       int dim;
5568       int len = 1;
5569       unsigned code;
5570
           /* Remember where this character starts so that invalid_code
              and the exit path can rewind to it.  */
5571       src_base = src;
5572       consumed_chars_base = consumed_chars;
5573
5574       if (charbuf >= charbuf_end)
5575         {
               /* Output buffer full.  If a look-ahead byte is pending,
                  step SRC_BASE back by one so that byte is not counted
                  in CODING->consumed.
                  NOTE(review): CONSUMED_CHARS_BASE is not adjusted here;
                  confirm whether it can still include the look-ahead.  */
5576           if (byte_after_cr >= 0)
5577             src_base--;
5578           break;
5579         }
5580
5581       if (byte_after_cr >= 0)
5582         {
               /* Use the byte that was read ahead in the previous
                  iteration.  */
5583           c = byte_after_cr;
5584           byte_after_cr = -1;
5585         }
5586       else
5587         {
5588           ONE_MORE_BYTE (c);
               /* With DOS EOLs, read the byte after a CR right away.  */
5589           if (eol_dos && c == '\r')
5590             ONE_MORE_BYTE (byte_after_cr);
5591         }
5592       if (c < 0)
5593         goto invalid_code;
5594       code = c;
5595
           /* VALIDS is indexed by the leading byte; only an integer (one
              charset ID) or a cons (list of charset IDs) is decodable.  */
5596       val = AREF (valids, c);
5597       if (! INTEGERP (val) && ! CONSP (val))
5598         goto invalid_code;
5599       if (INTEGERP (val))
5600         {
               /* Exactly one charset for this leading byte: collect the
                  remaining DIM - 1 bytes into CODE, most significant
                  byte first.  */
5601           charset = CHARSET_FROM_ID (XFASTINT (val));
5602           dim = CHARSET_DIMENSION (charset);
5603           while (len < dim)
5604             {
5605               ONE_MORE_BYTE (c);
5606               code = (code << 8) | c;
5607               len++;
5608             }
               /* The result is left in C; the code below treats a
                  negative C as a decoding failure.  */
5609           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5610                               charset, code, c);
5611         }
5612       else
5613         {
5614           /* VAL is a list of charset IDs.  It is assured that the
5615              list is sorted by charset dimensions (smaller one
5616              comes first).  LEN and CODE carry over from one
                  candidate to the next, so a later candidate only reads
                  the additional bytes it needs.  */
5617           while (CONSP (val))
5618             {
5619               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5620               dim = CHARSET_DIMENSION (charset);
5621               while (len < dim)
5622                 {
5623                   ONE_MORE_BYTE (c);
5624                   code = (code << 8) | c;
5625                   len++;
5626                 }
5627               CODING_DECODE_CHAR (coding, src, src_base,
5628                                   src_end, charset, code, c);
                   /* Stop at the first charset that decodes CODE.  */
5629               if (c >= 0)
5630                 break;
5631               val = XCDR (val);
5632             }
5633         }
5634       if (c < 0)
5635         goto invalid_code;
           /* A non-ASCII charset different from the current run starts a
              new run; the finished run is annotated unless it was
              charset_ascii.  */
5636       if (charset->id != charset_ascii
5637           && last_id != charset->id)
5638         {
5639           if (last_id != charset_ascii)
5640             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5641           last_id = charset->id;
5642           last_offset = char_offset;
5643         }
5644
5645       *charbuf++ = c;
5646       char_offset++;
5647       continue;
5648
5649     invalid_code:
           /* Rewind to the start of the failed sequence and store just
              one unit of it; the rest is read again by later
              iterations.  */
5650       src = src_base;
5651       consumed_chars = consumed_chars_base;
5652       ONE_MORE_BYTE (c);
5653       *charbuf++ = c < 0 ? -c : ASCII_CHAR_P (c) ? c : BYTE8_TO_CHAR (c);
5654       char_offset++;
5655     }
5656
      /* Reached by `break' when CHARBUF is full.  No goto to this label
         is visible here, so ONE_MORE_BYTE presumably jumps to it at the
         end of the source -- confirm.  Only the input up to SRC_BASE and
         CONSUMED_CHARS_BASE is reported as consumed.  */
5657  no_more_source:
5658   if (last_id != charset_ascii)
5659     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5660   coding->consumed_char += consumed_chars_base;
5661   coding->consumed = src_base - coding->source;
5662   coding->charbuf_used = charbuf - coding->charbuf;
5663 }
5664
5665 static bool
5666 encode_coding_charset (struct coding_system *coding)
5667 {
5668   bool multibytep = coding->dst_multibyte;
5669   int *charbuf = coding->charbuf;
5670   int *charbuf_end = charbuf + coding->charbuf_used;
5671   unsigned char *dst = coding->destination + coding->produced;
5672   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5673   int safe_room = MAX_MULTIBYTE_LENGTH;
5674   ptrdiff_t produced_chars = 0;
5675   Lisp_Object attrs, charset_list;
5676   bool ascii_compatible;
5677   int c;
5678
5679   CODING_GET_INFO (coding, attrs, charset_list);
5680   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5681
5682   while (charbuf < charbuf_end)
5683     {
5684       struct charset *charset;
5685       unsigned code;
5686
5687       ASSURE_DESTINATION (safe_room);
5688       c = *charbuf++;
5689       if (ascii_compatible && ASCII_CHAR_P (c))
5690         EMIT_ONE_ASCII_BYTE (c);
5691       else if (CHAR_BYTE8_P (c))
5692         {
5693           c = CHAR_TO_BYTE8 (c);
5694           EMIT_ONE_BYTE (c);
5695         }
5696       else
5697         {
5698           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5699                                &code, charset);
5700
5701           if (charset)
5702             {
5703               if (CHARSET_DIMENSION (charset) == 1)
5704                 EMIT_ONE_BYTE (code);
5705               else if (CHARSET_DIMENSION (charset) == 2)
5706                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5707               else if (CHARSET_DIMENSION (charset) == 3)
5708                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5709               else
5710                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5711                                  (code >> 8) & 0xFF, code & 0xFF);
5712             }
5713           else
5714             {
5715               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5716                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5717               else
5718                 c = coding->default_char;
5719               EMIT_ONE_BYTE (c);
5720             }
5721         }
5722     }
5723
5724   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5725   coding->produced_char += produced_chars;
5726   coding->produced = dst - coding->destination;
5727   return 0;
5728 }
5729
5730 \f
5731 /*** 10. C library functions ***/
5732
5733 /* Setup coding context CODING from information about CODING_SYSTEM.
5734    If CODING_SYSTEM is nil, Qundecided is used instead.  If
5735    CODING_SYSTEM is invalid, signal an error.

        This fills in CODING->id, mode, common_flags, max_charset_id,
        safe_charsets, default_char, carryover_bytes and raw_destination,
        then installs the detector, decoder and encoder functions and the
        type-specific state according to the `type' attribute of the
        coding system.  */
5736
5737 void
5738 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5739 {
5740   Lisp_Object attrs;
5741   Lisp_Object eol_type;
5742   Lisp_Object coding_type;
5743   Lisp_Object val;
5744
5745   if (NILP (coding_system))
5746     coding_system = Qundecided;
5747
5748   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5749
5750   attrs = CODING_ID_ATTRS (coding->id);
       /* When inhibit_eol_conversion is set, the EOL type is taken to be
          Qunix whatever the coding system specifies.  */
5751   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5752
5753   coding->mode = 0;
       /* Base flags from the EOL type: a vector asks for decoding and
          detection, any symbol other than Qunix asks for decoding and
          encoding, and Qunix asks for nothing by itself.  */
5754   if (VECTORP (eol_type))
5755     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5756                             | CODING_REQUIRE_DETECTION_MASK);
5757   else if (! EQ (eol_type, Qunix))
5758     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5759                             | CODING_REQUIRE_ENCODING_MASK);
5760   else
5761     coding->common_flags = 0;
       /* A post-read attribute forces decoding, a pre-write attribute
          forces encoding.  */
5762   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5763     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5764   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5765     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5766   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5767     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5768
       /* CODING->safe_charsets points directly at the data of the Lisp
          string VAL; max_charset_id is its last valid index.  */
5769   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5770   coding->max_charset_id = SCHARS (val) - 1;
5771   coding->safe_charsets = SDATA (val);
5772   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5773   coding->carryover_bytes = 0;
5774   coding->raw_destination = 0;
5775
       /* Install the handlers and the type-specific state.  */
5776   coding_type = CODING_ATTR_TYPE (attrs);
5777   if (EQ (coding_type, Qundecided))
5778     {
           /* Not decided yet: no detector function, raw-text handlers,
              and detection is required.  */
5779       coding->detector = NULL;
5780       coding->decoder = decode_coding_raw_text;
5781       coding->encoder = encode_coding_raw_text;
5782       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5783       coding->spec.undecided.inhibit_nbd
5784         = (encode_inhibit_flag
5785            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5786       coding->spec.undecided.inhibit_ied
5787         = (encode_inhibit_flag
5788            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5789       coding->spec.undecided.prefer_utf_8
5790         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5791     }
5792   else if (EQ (coding_type, Qiso_2022))
5793     {
5794       int i;
5795       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5796
5797       /* Invoke graphic register 0 to plane 0.  */
5798       CODING_ISO_INVOCATION (coding, 0) = 0;
5799       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5800       CODING_ISO_INVOCATION (coding, 1)
5801         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5802       /* Setup the initial status of designation.  */
5803       for (i = 0; i < 4; i++)
5804         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5805       /* Not single shifting initially.  */
5806       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5807       /* Beginning of buffer should also be regarded as bol. */
5808       CODING_ISO_BOL (coding) = 1;
5809       coding->detector = detect_coding_iso_2022;
5810       coding->decoder = decode_coding_iso_2022;
5811       coding->encoder = encode_coding_iso_2022;
5812       if (flags & CODING_ISO_FLAG_SAFE)
5813         coding->mode |= CODING_MODE_SAFE_ENCODING;
5814       coding->common_flags
5815         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5816             | CODING_REQUIRE_FLUSHING_MASK);
5817       if (flags & CODING_ISO_FLAG_COMPOSITION)
5818         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5819       if (flags & CODING_ISO_FLAG_DESIGNATION)
5820         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5821       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5822         {
               /* setup_iso_safe_charsets is called on ATTRS first, then
                  the safe-charsets attribute is read again to refresh
                  the two fields set above.  */
5823           setup_iso_safe_charsets (attrs);
5824           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5825           coding->max_charset_id = SCHARS (val) - 1;
5826           coding->safe_charsets = SDATA (val);
5827         }
5828       CODING_ISO_FLAGS (coding) = flags;
5829       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5830       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5831       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5832       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5833     }
5834   else if (EQ (coding_type, Qcharset))
5835     {
5836       coding->detector = detect_coding_charset;
5837       coding->decoder = decode_coding_charset;
5838       coding->encoder = encode_coding_charset;
5839       coding->common_flags
5840         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5841     }
5842   else if (EQ (coding_type, Qutf_8))
5843     {
           /* BOM handling: a cons attribute means detect the BOM, t means
              a BOM is used, anything else means no BOM.  */
5844       val = AREF (attrs, coding_attr_utf_bom);
5845       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5846                                    : EQ (val, Qt) ? utf_with_bom
5847                                    : utf_without_bom);
5848       coding->detector = detect_coding_utf_8;
5849       coding->decoder = decode_coding_utf_8;
5850       coding->encoder = encode_coding_utf_8;
5851       coding->common_flags
5852         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5853       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5854         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5855     }
5856   else if (EQ (coding_type, Qutf_16))
5857     {
           /* Same BOM rule as for utf-8; the endianness is big only when
              the attribute is Qbig, little otherwise.  */
5858       val = AREF (attrs, coding_attr_utf_bom);
5859       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5860                                     : EQ (val, Qt) ? utf_with_bom
5861                                     : utf_without_bom);
5862       val = AREF (attrs, coding_attr_utf_16_endian);
5863       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5864                                        : utf_16_little_endian);
5865       CODING_UTF_16_SURROGATE (coding) = 0;
5866       coding->detector = detect_coding_utf_16;
5867       coding->decoder = decode_coding_utf_16;
5868       coding->encoder = encode_coding_utf_16;
5869       coding->common_flags
5870         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5871       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5872         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5873     }
5874   else if (EQ (coding_type, Qccl))
5875     {
5876       coding->detector = detect_coding_ccl;
5877       coding->decoder = decode_coding_ccl;
5878       coding->encoder = encode_coding_ccl;
5879       coding->common_flags
5880         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5881             | CODING_REQUIRE_FLUSHING_MASK);
5882     }
5883   else if (EQ (coding_type, Qemacs_mule))
5884     {
5885       coding->detector = detect_coding_emacs_mule;
5886       coding->decoder = decode_coding_emacs_mule;
5887       coding->encoder = encode_coding_emacs_mule;
5888       coding->common_flags
5889         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5890       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5891           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5892         {
               /* Build a fresh safe-charsets string sized for the largest
                  ID in Vemacs_mule_charset_list: every byte is 255
                  except a 0 at the index of each listed charset ID.  */
5893           Lisp_Object tail, safe_charsets;
5894           int max_charset_id = 0;
5895
5896           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5897                tail = XCDR (tail))
5898             if (max_charset_id < XFASTINT (XCAR (tail)))
5899               max_charset_id = XFASTINT (XCAR (tail));
5900           safe_charsets = make_uninit_string (max_charset_id + 1);
5901           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5902           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5903                tail = XCDR (tail))
5904             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5905           coding->max_charset_id = max_charset_id;
5906           coding->safe_charsets = SDATA (safe_charsets);
5907         }
5908       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5909       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5910     }
5911   else if (EQ (coding_type, Qshift_jis))
5912     {
5913       coding->detector = detect_coding_sjis;
5914       coding->decoder = decode_coding_sjis;
5915       coding->encoder = encode_coding_sjis;
5916       coding->common_flags
5917         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5918     }
5919   else if (EQ (coding_type, Qbig5))
5920     {
5921       coding->detector = detect_coding_big5;
5922       coding->decoder = decode_coding_big5;
5923       coding->encoder = encode_coding_big5;
5924       coding->common_flags
5925         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5926     }
5927   else                          /* EQ (coding_type, Qraw_text) */
5928     {
           /* Every remaining type is handled as raw text: no detector,
              decoding required unless the EOL type is Qunix, and encoding
              required as well when the EOL type is not a vector.  */
5929       coding->detector = NULL;
5930       coding->decoder = decode_coding_raw_text;
5931       coding->encoder = encode_coding_raw_text;
5932       if (! EQ (eol_type, Qunix))
5933         {
5934           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5935           if (! VECTORP (eol_type))
5936             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5937         }
5938
5939     }
5940
5941   return;
5942 }
5943
5944 /* Return a list of charsets supported by CODING.  */
5945
5946 Lisp_Object
5947 coding_charset_list (struct coding_system *coding)
5948 {
5949   Lisp_Object attrs, charset_list;
5950
5951   CODING_GET_INFO (coding, attrs, charset_list);
5952   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5953     {
5954       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5955
5956       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5957         charset_list = Viso_2022_charset_list;
5958     }
5959   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5960     {
5961       charset_list = Vemacs_mule_charset_list;
5962     }
5963   return charset_list;
5964 }
5965
5966
5967 /* Return a list of charsets supported by CODING-SYSTEM.  */
5968
5969 Lisp_Object
5970 coding_system_charset_list (Lisp_Object coding_system)
5971 {
5972   ptrdiff_t id;
5973   Lisp_Object attrs, charset_list;
5974
5975   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5976   attrs = CODING_ID_ATTRS (id);
5977
5978   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5979     {
5980       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5981
5982       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5983         charset_list = Viso_2022_charset_list;
5984       else
5985         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5986     }
5987   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5988     {
5989       charset_list = Vemacs_mule_charset_list;
5990     }
5991   else
5992     {
5993       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5994     }
5995   return charset_list;
5996 }
5997
5998
5999 /* Return raw-text or one of its subsidiaries that has the same
6000    eol_type as CODING-SYSTEM.  */
6001
6002 Lisp_Object
6003 raw_text_coding_system (Lisp_Object coding_system)
6004 {
6005   Lisp_Object spec, attrs;
6006   Lisp_Object eol_type, raw_text_eol_type;
6007
6008   if (NILP (coding_system))
6009     return Qraw_text;
6010   spec = CODING_SYSTEM_SPEC (coding_system);
6011   attrs = AREF (spec, 0);
6012
6013   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6014     return coding_system;
6015
6016   eol_type = AREF (spec, 2);
6017   if (VECTORP (eol_type))
6018     return Qraw_text;
6019   spec = CODING_SYSTEM_SPEC (Qraw_text);
6020   raw_text_eol_type = AREF (spec, 2);
6021   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6022           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6023           : AREF (raw_text_eol_type, 2));
6024 }
6025
6026
6027 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6028    the subsidiary that has the same eol-spec as PARENT (if it is not
6029    nil and specifies end-of-line format) or the system's setting
6030    (system_eol_type).  */
6031
6032 Lisp_Object
6033 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6034 {
6035   Lisp_Object spec, eol_type;
6036
6037   if (NILP (coding_system))
6038     coding_system = Qraw_text;
6039   spec = CODING_SYSTEM_SPEC (coding_system);
6040   eol_type = AREF (spec, 2);
6041   if (VECTORP (eol_type))
6042     {
6043       Lisp_Object parent_eol_type;
6044
6045       if (! NILP (parent))
6046         {
6047           Lisp_Object parent_spec;
6048
6049           parent_spec = CODING_SYSTEM_SPEC (parent);
6050           parent_eol_type = AREF (parent_spec, 2);
6051           if (VECTORP (parent_eol_type))
6052             parent_eol_type = system_eol_type;
6053         }
6054       else
6055         parent_eol_type = system_eol_type;
6056       if (EQ (parent_eol_type, Qunix))
6057         coding_system = AREF (eol_type, 0);
6058       else if (EQ (parent_eol_type, Qdos))
6059         coding_system = AREF (eol_type, 1);
6060       else if (EQ (parent_eol_type, Qmac))
6061         coding_system = AREF (eol_type, 2);
6062     }
6063   return coding_system;
6064 }
6065
6066
6067 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6068    decided for writing to a process.  If not, complement them, and
6069    return a new coding system.  */
6070
6071 Lisp_Object
6072 complement_process_encoding_system (Lisp_Object coding_system)
6073 {
6074   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6075   Lisp_Object spec, attrs;
6076   int i;
6077
6078   for (i = 0; i < 3; i++)
6079     {
6080       if (i == 1)
6081         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6082       else if (i == 2)
6083         coding_system = preferred_coding_system ();
6084       spec = CODING_SYSTEM_SPEC (coding_system);
6085       if (NILP (spec))
6086         continue;
6087       attrs = AREF (spec, 0);
6088       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6089         coding_base = CODING_ATTR_BASE_NAME (attrs);
6090       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6091         eol_base = coding_system;
6092       if (! NILP (coding_base) && ! NILP (eol_base))
6093         break;
6094     }
6095
6096   if (i > 0)
6097     /* The original CODING_SYSTEM didn't specify text-conversion or
6098        eol-conversion.  Be sure that we return a fully complemented
6099        coding system.  */
6100     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6101   return coding_system;
6102 }
6103
6104
6105 /* Emacs has a mechanism to automatically detect a coding system if it
6106    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
6107    it is impossible to distinguish some coding systems accurately
6108    because they use the same range of codes.  So, coding systems are
6109    first grouped into the following categories:
6110
6111    o coding-category-emacs-mule
6112
6113         The category for a coding system which has the same code range
6114         as Emacs' internal format.  Assigned the coding-system (Lisp
6115         symbol) `emacs-mule' by default.
6116
6117    o coding-category-sjis
6118
6119         The category for a coding system which has the same code range
6120         as SJIS.  Assigned the coding-system (Lisp
6121         symbol) `japanese-shift-jis' by default.
6122
6123    o coding-category-iso-7
6124
6125         The category for a coding system which has the same code range
6126         as ISO2022 of 7-bit environment.  This doesn't use any locking
6127         shift and single shift functions.  This can encode/decode all
6128         charsets.  Assigned the coding-system (Lisp symbol)
6129         `iso-2022-7bit' by default.
6130
6131    o coding-category-iso-7-tight
6132
6133         Same as coding-category-iso-7 except that this can
6134         encode/decode only the specified charsets.
6135
6136    o coding-category-iso-8-1
6137
6138         The category for a coding system which has the same code range
6139         as ISO2022 of 8-bit environment and graphic plane 1 used only
6140         for DIMENSION1 charset.  This doesn't use any locking shift
6141         and single shift functions.  Assigned the coding-system (Lisp
6142         symbol) `iso-latin-1' by default.
6143
6144    o coding-category-iso-8-2
6145
6146         The category for a coding system which has the same code range
6147         as ISO2022 of 8-bit environment and graphic plane 1 used only
6148         for DIMENSION2 charset.  This doesn't use any locking shift
6149         and single shift functions.  Assigned the coding-system (Lisp
6150         symbol) `japanese-iso-8bit' by default.
6151
6152    o coding-category-iso-7-else
6153
6154         The category for a coding system which has the same code range
6155         as ISO2022 of 7-bit environment but uses locking shift or
6156         single shift functions.  Assigned the coding-system (Lisp
6157         symbol) `iso-2022-7bit-lock' by default.
6158
6159    o coding-category-iso-8-else
6160
6161         The category for a coding system which has the same code range
6162         as ISO2022 of 8-bit environment but uses locking shift or
6163         single shift functions.  Assigned the coding-system (Lisp
6164         symbol) `iso-2022-8bit-ss2' by default.
6165
6166    o coding-category-big5
6167
6168         The category for a coding system which has the same code range
6169         as BIG5.  Assigned the coding-system (Lisp symbol)
6170         `cn-big5' by default.
6171
6172    o coding-category-utf-8
6173
6174         The category for a coding system which has the same code range
6175         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6176         symbol) `utf-8' by default.
6177
6178    o coding-category-utf-16-be
6179
6180         The category for a coding system in which a text has an
6181         Unicode signature (cf. Unicode Standard) in the order of BIG
6182         endian at the head.  Assigned the coding-system (Lisp symbol)
6183         `utf-16-be' by default.
6184
6185    o coding-category-utf-16-le
6186
6187         The category for a coding system in which a text has an
6188         Unicode signature (cf. Unicode Standard) in the order of
6189         LITTLE endian at the head.  Assigned the coding-system (Lisp
6190         symbol) `utf-16-le' by default.
6191
6192    o coding-category-ccl
6193
6194         The category for a coding system of which encoder/decoder is
6195         written in CCL programs.  The default value is nil, i.e., no
6196         coding system is assigned.
6197
6198    o coding-category-binary
6199
6200         The category for a coding system not categorized in any of the
6201         above.  Assigned the coding-system (Lisp symbol)
6202         `no-conversion' by default.
6203
6204    Each of them is a Lisp symbol whose value is an actual
6205    `coding-system' (also a Lisp symbol) assigned by a user.  What
6206    Emacs actually does is detect the category of a coding system and
6207    then use the `coding-system' assigned to that category.  If Emacs
6208    can't narrow the choice down to a single category, it selects the
6209    category of the highest priority.  Priorities of categories are
6210    also specified by a user in a Lisp variable `coding-category-list'.
6211
6212 */
6213
6214 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6215                                            int eol_seen);
6216
6217
6218 /* Return the number of ASCII characters at the head of the source.
6219    By side effects, set coding->head_ascii and update
6220    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6221    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6222    reliable only when all the source bytes are ASCII.  */
6223
6224 static ptrdiff_t
6225 check_ascii (struct coding_system *coding)
6226 {
6227   const unsigned char *src, *end;
6228   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6229   int eol_seen = coding->eol_seen;
6230
6231   coding_set_source (coding);
6232   src = coding->source;
6233   end = src + coding->src_bytes;
6234
6235   if (inhibit_eol_conversion
6236       || SYMBOLP (eol_type))
6237     {
6238       /* We don't have to check EOL format.  */
6239       while (src < end && !( *src & 0x80))
6240         {
6241           if (*src++ == '\n')
6242             eol_seen |= EOL_SEEN_LF;
6243         }
6244     }
6245   else
6246     {
6247       end--;                /* We look ahead one byte for "CR LF".  */
6248       while (src < end)
6249         {
6250           int c = *src;
6251
6252           if (c & 0x80)
6253             break;
6254           src++;
6255           if (c == '\r')
6256             {
6257               if (*src == '\n')
6258                 {
6259                   eol_seen |= EOL_SEEN_CRLF;
6260                   src++;
6261                 }
6262               else
6263                 eol_seen |= EOL_SEEN_CR;
6264             }
6265           else if (c == '\n')
6266             eol_seen |= EOL_SEEN_LF;
6267         }
6268       if (src == end)
6269         {
6270           int c = *src;
6271
6272           /* All bytes but the last one C are ASCII.  */
6273           if (! (c & 0x80))
6274             {
6275               if (c == '\r')
6276                 eol_seen |= EOL_SEEN_CR;
6277               else if (c  == '\n')
6278                 eol_seen |= EOL_SEEN_LF;
6279               src++;
6280             }
6281         }
6282     }
6283   coding->head_ascii = src - coding->source;
6284   coding->eol_seen = eol_seen;
6285   return (coding->head_ascii);
6286 }
6287
6288
6289 /* Return the number of characters at the source if all the bytes are
6290    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6291    effects, update coding->eol_seen.  The value of coding->eol_seen is
6292    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6293    the value is reliable only when all the source bytes are valid
6294    UTF-8.  */
6295
6296 static ptrdiff_t
6297 check_utf_8 (struct coding_system *coding)
6298 {
6299   const unsigned char *src, *end;
6300   int eol_seen;
6301   ptrdiff_t nchars = coding->head_ascii;
6302
6303   if (coding->head_ascii < 0)
6304     check_ascii (coding);
6305   else
6306     coding_set_source (coding);
6307   src = coding->source + coding->head_ascii;
6308   /* We look ahead one byte for CR LF.  */
6309   end = coding->source + coding->src_bytes - 1;
6310   eol_seen = coding->eol_seen;
6311   while (src < end)
6312     {
6313       int c = *src;
6314
6315       if (UTF_8_1_OCTET_P (*src))
6316         {
6317           src++;
6318           if (c < 0x20)
6319             {
6320               if (c == '\r')
6321                 {
6322                   if (*src == '\n')
6323                     {
6324                       eol_seen |= EOL_SEEN_CRLF;
6325                       src++;
6326                       nchars++;
6327                     }
6328                   else
6329                     eol_seen |= EOL_SEEN_CR;
6330                 }
6331               else if (c == '\n')
6332                 eol_seen |= EOL_SEEN_LF;
6333             }
6334         }
6335       else if (UTF_8_2_OCTET_LEADING_P (c))
6336         {
6337           if (c < 0xC2          /* overlong sequence */
6338               || src + 1 >= end
6339               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6340             return -1;
6341           src += 2;
6342         }
6343       else if (UTF_8_3_OCTET_LEADING_P (c))
6344         {
6345           if (src + 2 >= end
6346               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6347                     && UTF_8_EXTRA_OCTET_P (src[2])))
6348             return -1;
6349           c = (((c & 0xF) << 12)
6350                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6351           if (c < 0x800                       /* overlong sequence */
6352               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6353             return -1;
6354           src += 3;
6355         }
6356       else if (UTF_8_4_OCTET_LEADING_P (c))
6357         {
6358           if (src + 3 >= end
6359               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6360                     && UTF_8_EXTRA_OCTET_P (src[2])
6361                     && UTF_8_EXTRA_OCTET_P (src[3])))
6362             return -1;
6363           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6364                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6365           if (c < 0x10000       /* overlong sequence */
6366               || c >= 0x110000) /* non-Unicode character  */
6367             return -1;
6368           src += 4;
6369         }
6370       else
6371         return -1;
6372       nchars++;
6373     }
6374
6375   if (src == end)
6376     {
6377       if (! UTF_8_1_OCTET_P (*src))
6378         return -1;
6379       nchars++;
6380       if (*src == '\r')
6381         eol_seen |= EOL_SEEN_CR;
6382       else if (*src  == '\n')
6383         eol_seen |= EOL_SEEN_LF;
6384     }
6385   coding->eol_seen = eol_seen;
6386   return nchars;
6387 }
6388
6389
6390 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6391    SOURCE is encoded.  If CATEGORY is one of
6392    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6393    two-byte, else they are encoded by one-byte.
6394
6395    Return one of EOL_SEEN_XXX.  */
6396
6397 #define MAX_EOL_CHECK_COUNT 3
6398
6399 static int
6400 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6401             enum coding_category category)
6402 {
6403   const unsigned char *src = source, *src_end = src + src_bytes;
6404   unsigned char c;
6405   int total  = 0;
6406   int eol_seen = EOL_SEEN_NONE;
6407
6408   if ((1 << category) & CATEGORY_MASK_UTF_16)
6409     {
6410       bool msb = category == (coding_category_utf_16_le
6411                               | coding_category_utf_16_le_nosig);
6412       bool lsb = !msb;
6413
6414       while (src + 1 < src_end)
6415         {
6416           c = src[lsb];
6417           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6418             {
6419               int this_eol;
6420
6421               if (c == '\n')
6422                 this_eol = EOL_SEEN_LF;
6423               else if (src + 3 >= src_end
6424                        || src[msb + 2] != 0
6425                        || src[lsb + 2] != '\n')
6426                 this_eol = EOL_SEEN_CR;
6427               else
6428                 {
6429                   this_eol = EOL_SEEN_CRLF;
6430                   src += 2;
6431                 }
6432
6433               if (eol_seen == EOL_SEEN_NONE)
6434                 /* This is the first end-of-line.  */
6435                 eol_seen = this_eol;
6436               else if (eol_seen != this_eol)
6437                 {
6438                   /* The found type is different from what found before.
6439                      Allow for stray ^M characters in DOS EOL files.  */
6440                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6441                       || (eol_seen == EOL_SEEN_CRLF
6442                           && this_eol == EOL_SEEN_CR))
6443                     eol_seen = EOL_SEEN_CRLF;
6444                   else
6445                     {
6446                       eol_seen = EOL_SEEN_LF;
6447                       break;
6448                     }
6449                 }
6450               if (++total == MAX_EOL_CHECK_COUNT)
6451                 break;
6452             }
6453           src += 2;
6454         }
6455     }
6456   else
6457     while (src < src_end)
6458       {
6459         c = *src++;
6460         if (c == '\n' || c == '\r')
6461           {
6462             int this_eol;
6463
6464             if (c == '\n')
6465               this_eol = EOL_SEEN_LF;
6466             else if (src >= src_end || *src != '\n')
6467               this_eol = EOL_SEEN_CR;
6468             else
6469               this_eol = EOL_SEEN_CRLF, src++;
6470
6471             if (eol_seen == EOL_SEEN_NONE)
6472               /* This is the first end-of-line.  */
6473               eol_seen = this_eol;
6474             else if (eol_seen != this_eol)
6475               {
6476                 /* The found type is different from what found before.
6477                    Allow for stray ^M characters in DOS EOL files.  */
6478                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6479                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6480                   eol_seen = EOL_SEEN_CRLF;
6481                 else
6482                   {
6483                     eol_seen = EOL_SEEN_LF;
6484                     break;
6485                   }
6486               }
6487             if (++total == MAX_EOL_CHECK_COUNT)
6488               break;
6489           }
6490       }
6491   return eol_seen;
6492 }
6493
6494
6495 static Lisp_Object
6496 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6497 {
6498   Lisp_Object eol_type;
6499
6500   eol_type = CODING_ID_EOL_TYPE (coding->id);
6501   if (! VECTORP (eol_type))
6502     /* Already adjusted.  */
6503     return eol_type;
6504   if (eol_seen & EOL_SEEN_LF)
6505     {
6506       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6507       eol_type = Qunix;
6508     }
6509   else if (eol_seen & EOL_SEEN_CRLF)
6510     {
6511       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6512       eol_type = Qdos;
6513     }
6514   else if (eol_seen & EOL_SEEN_CR)
6515     {
6516       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6517       eol_type = Qmac;
6518     }
6519   return eol_type;
6520 }
6521
6522 /* Detect how a text specified in CODING is encoded.  If a coding
6523    system is detected, update fields of CODING by the detected coding
6524    system.  */
6525
6526 static void
6527 detect_coding (struct coding_system *coding)
6528 {
6529   const unsigned char *src, *src_end;
6530   unsigned int saved_mode = coding->mode;
6531   Lisp_Object found = Qnil;
6532   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6533
6534   coding->consumed = coding->consumed_char = 0;
6535   coding->produced = coding->produced_char = 0;
6536   coding_set_source (coding);
6537
6538   src_end = coding->source + coding->src_bytes;
6539
6540   coding->eol_seen = EOL_SEEN_NONE;
6541   /* If we have not yet decided the text encoding type, detect it
6542      now.  */
6543   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6544     {
6545       int c, i;
6546       struct coding_detection_info detect_info;
6547       bool null_byte_found = 0, eight_bit_found = 0;
6548       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6549                                        inhibit_null_byte_detection);
6550       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6551                                        inhibit_iso_escape_detection);
6552       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6553
6554       coding->head_ascii = 0;
6555       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6556       for (src = coding->source; src < src_end; src++)
6557         {
6558           c = *src;
6559           if (c & 0x80)
6560             {
6561               eight_bit_found = 1;
6562               if (null_byte_found)
6563                 break;
6564             }
6565           else if (c < 0x20)
6566             {
6567               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6568                   && ! inhibit_ied
6569                   && ! detect_info.checked)
6570                 {
6571                   if (detect_coding_iso_2022 (coding, &detect_info))
6572                     {
6573                       /* We have scanned the whole data.  */
6574                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6575                         {
6576                           /* We didn't find an 8-bit code.  We may
6577                              have found a null-byte, but it's very
6578                              rare that a binary file conforms to
6579                              ISO-2022.  */
6580                           src = src_end;
6581                           coding->head_ascii = src - coding->source;
6582                         }
6583                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6584                       break;
6585                     }
6586                 }
6587               else if (! c && !inhibit_nbd)
6588                 {
6589                   null_byte_found = 1;
6590                   if (eight_bit_found)
6591                     break;
6592                 }
6593               else if (! disable_ascii_optimization
6594                        && ! inhibit_eol_conversion)
6595                 {
6596                   if (c == '\r')
6597                     {
6598                       if (src < src_end && src[1] == '\n')
6599                         {
6600                           coding->eol_seen |= EOL_SEEN_CRLF;
6601                           src++;
6602                           if (! eight_bit_found)
6603                             coding->head_ascii++;
6604                         }
6605                       else
6606                         coding->eol_seen |= EOL_SEEN_CR;
6607                     }
6608                   else if (c == '\n')
6609                     {
6610                       coding->eol_seen |= EOL_SEEN_LF;
6611                     }
6612                 }
6613
6614               if (! eight_bit_found)
6615                 coding->head_ascii++;
6616             }
6617           else if (! eight_bit_found)
6618             coding->head_ascii++;
6619         }
6620
6621       if (null_byte_found || eight_bit_found
6622           || coding->head_ascii < coding->src_bytes
6623           || detect_info.found)
6624         {
6625           enum coding_category category;
6626           struct coding_system *this;
6627
6628           if (coding->head_ascii == coding->src_bytes)
6629             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6630             for (i = 0; i < coding_category_raw_text; i++)
6631               {
6632                 category = coding_priorities[i];
6633                 this = coding_categories + category;
6634                 if (detect_info.found & (1 << category))
6635                   break;
6636               }
6637           else
6638             {
6639               if (null_byte_found)
6640                 {
6641                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6642                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6643                 }
6644               else if (prefer_utf_8
6645                        && detect_coding_utf_8 (coding, &detect_info))
6646                 {
6647                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6648                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6649                 }
6650               for (i = 0; i < coding_category_raw_text; i++)
6651                 {
6652                   category = coding_priorities[i];
6653                   this = coding_categories + category;
6654                   /* Some of this->detector (e.g. detect_coding_sjis)
6655                      require this information.  */
6656                   coding->id = this->id;
6657                   if (this->id < 0)
6658                     {
6659                       /* No coding system of this category is defined.  */
6660                       detect_info.rejected |= (1 << category);
6661                     }
6662                   else if (category >= coding_category_raw_text)
6663                     continue;
6664                   else if (detect_info.checked & (1 << category))
6665                     {
6666                       if (detect_info.found & (1 << category))
6667                         break;
6668                     }
6669                   else if ((*(this->detector)) (coding, &detect_info)
6670                            && detect_info.found & (1 << category))
6671                     break;
6672                 }
6673             }
6674
6675           if (i < coding_category_raw_text)
6676             {
6677               if (category == coding_category_utf_8_auto)
6678                 {
6679                   Lisp_Object coding_systems;
6680
6681                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6682                                          coding_attr_utf_bom);
6683                   if (CONSP (coding_systems))
6684                     {
6685                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6686                         found = XCAR (coding_systems);
6687                       else
6688                         found = XCDR (coding_systems);
6689                     }
6690                   else
6691                     found = CODING_ID_NAME (this->id);
6692                 }
6693               else if (category == coding_category_utf_16_auto)
6694                 {
6695                   Lisp_Object coding_systems;
6696
6697                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6698                                          coding_attr_utf_bom);
6699                   if (CONSP (coding_systems))
6700                     {
6701                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6702                         found = XCAR (coding_systems);
6703                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6704                         found = XCDR (coding_systems);
6705                     }
6706                   else
6707                     found = CODING_ID_NAME (this->id);
6708                 }
6709               else
6710                 found = CODING_ID_NAME (this->id);
6711             }
6712           else if (null_byte_found)
6713             found = Qno_conversion;
6714           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6715                    == CATEGORY_MASK_ANY)
6716             found = Qraw_text;
6717           else if (detect_info.rejected)
6718             for (i = 0; i < coding_category_raw_text; i++)
6719               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6720                 {
6721                   this = coding_categories + coding_priorities[i];
6722                   found = CODING_ID_NAME (this->id);
6723                   break;
6724                 }
6725         }
6726     }
6727   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6728            == coding_category_utf_8_auto)
6729     {
6730       Lisp_Object coding_systems;
6731       struct coding_detection_info detect_info;
6732
6733       coding_systems
6734         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6735       detect_info.found = detect_info.rejected = 0;
6736       if (check_ascii (coding) == coding->src_bytes)
6737         {
6738           if (CONSP (coding_systems))
6739             found = XCDR (coding_systems);
6740         }
6741       else
6742         {
6743           if (CONSP (coding_systems)
6744               && detect_coding_utf_8 (coding, &detect_info))
6745             {
6746               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6747                 found = XCAR (coding_systems);
6748               else
6749                 found = XCDR (coding_systems);
6750             }
6751         }
6752     }
6753   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6754            == coding_category_utf_16_auto)
6755     {
6756       Lisp_Object coding_systems;
6757       struct coding_detection_info detect_info;
6758
6759       coding_systems
6760         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6761       detect_info.found = detect_info.rejected = 0;
6762       coding->head_ascii = 0;
6763       if (CONSP (coding_systems)
6764           && detect_coding_utf_16 (coding, &detect_info))
6765         {
6766           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6767             found = XCAR (coding_systems);
6768           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6769             found = XCDR (coding_systems);
6770         }
6771     }
6772
6773   if (! NILP (found))
6774     {
6775       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6776                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6777                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6778                            : EOL_SEEN_LF);
6779
6780       setup_coding_system (found, coding);
6781       if (specified_eol != EOL_SEEN_NONE)
6782         adjust_coding_eol_type (coding, specified_eol);
6783     }
6784
6785   coding->mode = saved_mode;
6786 }
6787
6788
6789 static void
6790 decode_eol (struct coding_system *coding)
6791 {
6792   Lisp_Object eol_type;
6793   unsigned char *p, *pbeg, *pend;
6794
6795   eol_type = CODING_ID_EOL_TYPE (coding->id);
6796   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6797     return;
6798
6799   if (NILP (coding->dst_object))
6800     pbeg = coding->destination;
6801   else
6802     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6803   pend = pbeg + coding->produced;
6804
6805   if (VECTORP (eol_type))
6806     {
6807       int eol_seen = EOL_SEEN_NONE;
6808
6809       for (p = pbeg; p < pend; p++)
6810         {
6811           if (*p == '\n')
6812             eol_seen |= EOL_SEEN_LF;
6813           else if (*p == '\r')
6814             {
6815               if (p + 1 < pend && *(p + 1) == '\n')
6816                 {
6817                   eol_seen |= EOL_SEEN_CRLF;
6818                   p++;
6819                 }
6820               else
6821                 eol_seen |= EOL_SEEN_CR;
6822             }
6823         }
6824       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6825       if ((eol_seen & EOL_SEEN_CRLF) != 0
6826           && (eol_seen & EOL_SEEN_CR) != 0
6827           && (eol_seen & EOL_SEEN_LF) == 0)
6828         eol_seen = EOL_SEEN_CRLF;
6829       else if (eol_seen != EOL_SEEN_NONE
6830           && eol_seen != EOL_SEEN_LF
6831           && eol_seen != EOL_SEEN_CRLF
6832           && eol_seen != EOL_SEEN_CR)
6833         eol_seen = EOL_SEEN_LF;
6834       if (eol_seen != EOL_SEEN_NONE)
6835         eol_type = adjust_coding_eol_type (coding, eol_seen);
6836     }
6837
6838   if (EQ (eol_type, Qmac))
6839     {
6840       for (p = pbeg; p < pend; p++)
6841         if (*p == '\r')
6842           *p = '\n';
6843     }
6844   else if (EQ (eol_type, Qdos))
6845     {
6846       ptrdiff_t n = 0;
6847
6848       if (NILP (coding->dst_object))
6849         {
6850           /* Start deleting '\r' from the tail to minimize the memory
6851              movement.  */
6852           for (p = pend - 2; p >= pbeg; p--)
6853             if (*p == '\r')
6854               {
6855                 memmove (p, p + 1, pend-- - p - 1);
6856                 n++;
6857               }
6858         }
6859       else
6860         {
6861           ptrdiff_t pos_byte = coding->dst_pos_byte;
6862           ptrdiff_t pos = coding->dst_pos;
6863           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6864
6865           while (pos < pos_end)
6866             {
6867               p = BYTE_POS_ADDR (pos_byte);
6868               if (*p == '\r' && p[1] == '\n')
6869                 {
6870                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6871                   n++;
6872                   pos_end--;
6873                 }
6874               pos++;
6875               if (coding->dst_multibyte)
6876                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6877               else
6878                 pos_byte++;
6879             }
6880         }
6881       coding->produced -= n;
6882       coding->produced_char -= n;
6883     }
6884 }
6885
6886
/* MAX_LOOKUP's maximum value.  MAX_LOOKUP is an int and so cannot
   exceed INT_MAX.  Also, MAX_LOOKUP is multiplied by sizeof (int) for
   alloca, so it cannot exceed MAX_ALLOCA / sizeof (int).
   get_translation_table clamps the lookup length it reports to this
   bound.  */
enum { MAX_LOOKUP_MAX = min (INT_MAX, MAX_ALLOCA / sizeof (int)) };
6891
6892 /* Return a translation table (or list of them) from coding system
6893    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6894    not ENCODEP). */
6895
6896 static Lisp_Object
6897 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6898 {
6899   Lisp_Object standard, translation_table;
6900   Lisp_Object val;
6901
6902   if (NILP (Venable_character_translation))
6903     {
6904       if (max_lookup)
6905         *max_lookup = 0;
6906       return Qnil;
6907     }
6908   if (encodep)
6909     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6910       standard = Vstandard_translation_table_for_encode;
6911   else
6912     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6913       standard = Vstandard_translation_table_for_decode;
6914   if (NILP (translation_table))
6915     translation_table = standard;
6916   else
6917     {
6918       if (SYMBOLP (translation_table))
6919         translation_table = Fget (translation_table, Qtranslation_table);
6920       else if (CONSP (translation_table))
6921         {
6922           translation_table = Fcopy_sequence (translation_table);
6923           for (val = translation_table; CONSP (val); val = XCDR (val))
6924             if (SYMBOLP (XCAR (val)))
6925               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6926         }
6927       if (CHAR_TABLE_P (standard))
6928         {
6929           if (CONSP (translation_table))
6930             translation_table = nconc2 (translation_table, list1 (standard));
6931           else
6932             translation_table = list2 (translation_table, standard);
6933         }
6934     }
6935
6936   if (max_lookup)
6937     {
6938       *max_lookup = 1;
6939       if (CHAR_TABLE_P (translation_table)
6940           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6941         {
6942           val = XCHAR_TABLE (translation_table)->extras[1];
6943           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6944             *max_lookup = min (XFASTINT (val), MAX_LOOKUP_MAX);
6945         }
6946       else if (CONSP (translation_table))
6947         {
6948           Lisp_Object tail;
6949
6950           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6951             if (CHAR_TABLE_P (XCAR (tail))
6952                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6953               {
6954                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6955                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6956                   *max_lookup = min (XFASTINT (tailval), MAX_LOOKUP_MAX);
6957               }
6958         }
6959     }
6960   return translation_table;
6961 }
6962
/* Look up character C in TABLE, which is a char-table or a list whose
   char-table elements are consulted in order.  Whenever a table maps
   C to a character, C is replaced by that character and TRANS is
   reset to Qnil (in a list, the following tables then see the new C).
   If a table yields a non-nil value that is not a character, TRANS is
   left holding that value; in a list the search stops there.  C and
   TRANS must be lvalues.  */
#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (table, c);                            \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            (c) = XFASTINT (trans);                                     \
            (trans) = Qnil;                                             \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        Lisp_Object lookup_rest_;                                       \
                                                                        \
        for (lookup_rest_ = (table); CONSP (lookup_rest_);              \
             lookup_rest_ = XCDR (lookup_rest_))                        \
          {                                                             \
            if (! CHAR_TABLE_P (XCAR (lookup_rest_)))                   \
              continue;                                                 \
            (trans) = CHAR_TABLE_REF (XCAR (lookup_rest_), c);          \
            if (CHARACTERP (trans))                                     \
              {                                                         \
                (c) = XFASTINT (trans);                                 \
                (trans) = Qnil;                                         \
              }                                                         \
            else if (! NILP (trans))                                    \
              break;                                                    \
          }                                                             \
      }                                                                 \
  } while (0)
6987
6988
6989 /* Return a translation of character(s) at BUF according to TRANS.
6990    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6991    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6992    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6993    translation is found, and Qnil if not found..
6994    If BUF is too short to lookup characters in FROM, return Qt.  */
6995
6996 static Lisp_Object
6997 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6998 {
6999
7000   if (INTEGERP (trans))
7001     return trans;
7002   for (; CONSP (trans); trans = XCDR (trans))
7003     {
7004       Lisp_Object val = XCAR (trans);
7005       Lisp_Object from = XCAR (val);
7006       ptrdiff_t len = ASIZE (from);
7007       ptrdiff_t i;
7008
7009       for (i = 0; i < len; i++)
7010         {
7011           if (buf + i == buf_end)
7012             return Qt;
7013           if (XINT (AREF (from, i)) != buf[i])
7014             break;
7015         }
7016       if (i == len)
7017         return val;
7018     }
7019   return Qnil;
7020 }
7021
7022
7023 static int
7024 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7025                bool last_block)
7026 {
7027   unsigned char *dst = coding->destination + coding->produced;
7028   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7029   ptrdiff_t produced;
7030   ptrdiff_t produced_chars = 0;
7031   int carryover = 0;
7032
7033   if (! coding->chars_at_source)
7034     {
7035       /* Source characters are in coding->charbuf.  */
7036       int *buf = coding->charbuf;
7037       int *buf_end = buf + coding->charbuf_used;
7038
7039       if (EQ (coding->src_object, coding->dst_object)
7040           && ! NILP (coding->dst_object))
7041         {
7042           eassert (growable_destination (coding));
7043           coding_set_source (coding);
7044           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7045         }
7046
7047       while (buf < buf_end)
7048         {
7049           int c = *buf;
7050           ptrdiff_t i;
7051
7052           if (c >= 0)
7053             {
7054               ptrdiff_t from_nchars = 1, to_nchars = 1;
7055               Lisp_Object trans = Qnil;
7056
7057               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7058               if (! NILP (trans))
7059                 {
7060                   trans = get_translation (trans, buf, buf_end);
7061                   if (INTEGERP (trans))
7062                     c = XINT (trans);
7063                   else if (CONSP (trans))
7064                     {
7065                       from_nchars = ASIZE (XCAR (trans));
7066                       trans = XCDR (trans);
7067                       if (INTEGERP (trans))
7068                         c = XINT (trans);
7069                       else
7070                         {
7071                           to_nchars = ASIZE (trans);
7072                           c = XINT (AREF (trans, 0));
7073                         }
7074                     }
7075                   else if (EQ (trans, Qt) && ! last_block)
7076                     break;
7077                 }
7078
7079               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7080                 {
7081                   eassert (growable_destination (coding));
7082                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7083                        / MAX_MULTIBYTE_LENGTH)
7084                       < to_nchars)
7085                     memory_full (SIZE_MAX);
7086                   dst = alloc_destination (coding,
7087                                            buf_end - buf
7088                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7089                                            dst);
7090                   if (EQ (coding->src_object, coding->dst_object))
7091                     {
7092                       coding_set_source (coding);
7093                       dst_end = (((unsigned char *) coding->source)
7094                                  + coding->consumed);
7095                     }
7096                   else
7097                     dst_end = coding->destination + coding->dst_bytes;
7098                 }
7099
7100               for (i = 0; i < to_nchars; i++)
7101                 {
7102                   if (i > 0)
7103                     c = XINT (AREF (trans, i));
7104                   if (coding->dst_multibyte
7105                       || ! CHAR_BYTE8_P (c))
7106                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7107                   else
7108                     *dst++ = CHAR_TO_BYTE8 (c);
7109                 }
7110               produced_chars += to_nchars;
7111               buf += from_nchars;
7112             }
7113           else
7114             /* This is an annotation datum.  (-C) is the length.  */
7115             buf += -c;
7116         }
7117       carryover = buf_end - buf;
7118     }
7119   else
7120     {
7121       /* Source characters are at coding->source.  */
7122       const unsigned char *src = coding->source;
7123       const unsigned char *src_end = src + coding->consumed;
7124
7125       if (EQ (coding->dst_object, coding->src_object))
7126         {
7127           eassert (growable_destination (coding));
7128           dst_end = (unsigned char *) src;
7129         }
7130       if (coding->src_multibyte != coding->dst_multibyte)
7131         {
7132           if (coding->src_multibyte)
7133             {
7134               bool multibytep = 1;
7135               ptrdiff_t consumed_chars = 0;
7136
7137               while (1)
7138                 {
7139                   const unsigned char *src_base = src;
7140                   int c;
7141
7142                   ONE_MORE_BYTE (c);
7143                   if (dst == dst_end)
7144                     {
7145                       eassert (growable_destination (coding));
7146                       if (EQ (coding->src_object, coding->dst_object))
7147                         dst_end = (unsigned char *) src;
7148                       if (dst == dst_end)
7149                         {
7150                           ptrdiff_t offset = src - coding->source;
7151
7152                           dst = alloc_destination (coding, src_end - src + 1,
7153                                                    dst);
7154                           dst_end = coding->destination + coding->dst_bytes;
7155                           coding_set_source (coding);
7156                           src = coding->source + offset;
7157                           src_end = coding->source + coding->consumed;
7158                           if (EQ (coding->src_object, coding->dst_object))
7159                             dst_end = (unsigned char *) src;
7160                         }
7161                     }
7162                   *dst++ = c;
7163                   produced_chars++;
7164                 }
7165             no_more_source:
7166               ;
7167             }
7168           else
7169             while (src < src_end)
7170               {
7171                 bool multibytep = 1;
7172                 int c = *src++;
7173
7174                 if (dst >= dst_end - 1)
7175                   {
7176                     eassert (growable_destination (coding));
7177                     if (EQ (coding->src_object, coding->dst_object))
7178                       dst_end = (unsigned char *) src;
7179                     if (dst >= dst_end - 1)
7180                       {
7181                         ptrdiff_t offset = src - coding->source;
7182                         ptrdiff_t more_bytes;
7183
7184                         if (EQ (coding->src_object, coding->dst_object))
7185                           more_bytes = ((src_end - src) / 2) + 2;
7186                         else
7187                           more_bytes = src_end - src + 2;
7188                         dst = alloc_destination (coding, more_bytes, dst);
7189                         dst_end = coding->destination + coding->dst_bytes;
7190                         coding_set_source (coding);
7191                         src = coding->source + offset;
7192                         src_end = coding->source + coding->consumed;
7193                         if (EQ (coding->src_object, coding->dst_object))
7194                           dst_end = (unsigned char *) src;
7195                       }
7196                   }
7197                 EMIT_ONE_BYTE (c);
7198               }
7199         }
7200       else
7201         {
7202           if (!EQ (coding->src_object, coding->dst_object))
7203             {
7204               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7205
7206               if (require > 0)
7207                 {
7208                   ptrdiff_t offset = src - coding->source;
7209
7210                   dst = alloc_destination (coding, require, dst);
7211                   coding_set_source (coding);
7212                   src = coding->source + offset;
7213                   src_end = coding->source + coding->consumed;
7214                 }
7215             }
7216           produced_chars = coding->consumed_char;
7217           while (src < src_end)
7218             *dst++ = *src++;
7219         }
7220     }
7221
7222   produced = dst - (coding->destination + coding->produced);
7223   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7224     insert_from_gap (produced_chars, produced, 0);
7225   coding->produced += produced;
7226   coding->produced_char += produced_chars;
7227   return carryover;
7228 }
7229
7230 /* Compose text in CODING->object according to the annotation data at
7231    CHARBUF.  CHARBUF is an array:
7232      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7233  */
7234
7235 static void
7236 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7237 {
7238   int len;
7239   ptrdiff_t to;
7240   enum composition_method method;
7241   Lisp_Object components;
7242
7243   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7244   to = pos + charbuf[2];
7245   method = (enum composition_method) (charbuf[4]);
7246
7247   if (method == COMPOSITION_RELATIVE)
7248     components = Qnil;
7249   else
7250     {
7251       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7252       int i, j;
7253
7254       if (method == COMPOSITION_WITH_RULE)
7255         len = charbuf[2] * 3 - 2;
7256       charbuf += MAX_ANNOTATION_LENGTH;
7257       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7258       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7259         {
7260           if (charbuf[i] >= 0)
7261             args[j] = make_number (charbuf[i]);
7262           else
7263             {
7264               i++;
7265               args[j] = make_number (charbuf[i] % 0x100);
7266             }
7267         }
7268       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7269     }
7270   compose_text (pos, to, components, Qnil, coding->dst_object);
7271 }
7272
7273
7274 /* Put `charset' property on text in CODING->object according to
7275    the annotation data at CHARBUF.  CHARBUF is an array:
7276      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7277  */
7278
7279 static void
7280 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7281 {
7282   ptrdiff_t from = pos - charbuf[2];
7283   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7284
7285   Fput_text_property (make_number (from), make_number (pos),
7286                       Qcharset, CHARSET_NAME (charset),
7287                       coding->dst_object);
7288 }
7289
/* Upper bound on the number of units that ALLOC_CONVERSION_WORK_AREA
   allocates for a charbuf.  */
#define MAX_CHARBUF_SIZE 0x4000
/* How many extra units, at most, decoding functions expect to find in
   coding->charbuf.  Currently, decode_coding_emacs_mule expects the
   following size, and that is the largest value.  */
#define MAX_CHARBUF_EXTRA_SIZE ((MAX_ANNOTATION_LENGTH * 3) + 1)

/* Allocate the work area of CODING with SAFE_ALLOCA: set
   CODING->charbuf to room for SIZE + MAX_CHARBUF_EXTRA_SIZE ints,
   capped at MAX_CHARBUF_SIZE ints, and record that unit count in
   CODING->charbuf_size.

   CODING is parenthesized in the expansion so that any pointer
   expression (not just a plain identifier) may be passed; note that
   it is evaluated twice.  The users of this macro in this file
   declare USE_SAFE_ALLOCA beforehand and call SAFE_FREE when done.  */
#define ALLOC_CONVERSION_WORK_AREA(coding, size)                \
  do {                                                          \
    ptrdiff_t units = min ((size) + MAX_CHARBUF_EXTRA_SIZE,     \
                           MAX_CHARBUF_SIZE);                   \
    (coding)->charbuf = SAFE_ALLOCA (units * sizeof (int));     \
    (coding)->charbuf_size = units;                             \
  } while (0)
7303
7304 static void
7305 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7306 {
7307   int *charbuf = coding->charbuf;
7308   int *charbuf_end = charbuf + coding->charbuf_used;
7309
7310   if (NILP (coding->dst_object))
7311     return;
7312
7313   while (charbuf < charbuf_end)
7314     {
7315       if (*charbuf >= 0)
7316         pos++, charbuf++;
7317       else
7318         {
7319           int len = -*charbuf;
7320
7321           if (len > 2)
7322             switch (charbuf[1])
7323               {
7324               case CODING_ANNOTATE_COMPOSITION_MASK:
7325                 produce_composition (coding, charbuf, pos);
7326                 break;
7327               case CODING_ANNOTATE_CHARSET_MASK:
7328                 produce_charset (coding, charbuf, pos);
7329                 break;
7330               }
7331           charbuf += len;
7332         }
7333     }
7334 }
7335
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   Source bytes that the decoder leaves unconsumed are either flushed
   out as binary characters (when CODING->mode has
   CODING_MODE_LAST_BLOCK) or saved in CODING->carryover; in both
   cases CODING->consumed ends up equal to CODING->src_bytes.
*/

static void
decode_coding (struct coding_system *coding)
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  struct ccl_spec cclspec;
  int carryover;
  int i;

  USE_SAFE_ALLOCA;

  /* If the source text of a buffer starts before GPT and extends
     past it, move the gap to the start of the source text.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  /* Qt here is only a placeholder; the saved value is used solely in
     the BUFFERP (coding->dst_object) case at the end.  */
  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);

      /* We must disable undo_list in order to record the whole insert
         transaction via record_insert at the end.  But doing so also
         disables the recording of the first change to the undo_list.
         Therefore we check for first change here and record it via
         record_first_change if needed.  */
      if (MODIFF <= SAVE_MODIFF)
        record_first_change ();

      undo_list = BVAR (current_buffer, undo_list);
      bset_undo_list (current_buffer, Qt);
    }

  /* Reset the conversion counters and the result code.  */
  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);

  ALLOC_CONVERSION_WORK_AREA (coding, coding->src_bytes);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  /* The CCL decoder keeps its state in CCLSPEC, which lives on this
     function's stack.  */
  if (coding->decoder == decode_coding_ccl)
    {
      coding->spec.ccl = &cclspec;
      setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
    }
  /* Each iteration runs the decoder to fill coding->charbuf and then
     produce_chars to emit what was decoded.  The loop repeats while
     the destination was insufficient, or while source bytes remain
     and the result is still SUCCESS or INVALID_SRC.  */
  do
    {
      /* Destination position that corresponds to the start of the
         characters produced by this iteration; used for annotations.  */
      ptrdiff_t pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      /* The first CARRYOVER units of charbuf are left over from the
         previous iteration.  */
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      /* Move the CARRYOVER units at the end of the used part of
         charbuf to its beginning for the next iteration.  */
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->result == CODING_RESULT_INSUFFICIENT_DST
         || (coding->consumed < coding->src_bytes
             && (coding->result == CODING_RESULT_SUCCESS
                 || coding->result == CODING_RESULT_INVALID_SRC)));

  /* Units still carried over after the loop are produced with the
     last argument of produce_chars set to 1.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  coding->carryover_bytes = 0;
  /* Handle source bytes that the decoder did not consume.  */
  if (coding->consumed < coding->src_bytes)
    {
      ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              /* Bytes with the high bit set are converted with
                 BYTE8_TO_CHAR; the others are stored as is.  */
              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          /* No translation table is applied to the flushed bytes.  */
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          /* Nevertheless, clamp to the size of the array so that the
             copy below cannot overrun it.  */
          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      coding->consumed = coding->src_bytes;
    }

  /* Convert end-of-line format unless the coding system's eol type is
     `unix' or eol conversion is inhibited.  */
  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  if (BUFFERP (coding->dst_object))
    {
      /* Restore the undo list saved above and record the whole
         insertion as one transaction.  */
      bset_undo_list (current_buffer, undo_list);
      record_insert (coding->dst_pos, coding->produced_char);
    }

  SAFE_FREE ();
}
7495
7496
/* Extract an annotation datum from a composition starting at POS and
   ending before LIMIT of CODING->src_object (buffer or string), store
   the data in BUF, set *STOP to a starting position of the next
   composition (if any) or to LIMIT, and return the address of the
   next element of BUF.

   If such an annotation is not found, set *STOP to a starting
   position of a composition after POS (if any) or to LIMIT, and
   return BUF.  */

static int *
handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
                               struct coding_system *coding, int *buf,
                               ptrdiff_t *stop)
{
  ptrdiff_t start, end;
  Lisp_Object prop;

  /* No composition at or after POS, or one that ends beyond LIMIT:
     there is nothing to stop at before LIMIT.  */
  if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
      || end > limit)
    *stop = limit;
  /* The nearest composition starts after POS: stop there.  */
  else if (start > pos)
    *stop = start;
  else
    {
      if (start == pos)
        {
          /* We found a composition.  Store the corresponding
             annotation data in BUF.  */
          /* HEAD remembers where the datum begins so that its first
             element can be adjusted below (presumably that element is
             the negated datum length written by ADD_COMPOSITION_DATA
             -- confirm against the macro).  */
          int *head = buf;
          enum composition_method method = composition_method (prop);
          int nchars = COMPOSITION_LENGTH (prop);

          ADD_COMPOSITION_DATA (buf, nchars, 0, method);
          if (method != COMPOSITION_RELATIVE)
            {
              Lisp_Object components;
              ptrdiff_t i, len, i_byte;

              /* Append the components to BUF, whatever form they are
                 stored in: a vector, a string, a single integer, or a
                 list.  LEN counts the appended elements.  */
              components = COMPOSITION_COMPONENTS (prop);
              if (VECTORP (components))
                {
                  len = ASIZE (components);
                  for (i = 0; i < len; i++)
                    *buf++ = XINT (AREF (components, i));
                }
              else if (STRINGP (components))
                {
                  len = SCHARS (components);
                  i = i_byte = 0;
                  while (i < len)
                    {
                      FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
                      buf++;
                    }
                }
              else if (INTEGERP (components))
                {
                  len = 1;
                  *buf++ = XINT (components);
                }
              else if (CONSP (components))
                {
                  for (len = 0; CONSP (components);
                       len++, components = XCDR (components))
                    *buf++ = XINT (XCAR (components));
                }
              else
                emacs_abort ();
              /* Account for the LEN appended elements in the head
                 element of the datum.  */
              *head -= len;
            }
        }

      /* Look for the next composition, searching from the end of the
         one found above, to decide where to stop next.  */
      if (find_composition (end, limit, &start, &end, &prop,
                            coding->src_object)
          && end <= limit)
        *stop = start;
      else
        *stop = limit;
    }
  return buf;
}
7579
7580
7581 /* Extract an annotation datum from a text property `charset' at POS of
7582    CODING->src_object (buffer of string), store the data in BUF, set
7583    *STOP to the position where the value of `charset' property changes
7584    (limiting by LIMIT), and return the address of the next element of
7585    BUF.
7586
7587    If the property value is nil, set *STOP to the position where the
7588    property value is non-nil (limiting by LIMIT), and return BUF.  */
7589
7590 static int *
7591 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7592                            struct coding_system *coding, int *buf,
7593                            ptrdiff_t *stop)
7594 {
7595   Lisp_Object val, next;
7596   int id;
7597
7598   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7599   if (! NILP (val) && CHARSETP (val))
7600     id = XINT (CHARSET_SYMBOL_ID (val));
7601   else
7602     id = -1;
7603   ADD_CHARSET_DATA (buf, 0, id);
7604   next = Fnext_single_property_change (make_number (pos), Qcharset,
7605                                        coding->src_object,
7606                                        make_number (limit));
7607   *stop = XINT (next);
7608   return buf;
7609 }
7610
7611
/* Fill CODING->charbuf with characters read from CODING->source,
   starting at byte offset CODING->consumed, together with any
   annotation data for them.  End-of-line conversion and
   TRANSLATION_TABLE (if non-nil) are applied on the way.  MAX_LOOKUP
   is the number of characters looked at when searching
   TRANSLATION_TABLE for a multi-character translation.

   On return, CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used reflect what was read and stored, and
   CODING->chars_at_source is 0.  */

static void
consume_chars (struct coding_system *coding, Lisp_Object translation_table,
               int max_lookup)
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  ptrdiff_t pos = coding->src_pos + coding->consumed_char;
  ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
  bool multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  ptrdiff_t stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  /* The lookahead buffer is needed only when translating.  */
  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* A vector eol type is treated like `unix' here, as is the case
     where eol conversion is inhibited.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  */
  /* Clearing the flag makes the composition test below always
     false.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  /* STOP is the next position at which annotations must be checked;
     with no source object there is nothing to check before END_POS.  */
  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          /* Emit whatever annotation data start here and compute the
             next position to stop at.  */
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character into C, advancing SRC and POS.  */
      if (! multibytep)
        {
          int bytes;

          /* For the raw-text and CCL encoders each source byte is
             taken as is.  Otherwise a valid multibyte sequence is
             read as one character (POS advances by its byte length),
             and any other byte is converted with BYTE8_TO_CHAR.  */
          if (coding->encoder == encode_coding_raw_text
              || coding->encoder == encode_coding_ccl)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      /* With selective display, a carriage return is handled as a
         newline.  */
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              /* `dos': emit CR before the LF.  Any other eol type:
                 replace the LF by CR.  */
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* Number of source characters the translation replaces, and
             number of characters it produces.  */
          ptrdiff_t from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters,
             without consuming them, for the translation lookup.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              /* The car is the vector of source characters matched;
                 the cdr is the replacement, an integer or a vector.  */
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* Leave the loop if the replacement does not fit in
                     what remains of the buffer.  */
                  if (buf_end - buf < to_nchars)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining source characters that the translation
             replaced.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7746
7747
7748 /* Encode the text at CODING->src_object into CODING->dst_object.
7749    CODING->src_object is a buffer or a string.
7750    CODING->dst_object is a buffer or nil.
7751
7752    If CODING->src_object is a buffer, it must be the current buffer.
7753    In this case, if CODING->src_pos is positive, it is a position of
7754    the source text in the buffer, otherwise. the source text is in the
7755    gap area of the buffer, and coding->src_pos specifies the offset of
7756    the text from GPT (which must be the same as PT).  If this is the
7757    same buffer as CODING->dst_object, CODING->src_pos must be
7758    negative and CODING should not have `pre-write-conversion'.
7759
7760    If CODING->src_object is a string, CODING should not have
7761    `pre-write-conversion'.
7762
7763    If CODING->dst_object is a buffer, the encoded data is inserted at
7764    the current point of that buffer.
7765
7766    If CODING->dst_object is nil, the encoded data is placed at the
7767    memory area specified by CODING->destination.  */
7768
7769 static void
7770 encode_coding (struct coding_system *coding)
7771 {
7772   Lisp_Object attrs;
7773   Lisp_Object translation_table;
7774   int max_lookup;
7775   struct ccl_spec cclspec;
7776
7777   USE_SAFE_ALLOCA;
7778
7779   attrs = CODING_ID_ATTRS (coding->id);
7780   if (coding->encoder == encode_coding_raw_text)
7781     translation_table = Qnil, max_lookup = 0;
7782   else
7783     translation_table = get_translation_table (attrs, 1, &max_lookup);
7784
7785   if (BUFFERP (coding->dst_object))
7786     {
7787       set_buffer_internal (XBUFFER (coding->dst_object));
7788       coding->dst_multibyte
7789         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7790     }
7791
7792   coding->consumed = coding->consumed_char = 0;
7793   coding->produced = coding->produced_char = 0;
7794   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7795
7796   ALLOC_CONVERSION_WORK_AREA (coding, coding->src_chars);
7797
7798   if (coding->encoder == encode_coding_ccl)
7799     {
7800       coding->spec.ccl = &cclspec;
7801       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7802     }
7803   do {
7804     coding_set_source (coding);
7805     consume_chars (coding, translation_table, max_lookup);
7806     coding_set_destination (coding);
7807     (*(coding->encoder)) (coding);
7808   } while (coding->consumed_char < coding->src_chars);
7809
7810   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7811     insert_from_gap (coding->produced_char, coding->produced, 0);
7812
7813   SAFE_FREE ();
7814 }
7815
7816
/* Name (or base name) of work buffer for code conversion.  The
   reused work buffer is created under exactly this name; additional
   work buffers get names generated from it.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed (it is re-created if found dead when
   next needed).  It has the name Vcode_conversion_workbuf_name.  The
   other working buffers are destroyed after the use is finished, and
   their names are modified versions of
   Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* True iff Vcode_conversion_reused_workbuf is already in use.  Set by
   make_conversion_work_buffer and cleared by
   code_conversion_restore.  */
static bool reused_workbuf_in_use;
7829
7830
7831 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7832    multibyteness of returning buffer.  */
7833
7834 static Lisp_Object
7835 make_conversion_work_buffer (bool multibyte)
7836 {
7837   Lisp_Object name, workbuf;
7838   struct buffer *current;
7839
7840   if (reused_workbuf_in_use)
7841     {
7842       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7843       workbuf = Fget_buffer_create (name);
7844     }
7845   else
7846     {
7847       reused_workbuf_in_use = 1;
7848       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7849         Vcode_conversion_reused_workbuf
7850           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7851       workbuf = Vcode_conversion_reused_workbuf;
7852     }
7853   current = current_buffer;
7854   set_buffer_internal (XBUFFER (workbuf));
7855   /* We can't allow modification hooks to run in the work buffer.  For
7856      instance, directory_files_internal assumes that file decoding
7857      doesn't compile new regexps.  */
7858   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7859   Ferase_buffer ();
7860   bset_undo_list (current_buffer, Qt);
7861   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7862   set_buffer_internal (current);
7863   return workbuf;
7864 }
7865
7866
7867 static void
7868 code_conversion_restore (Lisp_Object arg)
7869 {
7870   Lisp_Object current, workbuf;
7871   struct gcpro gcpro1;
7872
7873   GCPRO1 (arg);
7874   current = XCAR (arg);
7875   workbuf = XCDR (arg);
7876   if (! NILP (workbuf))
7877     {
7878       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7879         reused_workbuf_in_use = 0;
7880       else
7881         Fkill_buffer (workbuf);
7882     }
7883   set_buffer_internal (XBUFFER (current));
7884   UNGCPRO;
7885 }
7886
7887 Lisp_Object
7888 code_conversion_save (bool with_work_buf, bool multibyte)
7889 {
7890   Lisp_Object workbuf = Qnil;
7891
7892   if (with_work_buf)
7893     workbuf = make_conversion_work_buffer (multibyte);
7894   record_unwind_protect (code_conversion_restore,
7895                          Fcons (Fcurrent_buffer (), workbuf));
7896   return workbuf;
7897 }
7898
/* Decode, with CODING, text of CHARS characters and BYTES bytes that
   belongs to the current buffer, and insert the result at point.  The
   current buffer is both the source and the destination, and the
   source position is set to -CHARS, i.e. negative.  The fast path
   below addresses the source bytes as the BYTES bytes that end at
   GAP_END_ADDR.

   On return, CODING->produced and CODING->produced_char describe the
   resulting text.  */

void
decode_coding_gap (struct coding_system *coding,
                   ptrdiff_t chars, ptrdiff_t bytes)
{
  ptrdiff_t count = SPECPDL_INDEX ();
  Lisp_Object attrs;

  coding->src_object = Fcurrent_buffer ();
  coding->src_chars = chars;
  coding->src_bytes = bytes;
  coding->src_pos = -chars;
  coding->src_pos_byte = -bytes;
  /* More bytes than characters means the source is multibyte.  */
  coding->src_multibyte = chars < bytes;
  coding->dst_object = coding->src_object;
  coding->dst_pos = PT;
  coding->dst_pos_byte = PT_BYTE;
  coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));

  /* -1 and EOL_SEEN_NONE mean "not yet determined"; detect_coding may
     fill these in.  */
  coding->head_ascii = -1;
  coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
  coding->eol_seen = EOL_SEEN_NONE;
  if (CODING_REQUIRE_DETECTION (coding))
    detect_coding (coding);
  attrs = CODING_ID_ATTRS (coding->id);
  /* Fast path: for unibyte source in an ASCII-compatible coding
     system with no post-read function and no translation table, text
     that is all ASCII (or valid UTF-8 for a utf-8 coding system) is
     inserted without running the decoder; only end-of-line conversion
     is done, in place.  */
  if (! disable_ascii_optimization
      && ! coding->src_multibyte
      && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
      && NILP (CODING_ATTR_POST_READ (attrs))
      && NILP (get_translation_table (attrs, 0, NULL)))
    {
      /* From here on CHARS is the number of characters the text
         decodes to, or -1 if the fast path does not apply.  */
      chars = coding->head_ascii;
      if (chars < 0)
        chars = check_ascii (coding);
      if (chars != bytes)
        {
          /* There exists a non-ASCII byte.  */
          if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
              && coding->detected_utf8_bytes == coding->src_bytes)
            {
              if (coding->detected_utf8_chars >= 0)
                chars = coding->detected_utf8_chars;
              else
                chars = check_utf_8 (coding);
              /* Unless the coding system is the variant without BOM,
                 a UTF-8 BOM at the very start is excluded: one
                 character and three bytes fewer.  */
              if (CODING_UTF_8_BOM (coding) != utf_without_bom
                  && coding->head_ascii == 0
                  && coding->source[0] == UTF_8_BOM_1
                  && coding->source[1] == UTF_8_BOM_2
                  && coding->source[2] == UTF_8_BOM_3)
                {
                  chars--;
                  bytes -= 3;
                  coding->src_bytes -= 3;
                }
            }
          else
            chars = -1;
        }
      if (chars >= 0)
        {
          Lisp_Object eol_type;

          eol_type = CODING_ID_EOL_TYPE (coding->id);
          /* A vector eol type is resolved from the line ends seen,
             if any were seen.  */
          if (VECTORP (eol_type))
            {
              if (coding->eol_seen != EOL_SEEN_NONE)
                eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
            }
          if (EQ (eol_type, Qmac))
            {
              /* Replace every CR by LF in place.  */
              unsigned char *src_end = GAP_END_ADDR;
              unsigned char *src = src_end - coding->src_bytes;

              while (src < src_end)
                {
                  if (*src++ == '\r')
                    src[-1] = '\n';
                }
            }
          else if (EQ (eol_type, Qdos))
            {
              /* Copy the text backward from its end, onto itself,
                 dropping the CR of each CR LF pair.  The result ends
                 at GAP_END_ADDR and is DIFF bytes shorter.  */
              unsigned char *src = GAP_END_ADDR;
              unsigned char *src_beg = src - coding->src_bytes;
              unsigned char *dst = src;
              ptrdiff_t diff;

              while (src_beg < src)
                {
                  *--dst = *--src;
                  if (*src == '\n' && src > src_beg && src[-1] == '\r')
                    src--;
                }
              /* One byte, and one character, per dropped CR.  */
              diff = dst - src;
              bytes -= diff;
              chars -= diff;
            }
          coding->produced = bytes;
          coding->produced_char = chars;
          insert_from_gap (chars, bytes, 1);
          /* Nothing has been pushed on the specpdl since COUNT was
             taken, so there is nothing to unbind.  */
          return;
        }
    }
  /* Slow path: run the real decoder.  This registers an unwind entry
     that restores the current buffer; no work buffer is requested.  */
  code_conversion_save (0, 0);

  /* The whole text is given at once, so it is the last block.  */
  coding->mode |= CODING_MODE_LAST_BLOCK;
  current_buffer->text->inhibit_shrinking = 1;
  decode_coding (coding);
  current_buffer->text->inhibit_shrinking = 0;

  if (! NILP (CODING_ATTR_POST_READ (attrs)))
    {
      ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
      Lisp_Object val;

      /* Call the post-read function with point at the start of the
         decoded text and the number of decoded characters as its
         argument.  Its return value is only checked to be a natural
         number; the change in buffer size is what is added to the
         produced counts.  */
      TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
      val = call1 (CODING_ATTR_POST_READ (attrs),
                   make_number (coding->produced_char));
      CHECK_NATNUM (val);
      coding->produced_char += Z - prev_Z;
      coding->produced += Z_BYTE - prev_Z_BYTE;
    }

  unbind_to (count, Qnil);
}
8022
8023
8024 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8025    SRC_OBJECT into DST_OBJECT by coding context CODING.
8026
8027    SRC_OBJECT is a buffer, a string, or Qnil.
8028
8029    If it is a buffer, the text is at point of the buffer.  FROM and TO
8030    are positions in the buffer.
8031
8032    If it is a string, the text is at the beginning of the string.
8033    FROM and TO are indices to the string.
8034
8035    If it is nil, the text is at coding->source.  FROM and TO are
8036    indices to coding->source.
8037
8038    DST_OBJECT is a buffer, Qt, or Qnil.
8039
8040    If it is a buffer, the decoded text is inserted at point of the
8041    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8042    is deleted.
8043
8044    If it is Qt, a string is made from the decoded text, and
8045    set in CODING->dst_object.
8046
8047    If it is Qnil, the decoded text is stored at CODING->destination.
8048    The caller must allocate CODING->dst_bytes bytes at
8049    CODING->destination by xmalloc.  If the decoded text is longer than
8050    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8051  */
8052
8053 void
8054 decode_coding_object (struct coding_system *coding,
8055                       Lisp_Object src_object,
8056                       ptrdiff_t from, ptrdiff_t from_byte,
8057                       ptrdiff_t to, ptrdiff_t to_byte,
8058                       Lisp_Object dst_object)
8059 {
8060   ptrdiff_t count = SPECPDL_INDEX ();
8061   unsigned char *destination IF_LINT (= NULL);
8062   ptrdiff_t dst_bytes IF_LINT (= 0);
8063   ptrdiff_t chars = to - from;
8064   ptrdiff_t bytes = to_byte - from_byte;
8065   Lisp_Object attrs;
8066   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8067   bool need_marker_adjustment = 0;
8068   Lisp_Object old_deactivate_mark;
8069
8070   old_deactivate_mark = Vdeactivate_mark;
8071
8072   if (NILP (dst_object))
8073     {
8074       destination = coding->destination;
8075       dst_bytes = coding->dst_bytes;
8076     }
8077
8078   coding->src_object = src_object;
8079   coding->src_chars = chars;
8080   coding->src_bytes = bytes;
8081   coding->src_multibyte = chars < bytes;
8082
8083   if (STRINGP (src_object))
8084     {
8085       coding->src_pos = from;
8086       coding->src_pos_byte = from_byte;
8087     }
8088   else if (BUFFERP (src_object))
8089     {
8090       set_buffer_internal (XBUFFER (src_object));
8091       if (from != GPT)
8092         move_gap_both (from, from_byte);
8093       if (EQ (src_object, dst_object))
8094         {
8095           struct Lisp_Marker *tail;
8096
8097           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8098             {
8099               tail->need_adjustment
8100                 = tail->charpos == (tail->insertion_type ? from : to);
8101               need_marker_adjustment |= tail->need_adjustment;
8102             }
8103           saved_pt = PT, saved_pt_byte = PT_BYTE;
8104           TEMP_SET_PT_BOTH (from, from_byte);
8105           current_buffer->text->inhibit_shrinking = 1;
8106           del_range_both (from, from_byte, to, to_byte, 1);
8107           coding->src_pos = -chars;
8108           coding->src_pos_byte = -bytes;
8109         }
8110       else
8111         {
8112           coding->src_pos = from;
8113           coding->src_pos_byte = from_byte;
8114         }
8115     }
8116
8117   if (CODING_REQUIRE_DETECTION (coding))
8118     detect_coding (coding);
8119   attrs = CODING_ID_ATTRS (coding->id);
8120
8121   if (EQ (dst_object, Qt)
8122       || (! NILP (CODING_ATTR_POST_READ (attrs))
8123           && NILP (dst_object)))
8124     {
8125       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8126       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8127       coding->dst_pos = BEG;
8128       coding->dst_pos_byte = BEG_BYTE;
8129     }
8130   else if (BUFFERP (dst_object))
8131     {
8132       code_conversion_save (0, 0);
8133       coding->dst_object = dst_object;
8134       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8135       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8136       coding->dst_multibyte
8137         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8138     }
8139   else
8140     {
8141       code_conversion_save (0, 0);
8142       coding->dst_object = Qnil;
8143       /* Most callers presume this will return a multibyte result, and they
8144          won't use `binary' or `raw-text' anyway, so let's not worry about
8145          CODING_FOR_UNIBYTE.  */
8146       coding->dst_multibyte = 1;
8147     }
8148
8149   decode_coding (coding);
8150
8151   if (BUFFERP (coding->dst_object))
8152     set_buffer_internal (XBUFFER (coding->dst_object));
8153
8154   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8155     {
8156       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8157       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8158       Lisp_Object val;
8159
8160       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8161       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8162               old_deactivate_mark);
8163       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8164                         make_number (coding->produced_char));
8165       UNGCPRO;
8166       CHECK_NATNUM (val);
8167       coding->produced_char += Z - prev_Z;
8168       coding->produced += Z_BYTE - prev_Z_BYTE;
8169     }
8170
8171   if (EQ (dst_object, Qt))
8172     {
8173       coding->dst_object = Fbuffer_string ();
8174     }
8175   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8176     {
8177       set_buffer_internal (XBUFFER (coding->dst_object));
8178       if (dst_bytes < coding->produced)
8179         {
8180           eassert (coding->produced > 0);
8181           destination = xrealloc (destination, coding->produced);
8182           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8183             move_gap_both (BEGV, BEGV_BYTE);
8184           memcpy (destination, BEGV_ADDR, coding->produced);
8185           coding->destination = destination;
8186         }
8187     }
8188
8189   if (saved_pt >= 0)
8190     {
8191       /* This is the case of:
8192          (BUFFERP (src_object) && EQ (src_object, dst_object))
8193          As we have moved PT while replacing the original buffer
8194          contents, we must recover it now.  */
8195       set_buffer_internal (XBUFFER (src_object));
8196       current_buffer->text->inhibit_shrinking = 0;
8197       if (saved_pt < from)
8198         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8199       else if (saved_pt < from + chars)
8200         TEMP_SET_PT_BOTH (from, from_byte);
8201       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8202         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8203                           saved_pt_byte + (coding->produced - bytes));
8204       else
8205         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8206                           saved_pt_byte + (coding->produced - bytes));
8207
8208       if (need_marker_adjustment)
8209         {
8210           struct Lisp_Marker *tail;
8211
8212           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8213             if (tail->need_adjustment)
8214               {
8215                 tail->need_adjustment = 0;
8216                 if (tail->insertion_type)
8217                   {
8218                     tail->bytepos = from_byte;
8219                     tail->charpos = from;
8220                   }
8221                 else
8222                   {
8223                     tail->bytepos = from_byte + coding->produced;
8224                     tail->charpos
8225                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8226                          ? tail->bytepos : from + coding->produced_char);
8227                   }
8228               }
8229         }
8230     }
8231
8232   Vdeactivate_mark = old_deactivate_mark;
8233   unbind_to (count, coding->dst_object);
8234 }
8235
8236
8237 void
8238 encode_coding_object (struct coding_system *coding,
8239                       Lisp_Object src_object,
8240                       ptrdiff_t from, ptrdiff_t from_byte,
8241                       ptrdiff_t to, ptrdiff_t to_byte,
8242                       Lisp_Object dst_object)
8243 {
8244   ptrdiff_t count = SPECPDL_INDEX ();
8245   ptrdiff_t chars = to - from;
8246   ptrdiff_t bytes = to_byte - from_byte;
8247   Lisp_Object attrs;
8248   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8249   bool need_marker_adjustment = 0;
8250   bool kill_src_buffer = 0;
8251   Lisp_Object old_deactivate_mark;
8252
8253   old_deactivate_mark = Vdeactivate_mark;
8254
8255   coding->src_object = src_object;
8256   coding->src_chars = chars;
8257   coding->src_bytes = bytes;
8258   coding->src_multibyte = chars < bytes;
8259
8260   attrs = CODING_ID_ATTRS (coding->id);
8261
8262   if (EQ (src_object, dst_object))
8263     {
8264       struct Lisp_Marker *tail;
8265
8266       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8267         {
8268           tail->need_adjustment
8269             = tail->charpos == (tail->insertion_type ? from : to);
8270           need_marker_adjustment |= tail->need_adjustment;
8271         }
8272     }
8273
8274   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8275     {
8276       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8277       set_buffer_internal (XBUFFER (coding->src_object));
8278       if (STRINGP (src_object))
8279         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8280       else if (BUFFERP (src_object))
8281         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8282       else
8283         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8284
8285       if (EQ (src_object, dst_object))
8286         {
8287           set_buffer_internal (XBUFFER (src_object));
8288           saved_pt = PT, saved_pt_byte = PT_BYTE;
8289           del_range_both (from, from_byte, to, to_byte, 1);
8290           set_buffer_internal (XBUFFER (coding->src_object));
8291         }
8292
8293       {
8294         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8295
8296         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8297                 old_deactivate_mark);
8298         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8299                     make_number (BEG), make_number (Z));
8300         UNGCPRO;
8301       }
8302       if (XBUFFER (coding->src_object) != current_buffer)
8303         kill_src_buffer = 1;
8304       coding->src_object = Fcurrent_buffer ();
8305       if (BEG != GPT)
8306         move_gap_both (BEG, BEG_BYTE);
8307       coding->src_chars = Z - BEG;
8308       coding->src_bytes = Z_BYTE - BEG_BYTE;
8309       coding->src_pos = BEG;
8310       coding->src_pos_byte = BEG_BYTE;
8311       coding->src_multibyte = Z < Z_BYTE;
8312     }
8313   else if (STRINGP (src_object))
8314     {
8315       code_conversion_save (0, 0);
8316       coding->src_pos = from;
8317       coding->src_pos_byte = from_byte;
8318     }
8319   else if (BUFFERP (src_object))
8320     {
8321       code_conversion_save (0, 0);
8322       set_buffer_internal (XBUFFER (src_object));
8323       if (EQ (src_object, dst_object))
8324         {
8325           saved_pt = PT, saved_pt_byte = PT_BYTE;
8326           coding->src_object = del_range_1 (from, to, 1, 1);
8327           coding->src_pos = 0;
8328           coding->src_pos_byte = 0;
8329         }
8330       else
8331         {
8332           if (from < GPT && to >= GPT)
8333             move_gap_both (from, from_byte);
8334           coding->src_pos = from;
8335           coding->src_pos_byte = from_byte;
8336         }
8337     }
8338   else
8339     code_conversion_save (0, 0);
8340
8341   if (BUFFERP (dst_object))
8342     {
8343       coding->dst_object = dst_object;
8344       if (EQ (src_object, dst_object))
8345         {
8346           coding->dst_pos = from;
8347           coding->dst_pos_byte = from_byte;
8348         }
8349       else
8350         {
8351           struct buffer *current = current_buffer;
8352
8353           set_buffer_temp (XBUFFER (dst_object));
8354           coding->dst_pos = PT;
8355           coding->dst_pos_byte = PT_BYTE;
8356           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8357           set_buffer_temp (current);
8358         }
8359       coding->dst_multibyte
8360         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8361     }
8362   else if (EQ (dst_object, Qt))
8363     {
8364       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8365       coding->dst_object = Qnil;
8366       coding->destination = xmalloc (dst_bytes);
8367       coding->dst_bytes = dst_bytes;
8368       coding->dst_multibyte = 0;
8369     }
8370   else
8371     {
8372       coding->dst_object = Qnil;
8373       coding->dst_multibyte = 0;
8374     }
8375
8376   encode_coding (coding);
8377
8378   if (EQ (dst_object, Qt))
8379     {
8380       if (BUFFERP (coding->dst_object))
8381         coding->dst_object = Fbuffer_string ();
8382       else if (coding->raw_destination)
8383         /* This is used to avoid creating huge Lisp string.
8384            NOTE: caller who sets `raw_destination' is also
8385            responsible for freeing `destination' buffer.  */
8386         coding->dst_object = Qnil;
8387       else
8388         {
8389           coding->dst_object
8390             = make_unibyte_string ((char *) coding->destination,
8391                                    coding->produced);
8392           xfree (coding->destination);
8393         }
8394     }
8395
8396   if (saved_pt >= 0)
8397     {
8398       /* This is the case of:
8399          (BUFFERP (src_object) && EQ (src_object, dst_object))
8400          As we have moved PT while replacing the original buffer
8401          contents, we must recover it now.  */
8402       set_buffer_internal (XBUFFER (src_object));
8403       if (saved_pt < from)
8404         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8405       else if (saved_pt < from + chars)
8406         TEMP_SET_PT_BOTH (from, from_byte);
8407       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8408         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8409                           saved_pt_byte + (coding->produced - bytes));
8410       else
8411         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8412                           saved_pt_byte + (coding->produced - bytes));
8413
8414       if (need_marker_adjustment)
8415         {
8416           struct Lisp_Marker *tail;
8417
8418           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8419             if (tail->need_adjustment)
8420               {
8421                 tail->need_adjustment = 0;
8422                 if (tail->insertion_type)
8423                   {
8424                     tail->bytepos = from_byte;
8425                     tail->charpos = from;
8426                   }
8427                 else
8428                   {
8429                     tail->bytepos = from_byte + coding->produced;
8430                     tail->charpos
8431                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8432                          ? tail->bytepos : from + coding->produced_char);
8433                   }
8434               }
8435         }
8436     }
8437
8438   if (kill_src_buffer)
8439     Fkill_buffer (coding->src_object);
8440
8441   Vdeactivate_mark = old_deactivate_mark;
8442   unbind_to (count, Qnil);
8443 }
8444
8445
8446 Lisp_Object
8447 preferred_coding_system (void)
8448 {
8449   int id = coding_categories[coding_priorities[0]].id;
8450
8451   return CODING_ID_NAME (id);
8452 }
8453
8454 #if defined (WINDOWSNT) || defined (CYGWIN)
8455
8456 Lisp_Object
8457 from_unicode (Lisp_Object str)
8458 {
8459   CHECK_STRING (str);
8460   if (!STRING_MULTIBYTE (str) &&
8461       SBYTES (str) & 1)
8462     {
8463       str = Fsubstring (str, make_number (0), make_number (-1));
8464     }
8465
8466   return code_convert_string_norecord (str, Qutf_16le, 0);
8467 }
8468
8469 Lisp_Object
8470 from_unicode_buffer (const wchar_t *wstr)
8471 {
8472     return from_unicode (
8473         make_unibyte_string (
8474             (char *) wstr,
8475             /* we get one of the two final 0 bytes for free. */
8476             1 + sizeof (wchar_t) * wcslen (wstr)));
8477 }
8478
8479 wchar_t *
8480 to_unicode (Lisp_Object str, Lisp_Object *buf)
8481 {
8482   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8483   /* We need to make another copy (in addition to the one made by
8484      code_convert_string_norecord) to ensure that the final string is
8485      _doubly_ zero terminated --- that is, that the string is
8486      terminated by two zero bytes and one utf-16le null character.
8487      Because strings are already terminated with a single zero byte,
8488      we just add one additional zero. */
8489   str = make_uninit_string (SBYTES (*buf) + 1);
8490   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8491   SDATA (str) [SBYTES (*buf)] = '\0';
8492   *buf = str;
8493   return WCSDATA (*buf);
8494 }
8495
8496 #endif /* WINDOWSNT || CYGWIN */
8497
8498 \f
8499 #ifdef emacs
/*** 11. Emacs Lisp library functions ***/
8501
8502 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8503        doc: /* Return t if OBJECT is nil or a coding-system.
8504 See the documentation of `define-coding-system' for information
8505 about coding-system objects.  */)
8506   (Lisp_Object object)
8507 {
8508   if (NILP (object)
8509       || CODING_SYSTEM_ID (object) >= 0)
8510     return Qt;
8511   if (! SYMBOLP (object)
8512       || NILP (Fget (object, Qcoding_system_define_form)))
8513     return Qnil;
8514   return Qt;
8515 }
8516
8517 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8518        Sread_non_nil_coding_system, 1, 1, 0,
8519        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8520   (Lisp_Object prompt)
8521 {
8522   Lisp_Object val;
8523   do
8524     {
8525       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8526                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8527     }
8528   while (SCHARS (val) == 0);
8529   return (Fintern (val, Qnil));
8530 }
8531
8532 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8533        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8534 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8535 Ignores case when completing coding systems (all Emacs coding systems
8536 are lower-case).  */)
8537   (Lisp_Object prompt, Lisp_Object default_coding_system)
8538 {
8539   Lisp_Object val;
8540   ptrdiff_t count = SPECPDL_INDEX ();
8541
8542   if (SYMBOLP (default_coding_system))
8543     default_coding_system = SYMBOL_NAME (default_coding_system);
8544   specbind (Qcompletion_ignore_case, Qt);
8545   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8546                           Qt, Qnil, Qcoding_system_history,
8547                           default_coding_system, Qnil);
8548   unbind_to (count, Qnil);
8549   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8550 }
8551
8552 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8553        1, 1, 0,
8554        doc: /* Check validity of CODING-SYSTEM.
8555 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8556 It is valid if it is nil or a symbol defined as a coding system by the
8557 function `define-coding-system'.  */)
8558   (Lisp_Object coding_system)
8559 {
8560   Lisp_Object define_form;
8561
8562   define_form = Fget (coding_system, Qcoding_system_define_form);
8563   if (! NILP (define_form))
8564     {
8565       Fput (coding_system, Qcoding_system_define_form, Qnil);
8566       safe_eval (define_form);
8567     }
8568   if (!NILP (Fcoding_system_p (coding_system)))
8569     return coding_system;
8570   xsignal1 (Qcoding_system_error, coding_system);
8571 }
8572
8573 \f
8574 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8575    HIGHEST, return the coding system of the highest
8576    priority among the detected coding systems.  Otherwise return a
8577    list of detected coding systems sorted by their priorities.  If
8578    MULTIBYTEP, it is assumed that the bytes are in correct
8579    multibyte form but contains only ASCII and eight-bit chars.
8580    Otherwise, the bytes are raw bytes.
8581
8582    CODING-SYSTEM controls the detection as below:
8583
8584    If it is nil, detect both text-format and eol-format.  If the
8585    text-format part of CODING-SYSTEM is already specified
8586    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8587    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8588    detect only text-format.  */
8589
8590 Lisp_Object
8591 detect_coding_system (const unsigned char *src,
8592                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8593                       bool highest, bool multibytep,
8594                       Lisp_Object coding_system)
8595 {
8596   const unsigned char *src_end = src + src_bytes;
8597   Lisp_Object attrs, eol_type;
8598   Lisp_Object val = Qnil;
8599   struct coding_system coding;
8600   ptrdiff_t id;
8601   struct coding_detection_info detect_info;
8602   enum coding_category base_category;
8603   bool null_byte_found = 0, eight_bit_found = 0;
8604
8605   if (NILP (coding_system))
8606     coding_system = Qundecided;
8607   setup_coding_system (coding_system, &coding);
8608   attrs = CODING_ID_ATTRS (coding.id);
8609   eol_type = CODING_ID_EOL_TYPE (coding.id);
8610   coding_system = CODING_ATTR_BASE_NAME (attrs);
8611
8612   coding.source = src;
8613   coding.src_chars = src_chars;
8614   coding.src_bytes = src_bytes;
8615   coding.src_multibyte = multibytep;
8616   coding.consumed = 0;
8617   coding.mode |= CODING_MODE_LAST_BLOCK;
8618   coding.head_ascii = 0;
8619
8620   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8621
8622   /* At first, detect text-format if necessary.  */
8623   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8624   if (base_category == coding_category_undecided)
8625     {
8626       enum coding_category category IF_LINT (= 0);
8627       struct coding_system *this IF_LINT (= NULL);
8628       int c, i;
8629       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8630                                        inhibit_null_byte_detection);
8631       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8632                                        inhibit_iso_escape_detection);
8633       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8634
8635       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8636       for (; src < src_end; src++)
8637         {
8638           c = *src;
8639           if (c & 0x80)
8640             {
8641               eight_bit_found = 1;
8642               if (null_byte_found)
8643                 break;
8644             }
8645           else if (c < 0x20)
8646             {
8647               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8648                   && ! inhibit_ied
8649                   && ! detect_info.checked)
8650                 {
8651                   if (detect_coding_iso_2022 (&coding, &detect_info))
8652                     {
8653                       /* We have scanned the whole data.  */
8654                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8655                         {
8656                           /* We didn't find an 8-bit code.  We may
8657                              have found a null-byte, but it's very
8658                              rare that a binary file confirm to
8659                              ISO-2022.  */
8660                           src = src_end;
8661                           coding.head_ascii = src - coding.source;
8662                         }
8663                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8664                       break;
8665                     }
8666                 }
8667               else if (! c && !inhibit_nbd)
8668                 {
8669                   null_byte_found = 1;
8670                   if (eight_bit_found)
8671                     break;
8672                 }
8673               if (! eight_bit_found)
8674                 coding.head_ascii++;
8675             }
8676           else if (! eight_bit_found)
8677             coding.head_ascii++;
8678         }
8679
8680       if (null_byte_found || eight_bit_found
8681           || coding.head_ascii < coding.src_bytes
8682           || detect_info.found)
8683         {
8684           if (coding.head_ascii == coding.src_bytes)
8685             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8686             for (i = 0; i < coding_category_raw_text; i++)
8687               {
8688                 category = coding_priorities[i];
8689                 this = coding_categories + category;
8690                 if (detect_info.found & (1 << category))
8691                   break;
8692               }
8693           else
8694             {
8695               if (null_byte_found)
8696                 {
8697                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8698                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8699                 }
8700               else if (prefer_utf_8
8701                        && detect_coding_utf_8 (&coding, &detect_info))
8702                 {
8703                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8704                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8705                 }
8706               for (i = 0; i < coding_category_raw_text; i++)
8707                 {
8708                   category = coding_priorities[i];
8709                   this = coding_categories + category;
8710
8711                   if (this->id < 0)
8712                     {
8713                       /* No coding system of this category is defined.  */
8714                       detect_info.rejected |= (1 << category);
8715                     }
8716                   else if (category >= coding_category_raw_text)
8717                     continue;
8718                   else if (detect_info.checked & (1 << category))
8719                     {
8720                       if (highest
8721                           && (detect_info.found & (1 << category)))
8722                         break;
8723                     }
8724                   else if ((*(this->detector)) (&coding, &detect_info)
8725                            && highest
8726                            && (detect_info.found & (1 << category)))
8727                     {
8728                       if (category == coding_category_utf_16_auto)
8729                         {
8730                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8731                             category = coding_category_utf_16_le;
8732                           else
8733                             category = coding_category_utf_16_be;
8734                         }
8735                       break;
8736                     }
8737                 }
8738             }
8739         }
8740
8741       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8742           || null_byte_found)
8743         {
8744           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8745           id = CODING_SYSTEM_ID (Qno_conversion);
8746           val = list1 (make_number (id));
8747         }
8748       else if (! detect_info.rejected && ! detect_info.found)
8749         {
8750           detect_info.found = CATEGORY_MASK_ANY;
8751           id = coding_categories[coding_category_undecided].id;
8752           val = list1 (make_number (id));
8753         }
8754       else if (highest)
8755         {
8756           if (detect_info.found)
8757             {
8758               detect_info.found = 1 << category;
8759               val = list1 (make_number (this->id));
8760             }
8761           else
8762             for (i = 0; i < coding_category_raw_text; i++)
8763               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8764                 {
8765                   detect_info.found = 1 << coding_priorities[i];
8766                   id = coding_categories[coding_priorities[i]].id;
8767                   val = list1 (make_number (id));
8768                   break;
8769                 }
8770         }
8771       else
8772         {
8773           int mask = detect_info.rejected | detect_info.found;
8774           int found = 0;
8775
8776           for (i = coding_category_raw_text - 1; i >= 0; i--)
8777             {
8778               category = coding_priorities[i];
8779               if (! (mask & (1 << category)))
8780                 {
8781                   found |= 1 << category;
8782                   id = coding_categories[category].id;
8783                   if (id >= 0)
8784                     val = list1 (make_number (id));
8785                 }
8786             }
8787           for (i = coding_category_raw_text - 1; i >= 0; i--)
8788             {
8789               category = coding_priorities[i];
8790               if (detect_info.found & (1 << category))
8791                 {
8792                   id = coding_categories[category].id;
8793                   val = Fcons (make_number (id), val);
8794                 }
8795             }
8796           detect_info.found |= found;
8797         }
8798     }
8799   else if (base_category == coding_category_utf_8_auto)
8800     {
8801       if (detect_coding_utf_8 (&coding, &detect_info))
8802         {
8803           struct coding_system *this;
8804
8805           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8806             this = coding_categories + coding_category_utf_8_sig;
8807           else
8808             this = coding_categories + coding_category_utf_8_nosig;
8809           val = list1 (make_number (this->id));
8810         }
8811     }
8812   else if (base_category == coding_category_utf_16_auto)
8813     {
8814       if (detect_coding_utf_16 (&coding, &detect_info))
8815         {
8816           struct coding_system *this;
8817
8818           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8819             this = coding_categories + coding_category_utf_16_le;
8820           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8821             this = coding_categories + coding_category_utf_16_be;
8822           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8823             this = coding_categories + coding_category_utf_16_be_nosig;
8824           else
8825             this = coding_categories + coding_category_utf_16_le_nosig;
8826           val = list1 (make_number (this->id));
8827         }
8828     }
8829   else
8830     {
8831       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8832       val = list1 (make_number (coding.id));
8833     }
8834
8835   /* Then, detect eol-format if necessary.  */
8836   {
8837     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8838     Lisp_Object tail;
8839
8840     if (VECTORP (eol_type))
8841       {
8842         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8843           {
8844             if (null_byte_found)
8845               normal_eol = EOL_SEEN_LF;
8846             else
8847               normal_eol = detect_eol (coding.source, src_bytes,
8848                                        coding_category_raw_text);
8849           }
8850         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8851                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8852           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8853                                       coding_category_utf_16_be);
8854         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8855                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8856           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8857                                       coding_category_utf_16_le);
8858       }
8859     else
8860       {
8861         if (EQ (eol_type, Qunix))
8862           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8863         else if (EQ (eol_type, Qdos))
8864           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8865         else
8866           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8867       }
8868
8869     for (tail = val; CONSP (tail); tail = XCDR (tail))
8870       {
8871         enum coding_category category;
8872         int this_eol;
8873
8874         id = XINT (XCAR (tail));
8875         attrs = CODING_ID_ATTRS (id);
8876         category = XINT (CODING_ATTR_CATEGORY (attrs));
8877         eol_type = CODING_ID_EOL_TYPE (id);
8878         if (VECTORP (eol_type))
8879           {
8880             if (category == coding_category_utf_16_be
8881                 || category == coding_category_utf_16_be_nosig)
8882               this_eol = utf_16_be_eol;
8883             else if (category == coding_category_utf_16_le
8884                      || category == coding_category_utf_16_le_nosig)
8885               this_eol = utf_16_le_eol;
8886             else
8887               this_eol = normal_eol;
8888
8889             if (this_eol == EOL_SEEN_LF)
8890               XSETCAR (tail, AREF (eol_type, 0));
8891             else if (this_eol == EOL_SEEN_CRLF)
8892               XSETCAR (tail, AREF (eol_type, 1));
8893             else if (this_eol == EOL_SEEN_CR)
8894               XSETCAR (tail, AREF (eol_type, 2));
8895             else
8896               XSETCAR (tail, CODING_ID_NAME (id));
8897           }
8898         else
8899           XSETCAR (tail, CODING_ID_NAME (id));
8900       }
8901   }
8902
8903   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8904 }
8905
8906
8907 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8908        2, 3, 0,
8909        doc: /* Detect coding system of the text in the region between START and END.
8910 Return a list of possible coding systems ordered by priority.
8911 The coding systems to try and their priorities follows what
8912 the function `coding-system-priority-list' (which see) returns.
8913
8914 If only ASCII characters are found (except for such ISO-2022 control
8915 characters as ESC), it returns a list of single element `undecided'
8916 or its subsidiary coding system according to a detected end-of-line
8917 format.
8918
8919 If optional argument HIGHEST is non-nil, return the coding system of
8920 highest priority.  */)
8921   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8922 {
8923   ptrdiff_t from, to;
8924   ptrdiff_t from_byte, to_byte;
8925
8926   validate_region (&start, &end);
8927   from = XINT (start), to = XINT (end);
8928   from_byte = CHAR_TO_BYTE (from);
8929   to_byte = CHAR_TO_BYTE (to);
8930
8931   if (from < GPT && to >= GPT)
8932     move_gap_both (to, to_byte);
8933
8934   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8935                                to - from, to_byte - from_byte,
8936                                !NILP (highest),
8937                                !NILP (BVAR (current_buffer
8938                                       , enable_multibyte_characters)),
8939                                Qnil);
8940 }
8941
8942 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8943        1, 2, 0,
8944        doc: /* Detect coding system of the text in STRING.
8945 Return a list of possible coding systems ordered by priority.
8946 The coding systems to try and their priorities follows what
8947 the function `coding-system-priority-list' (which see) returns.
8948
8949 If only ASCII characters are found (except for such ISO-2022 control
8950 characters as ESC), it returns a list of single element `undecided'
8951 or its subsidiary coding system according to a detected end-of-line
8952 format.
8953
8954 If optional argument HIGHEST is non-nil, return the coding system of
8955 highest priority.  */)
8956   (Lisp_Object string, Lisp_Object highest)
8957 {
8958   CHECK_STRING (string);
8959
8960   return detect_coding_system (SDATA (string),
8961                                SCHARS (string), SBYTES (string),
8962                                !NILP (highest), STRING_MULTIBYTE (string),
8963                                Qnil);
8964 }
8965
8966
8967 static bool
8968 char_encodable_p (int c, Lisp_Object attrs)
8969 {
8970   Lisp_Object tail;
8971   struct charset *charset;
8972   Lisp_Object translation_table;
8973
8974   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8975   if (! NILP (translation_table))
8976     c = translate_char (translation_table, c);
8977   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8978        CONSP (tail); tail = XCDR (tail))
8979     {
8980       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8981       if (CHAR_CHARSET_P (c, charset))
8982         break;
8983     }
8984   return (! NILP (tail));
8985 }
8986
8987
8988 /* Return a list of coding systems that safely encode the text between
8989    START and END.  If EXCLUDE is non-nil, it is a list of coding
8990    systems not to check.  The returned list doesn't contain any such
8991    coding systems.  In any case, if the text contains only ASCII or is
8992    unibyte, return t.  */
8993
8994 DEFUN ("find-coding-systems-region-internal",
8995        Ffind_coding_systems_region_internal,
8996        Sfind_coding_systems_region_internal, 2, 3, 0,
8997        doc: /* Internal use only.  */)
8998   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8999 {
9000   Lisp_Object coding_attrs_list, safe_codings;
9001   ptrdiff_t start_byte, end_byte;
9002   const unsigned char *p, *pbeg, *pend;
9003   int c;
9004   Lisp_Object tail, elt, work_table;
9005
9006   if (STRINGP (start))
9007     {
9008       if (!STRING_MULTIBYTE (start)
9009           || SCHARS (start) == SBYTES (start))
9010         return Qt;
9011       start_byte = 0;
9012       end_byte = SBYTES (start);
9013     }
9014   else
9015     {
9016       CHECK_NUMBER_COERCE_MARKER (start);
9017       CHECK_NUMBER_COERCE_MARKER (end);
9018       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9019         args_out_of_range (start, end);
9020       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9021         return Qt;
9022       start_byte = CHAR_TO_BYTE (XINT (start));
9023       end_byte = CHAR_TO_BYTE (XINT (end));
9024       if (XINT (end) - XINT (start) == end_byte - start_byte)
9025         return Qt;
9026
9027       if (XINT (start) < GPT && XINT (end) > GPT)
9028         {
9029           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9030             move_gap_both (XINT (start), start_byte);
9031           else
9032             move_gap_both (XINT (end), end_byte);
9033         }
9034     }
9035
9036   coding_attrs_list = Qnil;
9037   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9038     if (NILP (exclude)
9039         || NILP (Fmemq (XCAR (tail), exclude)))
9040       {
9041         Lisp_Object attrs;
9042
9043         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9044         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9045           {
9046             ASET (attrs, coding_attr_trans_tbl,
9047                   get_translation_table (attrs, 1, NULL));
9048             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9049           }
9050       }
9051
9052   if (STRINGP (start))
9053     p = pbeg = SDATA (start);
9054   else
9055     p = pbeg = BYTE_POS_ADDR (start_byte);
9056   pend = p + (end_byte - start_byte);
9057
9058   while (p < pend && ASCII_CHAR_P (*p)) p++;
9059   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9060
9061   work_table = Fmake_char_table (Qnil, Qnil);
9062   while (p < pend)
9063     {
9064       if (ASCII_CHAR_P (*p))
9065         p++;
9066       else
9067         {
9068           c = STRING_CHAR_ADVANCE (p);
9069           if (!NILP (char_table_ref (work_table, c)))
9070             /* This character was already checked.  Ignore it.  */
9071             continue;
9072
9073           charset_map_loaded = 0;
9074           for (tail = coding_attrs_list; CONSP (tail);)
9075             {
9076               elt = XCAR (tail);
9077               if (NILP (elt))
9078                 tail = XCDR (tail);
9079               else if (char_encodable_p (c, elt))
9080                 tail = XCDR (tail);
9081               else if (CONSP (XCDR (tail)))
9082                 {
9083                   XSETCAR (tail, XCAR (XCDR (tail)));
9084                   XSETCDR (tail, XCDR (XCDR (tail)));
9085                 }
9086               else
9087                 {
9088                   XSETCAR (tail, Qnil);
9089                   tail = XCDR (tail);
9090                 }
9091             }
9092           if (charset_map_loaded)
9093             {
9094               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9095
9096               if (STRINGP (start))
9097                 pbeg = SDATA (start);
9098               else
9099                 pbeg = BYTE_POS_ADDR (start_byte);
9100               p = pbeg + p_offset;
9101               pend = pbeg + pend_offset;
9102             }
9103           char_table_set (work_table, c, Qt);
9104         }
9105     }
9106
9107   safe_codings = list2 (Qraw_text, Qno_conversion);
9108   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9109     if (! NILP (XCAR (tail)))
9110       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9111
9112   return safe_codings;
9113 }
9114
9115
9116 DEFUN ("unencodable-char-position", Funencodable_char_position,
9117        Sunencodable_char_position, 3, 5, 0,
9118        doc: /* Return position of first un-encodable character in a region.
9119 START and END specify the region and CODING-SYSTEM specifies the
9120 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9121
9122 If optional 4th argument COUNT is non-nil, it specifies at most how
9123 many un-encodable characters to search.  In this case, the value is a
9124 list of positions.
9125
9126 If optional 5th argument STRING is non-nil, it is a string to search
9127 for un-encodable characters.  In that case, START and END are indexes
9128 to the string and treated as in `substring'.  */)
9129   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system,
9130    Lisp_Object count, Lisp_Object string)
9131 {
9132   EMACS_INT n;
9133   struct coding_system coding;
9134   Lisp_Object attrs, charset_list, translation_table;
9135   Lisp_Object positions;
9136   ptrdiff_t from, to;
9137   const unsigned char *p, *stop, *pend;
9138   bool ascii_compatible;
9139
9140   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9141   attrs = CODING_ID_ATTRS (coding.id);
9142   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9143     return Qnil;
9144   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9145   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9146   translation_table = get_translation_table (attrs, 1, NULL);
9147
9148   if (NILP (string))
9149     {
9150       validate_region (&start, &end);
9151       from = XINT (start);
9152       to = XINT (end);
9153       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9154           || (ascii_compatible
9155               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9156         return Qnil;
9157       p = CHAR_POS_ADDR (from);
9158       pend = CHAR_POS_ADDR (to);
9159       if (from < GPT && to >= GPT)
9160         stop = GPT_ADDR;
9161       else
9162         stop = pend;
9163     }
9164   else
9165     {
9166       CHECK_STRING (string);
9167       validate_subarray (string, start, end, SCHARS (string), &from, &to);
9168       if (! STRING_MULTIBYTE (string))
9169         return Qnil;
9170       p = SDATA (string) + string_char_to_byte (string, from);
9171       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9172       if (ascii_compatible && (to - from) == (pend - p))
9173         return Qnil;
9174     }
9175
9176   if (NILP (count))
9177     n = 1;
9178   else
9179     {
9180       CHECK_NATNUM (count);
9181       n = XINT (count);
9182     }
9183
9184   positions = Qnil;
9185   charset_map_loaded = 0;
9186   while (1)
9187     {
9188       int c;
9189
9190       if (ascii_compatible)
9191         while (p < stop && ASCII_CHAR_P (*p))
9192           p++, from++;
9193       if (p >= stop)
9194         {
9195           if (p >= pend)
9196             break;
9197           stop = pend;
9198           p = GAP_END_ADDR;
9199         }
9200
9201       c = STRING_CHAR_ADVANCE (p);
9202       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9203           && ! char_charset (translate_char (translation_table, c),
9204                              charset_list, NULL))
9205         {
9206           positions = Fcons (make_number (from), positions);
9207           n--;
9208           if (n == 0)
9209             break;
9210         }
9211
9212       from++;
9213       if (charset_map_loaded && NILP (string))
9214         {
9215           p = CHAR_POS_ADDR (from);
9216           pend = CHAR_POS_ADDR (to);
9217           if (from < GPT && to >= GPT)
9218             stop = GPT_ADDR;
9219           else
9220             stop = pend;
9221           charset_map_loaded = 0;
9222         }
9223     }
9224
9225   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9226 }
9227
9228
9229 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9230        Scheck_coding_systems_region, 3, 3, 0,
9231        doc: /* Check if the region is encodable by coding systems.
9232
9233 START and END are buffer positions specifying the region.
9234 CODING-SYSTEM-LIST is a list of coding systems to check.
9235
9236 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9237 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9238 whole region, POS0, POS1, ... are buffer positions where non-encodable
9239 characters are found.
9240
9241 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9242 value is nil.
9243
9244 START may be a string.  In that case, check if the string is
9245 encodable, and the value contains indices to the string instead of
9246 buffer positions.  END is ignored.
9247
9248 If the current buffer (or START if it is a string) is unibyte, the value
9249 is nil.  */)
9250   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9251 {
9252   Lisp_Object list;
9253   ptrdiff_t start_byte, end_byte;
9254   ptrdiff_t pos;
9255   const unsigned char *p, *pbeg, *pend;
9256   int c;
9257   Lisp_Object tail, elt, attrs;
9258
9259   if (STRINGP (start))
9260     {
9261       if (!STRING_MULTIBYTE (start)
9262           || SCHARS (start) == SBYTES (start))
9263         return Qnil;
9264       start_byte = 0;
9265       end_byte = SBYTES (start);
9266       pos = 0;
9267     }
9268   else
9269     {
9270       CHECK_NUMBER_COERCE_MARKER (start);
9271       CHECK_NUMBER_COERCE_MARKER (end);
9272       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9273         args_out_of_range (start, end);
9274       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9275         return Qnil;
9276       start_byte = CHAR_TO_BYTE (XINT (start));
9277       end_byte = CHAR_TO_BYTE (XINT (end));
9278       if (XINT (end) - XINT (start) == end_byte - start_byte)
9279         return Qnil;
9280
9281       if (XINT (start) < GPT && XINT (end) > GPT)
9282         {
9283           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9284             move_gap_both (XINT (start), start_byte);
9285           else
9286             move_gap_both (XINT (end), end_byte);
9287         }
9288       pos = XINT (start);
9289     }
9290
9291   list = Qnil;
9292   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9293     {
9294       elt = XCAR (tail);
9295       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9296       ASET (attrs, coding_attr_trans_tbl,
9297             get_translation_table (attrs, 1, NULL));
9298       list = Fcons (list2 (elt, attrs), list);
9299     }
9300
9301   if (STRINGP (start))
9302     p = pbeg = SDATA (start);
9303   else
9304     p = pbeg = BYTE_POS_ADDR (start_byte);
9305   pend = p + (end_byte - start_byte);
9306
9307   while (p < pend && ASCII_CHAR_P (*p)) p++, pos++;
9308   while (p < pend && ASCII_CHAR_P (*(pend - 1))) pend--;
9309
9310   while (p < pend)
9311     {
9312       if (ASCII_CHAR_P (*p))
9313         p++;
9314       else
9315         {
9316           c = STRING_CHAR_ADVANCE (p);
9317
9318           charset_map_loaded = 0;
9319           for (tail = list; CONSP (tail); tail = XCDR (tail))
9320             {
9321               elt = XCDR (XCAR (tail));
9322               if (! char_encodable_p (c, XCAR (elt)))
9323                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9324             }
9325           if (charset_map_loaded)
9326             {
9327               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9328
9329               if (STRINGP (start))
9330                 pbeg = SDATA (start);
9331               else
9332                 pbeg = BYTE_POS_ADDR (start_byte);
9333               p = pbeg + p_offset;
9334               pend = pbeg + pend_offset;
9335             }
9336         }
9337       pos++;
9338     }
9339
9340   tail = list;
9341   list = Qnil;
9342   for (; CONSP (tail); tail = XCDR (tail))
9343     {
9344       elt = XCAR (tail);
9345       if (CONSP (XCDR (XCDR (elt))))
9346         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9347                       list);
9348     }
9349
9350   return list;
9351 }
9352
9353
9354 static Lisp_Object
9355 code_convert_region (Lisp_Object start, Lisp_Object end,
9356                      Lisp_Object coding_system, Lisp_Object dst_object,
9357                      bool encodep, bool norecord)
9358 {
9359   struct coding_system coding;
9360   ptrdiff_t from, from_byte, to, to_byte;
9361   Lisp_Object src_object;
9362
9363   if (NILP (coding_system))
9364     coding_system = Qno_conversion;
9365   else
9366     CHECK_CODING_SYSTEM (coding_system);
9367   src_object = Fcurrent_buffer ();
9368   if (NILP (dst_object))
9369     dst_object = src_object;
9370   else if (! EQ (dst_object, Qt))
9371     CHECK_BUFFER (dst_object);
9372
9373   validate_region (&start, &end);
9374   from = XFASTINT (start);
9375   from_byte = CHAR_TO_BYTE (from);
9376   to = XFASTINT (end);
9377   to_byte = CHAR_TO_BYTE (to);
9378
9379   setup_coding_system (coding_system, &coding);
9380   coding.mode |= CODING_MODE_LAST_BLOCK;
9381
9382   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9383     {
9384       struct buffer *buf = XBUFFER (dst_object);
9385       ptrdiff_t buf_pt = BUF_PT (buf);
9386
9387       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9388     }
9389
9390   if (encodep)
9391     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9392                           dst_object);
9393   else
9394     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9395                           dst_object);
9396   if (! norecord)
9397     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9398
9399   return (BUFFERP (dst_object)
9400           ? make_number (coding.produced_char)
9401           : coding.dst_object);
9402 }
9403
9404
9405 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9406        3, 4, "r\nzCoding system: ",
9407        doc: /* Decode the current region from the specified coding system.
9408 When called from a program, takes four arguments:
9409         START, END, CODING-SYSTEM, and DESTINATION.
9410 START and END are buffer positions.
9411
9412 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9413 If nil, the region between START and END is replaced by the decoded text.
9414 If buffer, the decoded text is inserted in that buffer after point (point
9415 does not move).
9416 In those cases, the length of the decoded text is returned.
9417 If DESTINATION is t, the decoded text is returned.
9418
9419 This function sets `last-coding-system-used' to the precise coding system
9420 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9421 not fully specified.)  */)
9422   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9423 {
9424   return code_convert_region (start, end, coding_system, destination, 0, 0);
9425 }
9426
9427 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9428        3, 4, "r\nzCoding system: ",
9429        doc: /* Encode the current region by specified coding system.
9430 When called from a program, takes four arguments:
9431         START, END, CODING-SYSTEM and DESTINATION.
9432 START and END are buffer positions.
9433
9434 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9435 If nil, the region between START and END is replace by the encoded text.
9436 If buffer, the encoded text is inserted in that buffer after point (point
9437 does not move).
9438 In those cases, the length of the encoded text is returned.
9439 If DESTINATION is t, the encoded text is returned.
9440
9441 This function sets `last-coding-system-used' to the precise coding system
9442 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9443 not fully specified.)  */)
9444   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9445 {
9446   return code_convert_region (start, end, coding_system, destination, 1, 0);
9447 }
9448
9449 Lisp_Object
9450 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9451                      Lisp_Object dst_object, bool encodep, bool nocopy,
9452                      bool norecord)
9453 {
9454   struct coding_system coding;
9455   ptrdiff_t chars, bytes;
9456
9457   CHECK_STRING (string);
9458   if (NILP (coding_system))
9459     {
9460       if (! norecord)
9461         Vlast_coding_system_used = Qno_conversion;
9462       if (NILP (dst_object))
9463         return (nocopy ? Fcopy_sequence (string) : string);
9464     }
9465
9466   if (NILP (coding_system))
9467     coding_system = Qno_conversion;
9468   else
9469     CHECK_CODING_SYSTEM (coding_system);
9470   if (NILP (dst_object))
9471     dst_object = Qt;
9472   else if (! EQ (dst_object, Qt))
9473     CHECK_BUFFER (dst_object);
9474
9475   setup_coding_system (coding_system, &coding);
9476   coding.mode |= CODING_MODE_LAST_BLOCK;
9477   chars = SCHARS (string);
9478   bytes = SBYTES (string);
9479
9480   if (BUFFERP (dst_object))
9481     {
9482       struct buffer *buf = XBUFFER (dst_object);
9483       ptrdiff_t buf_pt = BUF_PT (buf);
9484
9485       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9486     }
9487
9488   if (encodep)
9489     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9490   else
9491     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9492   if (! norecord)
9493     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9494
9495   return (BUFFERP (dst_object)
9496           ? make_number (coding.produced_char)
9497           : coding.dst_object);
9498 }
9499
9500
9501 /* Encode or decode STRING according to CODING_SYSTEM.
9502    Do not set Vlast_coding_system_used.
9503
9504    This function is called only from macros DECODE_FILE and
9505    ENCODE_FILE, thus we ignore character composition.  */
9506
9507 Lisp_Object
9508 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9509                               bool encodep)
9510 {
9511   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9512 }
9513
9514 /* Encode or decode a file name, to or from a unibyte string suitable
9515    for passing to C library functions.  */
9516 Lisp_Object
9517 decode_file_name (Lisp_Object fname)
9518 {
9519 #ifdef WINDOWSNT
9520   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9521      converts the file names either to UTF-16LE or to the system ANSI
9522      codepage internally, depending on the underlying OS; see w32.c.  */
9523   if (! NILP (Fcoding_system_p (Qutf_8)))
9524     return code_convert_string_norecord (fname, Qutf_8, 0);
9525   return fname;
9526 #else  /* !WINDOWSNT */
9527   if (! NILP (Vfile_name_coding_system))
9528     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9529   else if (! NILP (Vdefault_file_name_coding_system))
9530     return code_convert_string_norecord (fname,
9531                                          Vdefault_file_name_coding_system, 0);
9532   else
9533     return fname;
9534 #endif
9535 }
9536
9537 Lisp_Object
9538 encode_file_name (Lisp_Object fname)
9539 {
9540   /* This is especially important during bootstrap and dumping, when
9541      file-name encoding is not yet known, and therefore any non-ASCII
9542      file names are unibyte strings, and could only be thrashed if we
9543      try to encode them.  */
9544   if (!STRING_MULTIBYTE (fname))
9545     return fname;
9546 #ifdef WINDOWSNT
9547   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9548      converts the file names either to UTF-16LE or to the system ANSI
9549      codepage internally, depending on the underlying OS; see w32.c.  */
9550   if (! NILP (Fcoding_system_p (Qutf_8)))
9551     return code_convert_string_norecord (fname, Qutf_8, 1);
9552   return fname;
9553 #else  /* !WINDOWSNT */
9554   if (! NILP (Vfile_name_coding_system))
9555     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9556   else if (! NILP (Vdefault_file_name_coding_system))
9557     return code_convert_string_norecord (fname,
9558                                          Vdefault_file_name_coding_system, 1);
9559   else
9560     return fname;
9561 #endif
9562 }
9563
9564 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9565        2, 4, 0,
9566        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9567
9568 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9569 if the decoding operation is trivial.
9570
9571 Optional fourth arg BUFFER non-nil means that the decoded text is
9572 inserted in that buffer after point (point does not move).  In this
9573 case, the return value is the length of the decoded text.
9574
9575 This function sets `last-coding-system-used' to the precise coding system
9576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9577 not fully specified.)  */)
9578   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9579 {
9580   return code_convert_string (string, coding_system, buffer,
9581                               0, ! NILP (nocopy), 0);
9582 }
9583
9584 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9585        2, 4, 0,
9586        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9587
9588 Optional third arg NOCOPY non-nil means it is OK to return STRING
9589 itself if the encoding operation is trivial.
9590
9591 Optional fourth arg BUFFER non-nil means that the encoded text is
9592 inserted in that buffer after point (point does not move).  In this
9593 case, the return value is the length of the encoded text.
9594
9595 This function sets `last-coding-system-used' to the precise coding system
9596 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9597 not fully specified.)  */)
9598   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9599 {
9600   return code_convert_string (string, coding_system, buffer,
9601                               1, ! NILP (nocopy), 0);
9602 }
9603
9604 \f
9605 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9606        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9607 Return the corresponding character.  */)
9608   (Lisp_Object code)
9609 {
9610   Lisp_Object spec, attrs, val;
9611   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9612   EMACS_INT ch;
9613   int c;
9614
9615   CHECK_NATNUM (code);
9616   ch = XFASTINT (code);
9617   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9618   attrs = AREF (spec, 0);
9619
9620   if (ASCII_CHAR_P (ch)
9621       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9622     return code;
9623
9624   val = CODING_ATTR_CHARSET_LIST (attrs);
9625   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9626   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9627   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9628
9629   if (ch <= 0x7F)
9630     {
9631       c = ch;
9632       charset = charset_roman;
9633     }
9634   else if (ch >= 0xA0 && ch < 0xDF)
9635     {
9636       c = ch - 0x80;
9637       charset = charset_kana;
9638     }
9639   else
9640     {
9641       EMACS_INT c1 = ch >> 8;
9642       int c2 = ch & 0xFF;
9643
9644       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9645           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9646         error ("Invalid code: %"pI"d", ch);
9647       c = ch;
9648       SJIS_TO_JIS (c);
9649       charset = charset_kanji;
9650     }
9651   c = DECODE_CHAR (charset, c);
9652   if (c < 0)
9653     error ("Invalid code: %"pI"d", ch);
9654   return make_number (c);
9655 }
9656
9657
9658 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9659        doc: /* Encode a Japanese character CH to shift_jis encoding.
9660 Return the corresponding code in SJIS.  */)
9661   (Lisp_Object ch)
9662 {
9663   Lisp_Object spec, attrs, charset_list;
9664   int c;
9665   struct charset *charset;
9666   unsigned code;
9667
9668   CHECK_CHARACTER (ch);
9669   c = XFASTINT (ch);
9670   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9671   attrs = AREF (spec, 0);
9672
9673   if (ASCII_CHAR_P (c)
9674       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9675     return ch;
9676
9677   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9678   charset = char_charset (c, charset_list, &code);
9679   if (code == CHARSET_INVALID_CODE (charset))
9680     error ("Can't encode by shift_jis encoding: %c", c);
9681   JIS_TO_SJIS (code);
9682
9683   return make_number (code);
9684 }
9685
9686 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9687        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9688 Return the corresponding character.  */)
9689   (Lisp_Object code)
9690 {
9691   Lisp_Object spec, attrs, val;
9692   struct charset *charset_roman, *charset_big5, *charset;
9693   EMACS_INT ch;
9694   int c;
9695
9696   CHECK_NATNUM (code);
9697   ch = XFASTINT (code);
9698   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9699   attrs = AREF (spec, 0);
9700
9701   if (ASCII_CHAR_P (ch)
9702       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9703     return code;
9704
9705   val = CODING_ATTR_CHARSET_LIST (attrs);
9706   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9707   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9708
9709   if (ch <= 0x7F)
9710     {
9711       c = ch;
9712       charset = charset_roman;
9713     }
9714   else
9715     {
9716       EMACS_INT b1 = ch >> 8;
9717       int b2 = ch & 0x7F;
9718       if (b1 < 0xA1 || b1 > 0xFE
9719           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9720         error ("Invalid code: %"pI"d", ch);
9721       c = ch;
9722       charset = charset_big5;
9723     }
9724   c = DECODE_CHAR (charset, c);
9725   if (c < 0)
9726     error ("Invalid code: %"pI"d", ch);
9727   return make_number (c);
9728 }
9729
9730 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9731        doc: /* Encode the Big5 character CH to BIG5 coding system.
9732 Return the corresponding character code in Big5.  */)
9733   (Lisp_Object ch)
9734 {
9735   Lisp_Object spec, attrs, charset_list;
9736   struct charset *charset;
9737   int c;
9738   unsigned code;
9739
9740   CHECK_CHARACTER (ch);
9741   c = XFASTINT (ch);
9742   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9743   attrs = AREF (spec, 0);
9744   if (ASCII_CHAR_P (c)
9745       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9746     return ch;
9747
9748   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9749   charset = char_charset (c, charset_list, &code);
9750   if (code == CHARSET_INVALID_CODE (charset))
9751     error ("Can't encode by Big5 encoding: %c", c);
9752
9753   return make_number (code);
9754 }
9755
9756 \f
9757 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9758        Sset_terminal_coding_system_internal, 1, 2, 0,
9759        doc: /* Internal use only.  */)
9760   (Lisp_Object coding_system, Lisp_Object terminal)
9761 {
9762   struct terminal *term = decode_live_terminal (terminal);
9763   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9764   CHECK_SYMBOL (coding_system);
9765   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9766   /* We had better not send unsafe characters to terminal.  */
9767   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9768   /* Character composition should be disabled.  */
9769   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9770   terminal_coding->src_multibyte = 1;
9771   terminal_coding->dst_multibyte = 0;
9772   tset_charset_list
9773     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9774             ? coding_charset_list (terminal_coding)
9775             : list1 (make_number (charset_ascii))));
9776   return Qnil;
9777 }
9778
9779 DEFUN ("set-safe-terminal-coding-system-internal",
9780        Fset_safe_terminal_coding_system_internal,
9781        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9782        doc: /* Internal use only.  */)
9783   (Lisp_Object coding_system)
9784 {
9785   CHECK_SYMBOL (coding_system);
9786   setup_coding_system (Fcheck_coding_system (coding_system),
9787                        &safe_terminal_coding);
9788   /* Character composition should be disabled.  */
9789   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9790   safe_terminal_coding.src_multibyte = 1;
9791   safe_terminal_coding.dst_multibyte = 0;
9792   return Qnil;
9793 }
9794
9795 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9796        Sterminal_coding_system, 0, 1, 0,
9797        doc: /* Return coding system specified for terminal output on the given terminal.
9798 TERMINAL may be a terminal object, a frame, or nil for the selected
9799 frame's terminal device.  */)
9800   (Lisp_Object terminal)
9801 {
9802   struct coding_system *terminal_coding
9803     = TERMINAL_TERMINAL_CODING (decode_live_terminal (terminal));
9804   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9805
9806   /* For backward compatibility, return nil if it is `undecided'.  */
9807   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9808 }
9809
9810 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9811        Sset_keyboard_coding_system_internal, 1, 2, 0,
9812        doc: /* Internal use only.  */)
9813   (Lisp_Object coding_system, Lisp_Object terminal)
9814 {
9815   struct terminal *t = decode_live_terminal (terminal);
9816   CHECK_SYMBOL (coding_system);
9817   if (NILP (coding_system))
9818     coding_system = Qno_conversion;
9819   else
9820     Fcheck_coding_system (coding_system);
9821   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9822   /* Character composition should be disabled.  */
9823   TERMINAL_KEYBOARD_CODING (t)->common_flags
9824     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9825   return Qnil;
9826 }
9827
9828 DEFUN ("keyboard-coding-system",
9829        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9830        doc: /* Return coding system specified for decoding keyboard input.  */)
9831   (Lisp_Object terminal)
9832 {
9833   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9834                          (decode_live_terminal (terminal))->id);
9835 }
9836
9837 \f
9838 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9839        Sfind_operation_coding_system,  1, MANY, 0,
9840        doc: /* Choose a coding system for an operation based on the target name.
9841 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9842 DECODING-SYSTEM is the coding system to use for decoding
9843 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9844 for encoding (in case OPERATION does encoding).
9845
9846 The first argument OPERATION specifies an I/O primitive:
9847   For file I/O, `insert-file-contents' or `write-region'.
9848   For process I/O, `call-process', `call-process-region', or `start-process'.
9849   For network I/O, `open-network-stream'.
9850
9851 The remaining arguments should be the same arguments that were passed
9852 to the primitive.  Depending on which primitive, one of those arguments
9853 is selected as the TARGET.  For example, if OPERATION does file I/O,
9854 whichever argument specifies the file name is TARGET.
9855
9856 TARGET has a meaning which depends on OPERATION:
9857   For file I/O, TARGET is a file name (except for the special case below).
9858   For process I/O, TARGET is a process name.
9859   For network I/O, TARGET is a service name or a port number.
9860
9861 This function looks up what is specified for TARGET in
9862 `file-coding-system-alist', `process-coding-system-alist',
9863 or `network-coding-system-alist' depending on OPERATION.
9864 They may specify a coding system, a cons of coding systems,
9865 or a function symbol to call.
9866 In the last case, we call the function with one argument,
9867 which is a list of all the arguments given to this function.
9868 If the function can't decide a coding system, it can return
9869 `undecided' so that the normal code-detection is performed.
9870
9871 If OPERATION is `insert-file-contents', the argument corresponding to
9872 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9873 file name to look up, and BUFFER is a buffer that contains the file's
9874 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9875 function to call for FILENAME, that function should examine the
9876 contents of BUFFER instead of reading the file.
9877
9878 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9879   (ptrdiff_t nargs, Lisp_Object *args)
9880 {
9881   Lisp_Object operation, target_idx, target, val;
9882   register Lisp_Object chain;
9883
9884   if (nargs < 2)
9885     error ("Too few arguments");
9886   operation = args[0];
9887   if (!SYMBOLP (operation)
9888       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9889     error ("Invalid first argument");
9890   if (nargs <= 1 + XFASTINT (target_idx))
9891     error ("Too few arguments for operation `%s'",
9892            SDATA (SYMBOL_NAME (operation)));
9893   target = args[XFASTINT (target_idx) + 1];
9894   if (!(STRINGP (target)
9895         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9896             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9897         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9898     error ("Invalid argument %"pI"d of operation `%s'",
9899            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9900   if (CONSP (target))
9901     target = XCAR (target);
9902
9903   chain = ((EQ (operation, Qinsert_file_contents)
9904             || EQ (operation, Qwrite_region))
9905            ? Vfile_coding_system_alist
9906            : (EQ (operation, Qopen_network_stream)
9907               ? Vnetwork_coding_system_alist
9908               : Vprocess_coding_system_alist));
9909   if (NILP (chain))
9910     return Qnil;
9911
9912   for (; CONSP (chain); chain = XCDR (chain))
9913     {
9914       Lisp_Object elt;
9915
9916       elt = XCAR (chain);
9917       if (CONSP (elt)
9918           && ((STRINGP (target)
9919                && STRINGP (XCAR (elt))
9920                && fast_string_match (XCAR (elt), target) >= 0)
9921               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9922         {
9923           val = XCDR (elt);
9924           /* Here, if VAL is both a valid coding system and a valid
9925              function symbol, we return VAL as a coding system.  */
9926           if (CONSP (val))
9927             return val;
9928           if (! SYMBOLP (val))
9929             return Qnil;
9930           if (! NILP (Fcoding_system_p (val)))
9931             return Fcons (val, val);
9932           if (! NILP (Ffboundp (val)))
9933             {
9934               /* We use call1 rather than safe_call1
9935                  so as to get bug reports about functions called here
9936                  which don't handle the current interface.  */
9937               val = call1 (val, Flist (nargs, args));
9938               if (CONSP (val))
9939                 return val;
9940               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9941                 return Fcons (val, val);
9942             }
9943           return Qnil;
9944         }
9945     }
9946   return Qnil;
9947 }
9948
9949 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9950        Sset_coding_system_priority, 0, MANY, 0,
9951        doc: /* Assign higher priority to the coding systems given as arguments.
9952 If multiple coding systems belong to the same category,
9953 all but the first one are ignored.
9954
9955 usage: (set-coding-system-priority &rest coding-systems)  */)
9956   (ptrdiff_t nargs, Lisp_Object *args)
9957 {
9958   ptrdiff_t i, j;
9959   bool changed[coding_category_max];
9960   enum coding_category priorities[coding_category_max];
9961
9962   memset (changed, 0, sizeof changed);
9963
9964   for (i = j = 0; i < nargs; i++)
9965     {
9966       enum coding_category category;
9967       Lisp_Object spec, attrs;
9968
9969       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9970       attrs = AREF (spec, 0);
9971       category = XINT (CODING_ATTR_CATEGORY (attrs));
9972       if (changed[category])
9973         /* Ignore this coding system because a coding system of the
9974            same category already had a higher priority.  */
9975         continue;
9976       changed[category] = 1;
9977       priorities[j++] = category;
9978       if (coding_categories[category].id >= 0
9979           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9980         setup_coding_system (args[i], &coding_categories[category]);
9981       Fset (AREF (Vcoding_category_table, category), args[i]);
9982     }
9983
9984   /* Now we have decided top J priorities.  Reflect the order of the
9985      original priorities to the remaining priorities.  */
9986
9987   for (i = j, j = 0; i < coding_category_max; i++, j++)
9988     {
9989       while (j < coding_category_max
9990              && changed[coding_priorities[j]])
9991         j++;
9992       if (j == coding_category_max)
9993         emacs_abort ();
9994       priorities[i] = coding_priorities[j];
9995     }
9996
9997   memcpy (coding_priorities, priorities, sizeof priorities);
9998
9999   /* Update `coding-category-list'.  */
10000   Vcoding_category_list = Qnil;
10001   for (i = coding_category_max; i-- > 0; )
10002     Vcoding_category_list
10003       = Fcons (AREF (Vcoding_category_table, priorities[i]),
10004                Vcoding_category_list);
10005
10006   return Qnil;
10007 }
10008
10009 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10010        Scoding_system_priority_list, 0, 1, 0,
10011        doc: /* Return a list of coding systems ordered by their priorities.
10012 The list contains a subset of coding systems; i.e. coding systems
10013 assigned to each coding category (see `coding-category-list').
10014
10015 HIGHESTP non-nil means just return the highest priority one.  */)
10016   (Lisp_Object highestp)
10017 {
10018   int i;
10019   Lisp_Object val;
10020
10021   for (i = 0, val = Qnil; i < coding_category_max; i++)
10022     {
10023       enum coding_category category = coding_priorities[i];
10024       int id = coding_categories[category].id;
10025       Lisp_Object attrs;
10026
10027       if (id < 0)
10028         continue;
10029       attrs = CODING_ID_ATTRS (id);
10030       if (! NILP (highestp))
10031         return CODING_ATTR_BASE_NAME (attrs);
10032       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10033     }
10034   return Fnreverse (val);
10035 }
10036
10037 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10038
10039 static Lisp_Object
10040 make_subsidiaries (Lisp_Object base)
10041 {
10042   Lisp_Object subsidiaries;
10043   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10044   USE_SAFE_ALLOCA;
10045   char *buf = SAFE_ALLOCA (base_name_len + 6);
10046   int i;
10047
10048   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10049   subsidiaries = make_uninit_vector (3);
10050   for (i = 0; i < 3; i++)
10051     {
10052       strcpy (buf + base_name_len, suffixes[i]);
10053       ASET (subsidiaries, i, intern (buf));
10054     }
10055   SAFE_FREE ();
10056   return subsidiaries;
10057 }
10058
10059
10060 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10061        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10062        doc: /* For internal use only.
10063 usage: (define-coding-system-internal ...)  */)
10064   (ptrdiff_t nargs, Lisp_Object *args)
10065 {
10066   Lisp_Object name;
10067   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10068   Lisp_Object attrs;            /* Vector of attributes.  */
10069   Lisp_Object eol_type;
10070   Lisp_Object aliases;
10071   Lisp_Object coding_type, charset_list, safe_charsets;
10072   enum coding_category category;
10073   Lisp_Object tail, val;
10074   int max_charset_id = 0;
10075   int i;
10076
10077   if (nargs < coding_arg_max)
10078     goto short_args;
10079
10080   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10081
10082   name = args[coding_arg_name];
10083   CHECK_SYMBOL (name);
10084   ASET (attrs, coding_attr_base_name, name);
10085
10086   val = args[coding_arg_mnemonic];
10087   if (! STRINGP (val))
10088     CHECK_CHARACTER (val);
10089   ASET (attrs, coding_attr_mnemonic, val);
10090
10091   coding_type = args[coding_arg_coding_type];
10092   CHECK_SYMBOL (coding_type);
10093   ASET (attrs, coding_attr_type, coding_type);
10094
10095   charset_list = args[coding_arg_charset_list];
10096   if (SYMBOLP (charset_list))
10097     {
10098       if (EQ (charset_list, Qiso_2022))
10099         {
10100           if (! EQ (coding_type, Qiso_2022))
10101             error ("Invalid charset-list");
10102           charset_list = Viso_2022_charset_list;
10103         }
10104       else if (EQ (charset_list, Qemacs_mule))
10105         {
10106           if (! EQ (coding_type, Qemacs_mule))
10107             error ("Invalid charset-list");
10108           charset_list = Vemacs_mule_charset_list;
10109         }
10110       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10111         {
10112           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10113             error ("Invalid charset-list");
10114           if (max_charset_id < XFASTINT (XCAR (tail)))
10115             max_charset_id = XFASTINT (XCAR (tail));
10116         }
10117     }
10118   else
10119     {
10120       charset_list = Fcopy_sequence (charset_list);
10121       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10122         {
10123           struct charset *charset;
10124
10125           val = XCAR (tail);
10126           CHECK_CHARSET_GET_CHARSET (val, charset);
10127           if (EQ (coding_type, Qiso_2022)
10128               ? CHARSET_ISO_FINAL (charset) < 0
10129               : EQ (coding_type, Qemacs_mule)
10130               ? CHARSET_EMACS_MULE_ID (charset) < 0
10131               : 0)
10132             error ("Can't handle charset `%s'",
10133                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10134
10135           XSETCAR (tail, make_number (charset->id));
10136           if (max_charset_id < charset->id)
10137             max_charset_id = charset->id;
10138         }
10139     }
10140   ASET (attrs, coding_attr_charset_list, charset_list);
10141
10142   safe_charsets = make_uninit_string (max_charset_id + 1);
10143   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10144   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10145     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10146   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10147
10148   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10149
10150   val = args[coding_arg_decode_translation_table];
10151   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10152     CHECK_SYMBOL (val);
10153   ASET (attrs, coding_attr_decode_tbl, val);
10154
10155   val = args[coding_arg_encode_translation_table];
10156   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10157     CHECK_SYMBOL (val);
10158   ASET (attrs, coding_attr_encode_tbl, val);
10159
10160   val = args[coding_arg_post_read_conversion];
10161   CHECK_SYMBOL (val);
10162   ASET (attrs, coding_attr_post_read, val);
10163
10164   val = args[coding_arg_pre_write_conversion];
10165   CHECK_SYMBOL (val);
10166   ASET (attrs, coding_attr_pre_write, val);
10167
10168   val = args[coding_arg_default_char];
10169   if (NILP (val))
10170     ASET (attrs, coding_attr_default_char, make_number (' '));
10171   else
10172     {
10173       CHECK_CHARACTER (val);
10174       ASET (attrs, coding_attr_default_char, val);
10175     }
10176
10177   val = args[coding_arg_for_unibyte];
10178   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10179
10180   val = args[coding_arg_plist];
10181   CHECK_LIST (val);
10182   ASET (attrs, coding_attr_plist, val);
10183
10184   if (EQ (coding_type, Qcharset))
10185     {
10186       /* Generate a lisp vector of 256 elements.  Each element is nil,
10187          integer, or a list of charset IDs.
10188
10189          If Nth element is nil, the byte code N is invalid in this
10190          coding system.
10191
10192          If Nth element is a number NUM, N is the first byte of a
10193          charset whose ID is NUM.
10194
10195          If Nth element is a list of charset IDs, N is the first byte
10196          of one of them.  The list is sorted by dimensions of the
10197          charsets.  A charset of smaller dimension comes first. */
10198       val = Fmake_vector (make_number (256), Qnil);
10199
10200       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10201         {
10202           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10203           int dim = CHARSET_DIMENSION (charset);
10204           int idx = (dim - 1) * 4;
10205
10206           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10207             ASET (attrs, coding_attr_ascii_compat, Qt);
10208
10209           for (i = charset->code_space[idx];
10210                i <= charset->code_space[idx + 1]; i++)
10211             {
10212               Lisp_Object tmp, tmp2;
10213               int dim2;
10214
10215               tmp = AREF (val, i);
10216               if (NILP (tmp))
10217                 tmp = XCAR (tail);
10218               else if (NUMBERP (tmp))
10219                 {
10220                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10221                   if (dim < dim2)
10222                     tmp = list2 (XCAR (tail), tmp);
10223                   else
10224                     tmp = list2 (tmp, XCAR (tail));
10225                 }
10226               else
10227                 {
10228                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10229                     {
10230                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10231                       if (dim < dim2)
10232                         break;
10233                     }
10234                   if (NILP (tmp2))
10235                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10236                   else
10237                     {
10238                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10239                       XSETCAR (tmp2, XCAR (tail));
10240                     }
10241                 }
10242               ASET (val, i, tmp);
10243             }
10244         }
10245       ASET (attrs, coding_attr_charset_valids, val);
10246       category = coding_category_charset;
10247     }
10248   else if (EQ (coding_type, Qccl))
10249     {
10250       Lisp_Object valids;
10251
10252       if (nargs < coding_arg_ccl_max)
10253         goto short_args;
10254
10255       val = args[coding_arg_ccl_decoder];
10256       CHECK_CCL_PROGRAM (val);
10257       if (VECTORP (val))
10258         val = Fcopy_sequence (val);
10259       ASET (attrs, coding_attr_ccl_decoder, val);
10260
10261       val = args[coding_arg_ccl_encoder];
10262       CHECK_CCL_PROGRAM (val);
10263       if (VECTORP (val))
10264         val = Fcopy_sequence (val);
10265       ASET (attrs, coding_attr_ccl_encoder, val);
10266
10267       val = args[coding_arg_ccl_valids];
10268       valids = Fmake_string (make_number (256), make_number (0));
10269       for (tail = val; CONSP (tail); tail = XCDR (tail))
10270         {
10271           int from, to;
10272
10273           val = XCAR (tail);
10274           if (INTEGERP (val))
10275             {
10276               if (! (0 <= XINT (val) && XINT (val) <= 255))
10277                 args_out_of_range_3 (val, make_number (0), make_number (255));
10278               from = to = XINT (val);
10279             }
10280           else
10281             {
10282               CHECK_CONS (val);
10283               CHECK_NATNUM_CAR (val);
10284               CHECK_NUMBER_CDR (val);
10285               if (XINT (XCAR (val)) > 255)
10286                 args_out_of_range_3 (XCAR (val),
10287                                      make_number (0), make_number (255));
10288               from = XINT (XCAR (val));
10289               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10290                 args_out_of_range_3 (XCDR (val),
10291                                      XCAR (val), make_number (255));
10292               to = XINT (XCDR (val));
10293             }
10294           for (i = from; i <= to; i++)
10295             SSET (valids, i, 1);
10296         }
10297       ASET (attrs, coding_attr_ccl_valids, valids);
10298
10299       category = coding_category_ccl;
10300     }
10301   else if (EQ (coding_type, Qutf_16))
10302     {
10303       Lisp_Object bom, endian;
10304
10305       ASET (attrs, coding_attr_ascii_compat, Qnil);
10306
10307       if (nargs < coding_arg_utf16_max)
10308         goto short_args;
10309
10310       bom = args[coding_arg_utf16_bom];
10311       if (! NILP (bom) && ! EQ (bom, Qt))
10312         {
10313           CHECK_CONS (bom);
10314           val = XCAR (bom);
10315           CHECK_CODING_SYSTEM (val);
10316           val = XCDR (bom);
10317           CHECK_CODING_SYSTEM (val);
10318         }
10319       ASET (attrs, coding_attr_utf_bom, bom);
10320
10321       endian = args[coding_arg_utf16_endian];
10322       CHECK_SYMBOL (endian);
10323       if (NILP (endian))
10324         endian = Qbig;
10325       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10326         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10327       ASET (attrs, coding_attr_utf_16_endian, endian);
10328
10329       category = (CONSP (bom)
10330                   ? coding_category_utf_16_auto
10331                   : NILP (bom)
10332                   ? (EQ (endian, Qbig)
10333                      ? coding_category_utf_16_be_nosig
10334                      : coding_category_utf_16_le_nosig)
10335                   : (EQ (endian, Qbig)
10336                      ? coding_category_utf_16_be
10337                      : coding_category_utf_16_le));
10338     }
10339   else if (EQ (coding_type, Qiso_2022))
10340     {
10341       Lisp_Object initial, reg_usage, request, flags;
10342
10343       if (nargs < coding_arg_iso2022_max)
10344         goto short_args;
10345
10346       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10347       CHECK_VECTOR (initial);
10348       for (i = 0; i < 4; i++)
10349         {
10350           val = AREF (initial, i);
10351           if (! NILP (val))
10352             {
10353               struct charset *charset;
10354
10355               CHECK_CHARSET_GET_CHARSET (val, charset);
10356               ASET (initial, i, make_number (CHARSET_ID (charset)));
10357               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10358                 ASET (attrs, coding_attr_ascii_compat, Qt);
10359             }
10360           else
10361             ASET (initial, i, make_number (-1));
10362         }
10363
10364       reg_usage = args[coding_arg_iso2022_reg_usage];
10365       CHECK_CONS (reg_usage);
10366       CHECK_NUMBER_CAR (reg_usage);
10367       CHECK_NUMBER_CDR (reg_usage);
10368
10369       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10370       for (tail = request; CONSP (tail); tail = XCDR (tail))
10371         {
10372           int id;
10373           Lisp_Object tmp1;
10374
10375           val = XCAR (tail);
10376           CHECK_CONS (val);
10377           tmp1 = XCAR (val);
10378           CHECK_CHARSET_GET_ID (tmp1, id);
10379           CHECK_NATNUM_CDR (val);
10380           if (XINT (XCDR (val)) >= 4)
10381             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10382           XSETCAR (val, make_number (id));
10383         }
10384
10385       flags = args[coding_arg_iso2022_flags];
10386       CHECK_NATNUM (flags);
10387       i = XINT (flags) & INT_MAX;
10388       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10389         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10390       flags = make_number (i);
10391
10392       ASET (attrs, coding_attr_iso_initial, initial);
10393       ASET (attrs, coding_attr_iso_usage, reg_usage);
10394       ASET (attrs, coding_attr_iso_request, request);
10395       ASET (attrs, coding_attr_iso_flags, flags);
10396       setup_iso_safe_charsets (attrs);
10397
10398       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10399         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10400                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10401                     ? coding_category_iso_7_else
10402                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10403                     ? coding_category_iso_7
10404                     : coding_category_iso_7_tight);
10405       else
10406         {
10407           int id = XINT (AREF (initial, 1));
10408
10409           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10410                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10411                        || id < 0)
10412                       ? coding_category_iso_8_else
10413                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10414                       ? coding_category_iso_8_1
10415                       : coding_category_iso_8_2);
10416         }
10417       if (category != coding_category_iso_8_1
10418           && category != coding_category_iso_8_2)
10419         ASET (attrs, coding_attr_ascii_compat, Qnil);
10420     }
10421   else if (EQ (coding_type, Qemacs_mule))
10422     {
10423       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10424         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10425       ASET (attrs, coding_attr_ascii_compat, Qt);
10426       category = coding_category_emacs_mule;
10427     }
10428   else if (EQ (coding_type, Qshift_jis))
10429     {
10430
10431       struct charset *charset;
10432
10433       if (XINT (Flength (charset_list)) != 3
10434           && XINT (Flength (charset_list)) != 4)
10435         error ("There should be three or four charsets");
10436
10437       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10438       if (CHARSET_DIMENSION (charset) != 1)
10439         error ("Dimension of charset %s is not one",
10440                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10441       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10442         ASET (attrs, coding_attr_ascii_compat, Qt);
10443
10444       charset_list = XCDR (charset_list);
10445       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10446       if (CHARSET_DIMENSION (charset) != 1)
10447         error ("Dimension of charset %s is not one",
10448                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10449
10450       charset_list = XCDR (charset_list);
10451       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10452       if (CHARSET_DIMENSION (charset) != 2)
10453         error ("Dimension of charset %s is not two",
10454                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10455
10456       charset_list = XCDR (charset_list);
10457       if (! NILP (charset_list))
10458         {
10459           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10460           if (CHARSET_DIMENSION (charset) != 2)
10461             error ("Dimension of charset %s is not two",
10462                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10463         }
10464
10465       category = coding_category_sjis;
10466       Vsjis_coding_system = name;
10467     }
10468   else if (EQ (coding_type, Qbig5))
10469     {
10470       struct charset *charset;
10471
10472       if (XINT (Flength (charset_list)) != 2)
10473         error ("There should be just two charsets");
10474
10475       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10476       if (CHARSET_DIMENSION (charset) != 1)
10477         error ("Dimension of charset %s is not one",
10478                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10479       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10480         ASET (attrs, coding_attr_ascii_compat, Qt);
10481
10482       charset_list = XCDR (charset_list);
10483       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10484       if (CHARSET_DIMENSION (charset) != 2)
10485         error ("Dimension of charset %s is not two",
10486                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10487
10488       category = coding_category_big5;
10489       Vbig5_coding_system = name;
10490     }
10491   else if (EQ (coding_type, Qraw_text))
10492     {
10493       category = coding_category_raw_text;
10494       ASET (attrs, coding_attr_ascii_compat, Qt);
10495     }
10496   else if (EQ (coding_type, Qutf_8))
10497     {
10498       Lisp_Object bom;
10499
10500       if (nargs < coding_arg_utf8_max)
10501         goto short_args;
10502
10503       bom = args[coding_arg_utf8_bom];
10504       if (! NILP (bom) && ! EQ (bom, Qt))
10505         {
10506           CHECK_CONS (bom);
10507           val = XCAR (bom);
10508           CHECK_CODING_SYSTEM (val);
10509           val = XCDR (bom);
10510           CHECK_CODING_SYSTEM (val);
10511         }
10512       ASET (attrs, coding_attr_utf_bom, bom);
10513       if (NILP (bom))
10514         ASET (attrs, coding_attr_ascii_compat, Qt);
10515
10516       category = (CONSP (bom) ? coding_category_utf_8_auto
10517                   : NILP (bom) ? coding_category_utf_8_nosig
10518                   : coding_category_utf_8_sig);
10519     }
10520   else if (EQ (coding_type, Qundecided))
10521     {
10522       if (nargs < coding_arg_undecided_max)
10523         goto short_args;
10524       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10525             args[coding_arg_undecided_inhibit_null_byte_detection]);
10526       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10527             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10528       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10529             args[coding_arg_undecided_prefer_utf_8]);
10530       category = coding_category_undecided;
10531     }
10532   else
10533     error ("Invalid coding system type: %s",
10534            SDATA (SYMBOL_NAME (coding_type)));
10535
10536   ASET (attrs, coding_attr_category, make_number (category));
10537   ASET (attrs, coding_attr_plist,
10538         Fcons (QCcategory,
10539                Fcons (AREF (Vcoding_category_table, category),
10540                       CODING_ATTR_PLIST (attrs))));
10541   ASET (attrs, coding_attr_plist,
10542         Fcons (QCascii_compatible_p,
10543                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10544                       CODING_ATTR_PLIST (attrs))));
10545
10546   eol_type = args[coding_arg_eol_type];
10547   if (! NILP (eol_type)
10548       && ! EQ (eol_type, Qunix)
10549       && ! EQ (eol_type, Qdos)
10550       && ! EQ (eol_type, Qmac))
10551     error ("Invalid eol-type");
10552
10553   aliases = list1 (name);
10554
10555   if (NILP (eol_type))
10556     {
10557       eol_type = make_subsidiaries (name);
10558       for (i = 0; i < 3; i++)
10559         {
10560           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10561
10562           this_name = AREF (eol_type, i);
10563           this_aliases = list1 (this_name);
10564           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10565           this_spec = make_uninit_vector (3);
10566           ASET (this_spec, 0, attrs);
10567           ASET (this_spec, 1, this_aliases);
10568           ASET (this_spec, 2, this_eol_type);
10569           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10570           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10571           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10572           if (NILP (val))
10573             Vcoding_system_alist
10574               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10575                        Vcoding_system_alist);
10576         }
10577     }
10578
10579   spec_vec = make_uninit_vector (3);
10580   ASET (spec_vec, 0, attrs);
10581   ASET (spec_vec, 1, aliases);
10582   ASET (spec_vec, 2, eol_type);
10583
10584   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10585   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10586   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10587   if (NILP (val))
10588     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10589                                   Vcoding_system_alist);
10590
10591   {
10592     int id = coding_categories[category].id;
10593
10594     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10595       setup_coding_system (name, &coding_categories[category]);
10596   }
10597
10598   return Qnil;
10599
10600  short_args:
10601   return Fsignal (Qwrong_number_of_arguments,
10602                   Fcons (intern ("define-coding-system-internal"),
10603                          make_number (nargs)));
10604 }
10605
10606
10607 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10608        3, 3, 0,
10609        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10610   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10611 {
10612   Lisp_Object spec, attrs;
10613
10614   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10615   attrs = AREF (spec, 0);
10616   if (EQ (prop, QCmnemonic))
10617     {
10618       if (! STRINGP (val))
10619         CHECK_CHARACTER (val);
10620       ASET (attrs, coding_attr_mnemonic, val);
10621     }
10622   else if (EQ (prop, QCdefault_char))
10623     {
10624       if (NILP (val))
10625         val = make_number (' ');
10626       else
10627         CHECK_CHARACTER (val);
10628       ASET (attrs, coding_attr_default_char, val);
10629     }
10630   else if (EQ (prop, QCdecode_translation_table))
10631     {
10632       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10633         CHECK_SYMBOL (val);
10634       ASET (attrs, coding_attr_decode_tbl, val);
10635     }
10636   else if (EQ (prop, QCencode_translation_table))
10637     {
10638       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10639         CHECK_SYMBOL (val);
10640       ASET (attrs, coding_attr_encode_tbl, val);
10641     }
10642   else if (EQ (prop, QCpost_read_conversion))
10643     {
10644       CHECK_SYMBOL (val);
10645       ASET (attrs, coding_attr_post_read, val);
10646     }
10647   else if (EQ (prop, QCpre_write_conversion))
10648     {
10649       CHECK_SYMBOL (val);
10650       ASET (attrs, coding_attr_pre_write, val);
10651     }
10652   else if (EQ (prop, QCascii_compatible_p))
10653     {
10654       ASET (attrs, coding_attr_ascii_compat, val);
10655     }
10656
10657   ASET (attrs, coding_attr_plist,
10658         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10659   return val;
10660 }
10661
10662
10663 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10664        Sdefine_coding_system_alias, 2, 2, 0,
10665        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10666   (Lisp_Object alias, Lisp_Object coding_system)
10667 {
10668   Lisp_Object spec, aliases, eol_type, val;
10669
10670   CHECK_SYMBOL (alias);
10671   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10672   aliases = AREF (spec, 1);
10673   /* ALIASES should be a list of length more than zero, and the first
10674      element is a base coding system.  Append ALIAS at the tail of the
10675      list.  */
10676   while (!NILP (XCDR (aliases)))
10677     aliases = XCDR (aliases);
10678   XSETCDR (aliases, list1 (alias));
10679
10680   eol_type = AREF (spec, 2);
10681   if (VECTORP (eol_type))
10682     {
10683       Lisp_Object subsidiaries;
10684       int i;
10685
10686       subsidiaries = make_subsidiaries (alias);
10687       for (i = 0; i < 3; i++)
10688         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10689                                      AREF (eol_type, i));
10690     }
10691
10692   Fputhash (alias, spec, Vcoding_system_hash_table);
10693   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10694   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10695   if (NILP (val))
10696     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10697                                   Vcoding_system_alist);
10698
10699   return Qnil;
10700 }
10701
10702 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10703        1, 1, 0,
10704        doc: /* Return the base of CODING-SYSTEM.
10705 Any alias or subsidiary coding system is not a base coding system.  */)
10706   (Lisp_Object coding_system)
10707 {
10708   Lisp_Object spec, attrs;
10709
10710   if (NILP (coding_system))
10711     return (Qno_conversion);
10712   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10713   attrs = AREF (spec, 0);
10714   return CODING_ATTR_BASE_NAME (attrs);
10715 }
10716
10717 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10718        1, 1, 0,
10719        doc: "Return the property list of CODING-SYSTEM.")
10720   (Lisp_Object coding_system)
10721 {
10722   Lisp_Object spec, attrs;
10723
10724   if (NILP (coding_system))
10725     coding_system = Qno_conversion;
10726   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10727   attrs = AREF (spec, 0);
10728   return CODING_ATTR_PLIST (attrs);
10729 }
10730
10731
10732 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10733        1, 1, 0,
10734        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10735   (Lisp_Object coding_system)
10736 {
10737   Lisp_Object spec;
10738
10739   if (NILP (coding_system))
10740     coding_system = Qno_conversion;
10741   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10742   return AREF (spec, 1);
10743 }
10744
10745 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10746        Scoding_system_eol_type, 1, 1, 0,
10747        doc: /* Return eol-type of CODING-SYSTEM.
10748 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10749
10750 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10751 and CR respectively.
10752
10753 A vector value indicates that a format of end-of-line should be
10754 detected automatically.  Nth element of the vector is the subsidiary
10755 coding system whose eol-type is N.  */)
10756   (Lisp_Object coding_system)
10757 {
10758   Lisp_Object spec, eol_type;
10759   int n;
10760
10761   if (NILP (coding_system))
10762     coding_system = Qno_conversion;
10763   if (! CODING_SYSTEM_P (coding_system))
10764     return Qnil;
10765   spec = CODING_SYSTEM_SPEC (coding_system);
10766   eol_type = AREF (spec, 2);
10767   if (VECTORP (eol_type))
10768     return Fcopy_sequence (eol_type);
10769   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10770   return make_number (n);
10771 }
10772
10773 #endif /* emacs */
10774
10775 \f
10776 /*** 12. Postamble ***/
10777
10778 void
10779 init_coding_once (void)
10780 {
10781   int i;
10782
10783   for (i = 0; i < coding_category_max; i++)
10784     {
10785       coding_categories[i].id = -1;
10786       coding_priorities[i] = i;
10787     }
10788
10789   /* ISO2022 specific initialize routine.  */
10790   for (i = 0; i < 0x20; i++)
10791     iso_code_class[i] = ISO_control_0;
10792   for (i = 0x21; i < 0x7F; i++)
10793     iso_code_class[i] = ISO_graphic_plane_0;
10794   for (i = 0x80; i < 0xA0; i++)
10795     iso_code_class[i] = ISO_control_1;
10796   for (i = 0xA1; i < 0xFF; i++)
10797     iso_code_class[i] = ISO_graphic_plane_1;
10798   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10799   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10800   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10801   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10802   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10803   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10804   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10805   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10806   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10807
10808   for (i = 0; i < 256; i++)
10809     {
10810       emacs_mule_bytes[i] = 1;
10811     }
10812   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10813   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10814   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10815   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10816 }
10817
10818 #ifdef emacs
10819
10820 void
10821 syms_of_coding (void)
10822 {
10823   staticpro (&Vcoding_system_hash_table);
10824   {
10825     Lisp_Object args[2];
10826     args[0] = QCtest;
10827     args[1] = Qeq;
10828     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10829   }
10830
10831   staticpro (&Vsjis_coding_system);
10832   Vsjis_coding_system = Qnil;
10833
10834   staticpro (&Vbig5_coding_system);
10835   Vbig5_coding_system = Qnil;
10836
10837   staticpro (&Vcode_conversion_reused_workbuf);
10838   Vcode_conversion_reused_workbuf = Qnil;
10839
10840   staticpro (&Vcode_conversion_workbuf_name);
10841   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10842
10843   reused_workbuf_in_use = 0;
10844
10845   DEFSYM (Qcharset, "charset");
10846   DEFSYM (Qtarget_idx, "target-idx");
10847   DEFSYM (Qcoding_system_history, "coding-system-history");
10848   Fset (Qcoding_system_history, Qnil);
10849
10850   /* Target FILENAME is the first argument.  */
10851   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10852   /* Target FILENAME is the third argument.  */
10853   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10854
10855   DEFSYM (Qcall_process, "call-process");
10856   /* Target PROGRAM is the first argument.  */
10857   Fput (Qcall_process, Qtarget_idx, make_number (0));
10858
10859   DEFSYM (Qcall_process_region, "call-process-region");
10860   /* Target PROGRAM is the third argument.  */
10861   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10862
10863   DEFSYM (Qstart_process, "start-process");
10864   /* Target PROGRAM is the third argument.  */
10865   Fput (Qstart_process, Qtarget_idx, make_number (2));
10866
10867   DEFSYM (Qopen_network_stream, "open-network-stream");
10868   /* Target SERVICE is the fourth argument.  */
10869   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10870
10871   DEFSYM (Qcoding_system, "coding-system");
10872   DEFSYM (Qcoding_aliases, "coding-aliases");
10873
10874   DEFSYM (Qeol_type, "eol-type");
10875   DEFSYM (Qunix, "unix");
10876   DEFSYM (Qdos, "dos");
10877   DEFSYM (Qmac, "mac");
10878
10879   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10880   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10881   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10882   DEFSYM (Qdefault_char, "default-char");
10883   DEFSYM (Qundecided, "undecided");
10884   DEFSYM (Qno_conversion, "no-conversion");
10885   DEFSYM (Qraw_text, "raw-text");
10886
10887   DEFSYM (Qiso_2022, "iso-2022");
10888
10889   DEFSYM (Qutf_8, "utf-8");
10890   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10891
10892 #if defined (WINDOWSNT) || defined (CYGWIN)
10893   /* No, not utf-16-le: that one has a BOM.  */
10894   DEFSYM (Qutf_16le, "utf-16le");
10895 #endif
10896
10897   DEFSYM (Qutf_16, "utf-16");
10898   DEFSYM (Qbig, "big");
10899   DEFSYM (Qlittle, "little");
10900
10901   DEFSYM (Qshift_jis, "shift-jis");
10902   DEFSYM (Qbig5, "big5");
10903
10904   DEFSYM (Qcoding_system_p, "coding-system-p");
10905
10906   DEFSYM (Qcoding_system_error, "coding-system-error");
10907   Fput (Qcoding_system_error, Qerror_conditions,
10908         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10909   Fput (Qcoding_system_error, Qerror_message,
10910         build_pure_c_string ("Invalid coding system"));
10911
10912   DEFSYM (Qtranslation_table, "translation-table");
10913   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10914   DEFSYM (Qtranslation_table_id, "translation-table-id");
10915   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10916   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10917
10918   DEFSYM (Qvalid_codes, "valid-codes");
10919
10920   DEFSYM (Qemacs_mule, "emacs-mule");
10921
10922   DEFSYM (QCcategory, ":category");
10923   DEFSYM (QCmnemonic, ":mnemonic");
10924   DEFSYM (QCdefault_char, ":default-char");
10925   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10926   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10927   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10928   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10929   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10930
10931   Vcoding_category_table
10932     = Fmake_vector (make_number (coding_category_max), Qnil);
10933   staticpro (&Vcoding_category_table);
10934   /* The following are targets of code detection.  */
10935   ASET (Vcoding_category_table, coding_category_iso_7,
10936         intern_c_string ("coding-category-iso-7"));
10937   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10938         intern_c_string ("coding-category-iso-7-tight"));
10939   ASET (Vcoding_category_table, coding_category_iso_8_1,
10940         intern_c_string ("coding-category-iso-8-1"));
10941   ASET (Vcoding_category_table, coding_category_iso_8_2,
10942         intern_c_string ("coding-category-iso-8-2"));
10943   ASET (Vcoding_category_table, coding_category_iso_7_else,
10944         intern_c_string ("coding-category-iso-7-else"));
10945   ASET (Vcoding_category_table, coding_category_iso_8_else,
10946         intern_c_string ("coding-category-iso-8-else"));
10947   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10948         intern_c_string ("coding-category-utf-8-auto"));
10949   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10950         intern_c_string ("coding-category-utf-8"));
10951   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10952         intern_c_string ("coding-category-utf-8-sig"));
10953   ASET (Vcoding_category_table, coding_category_utf_16_be,
10954         intern_c_string ("coding-category-utf-16-be"));
10955   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10956         intern_c_string ("coding-category-utf-16-auto"));
10957   ASET (Vcoding_category_table, coding_category_utf_16_le,
10958         intern_c_string ("coding-category-utf-16-le"));
10959   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10960         intern_c_string ("coding-category-utf-16-be-nosig"));
10961   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10962         intern_c_string ("coding-category-utf-16-le-nosig"));
10963   ASET (Vcoding_category_table, coding_category_charset,
10964         intern_c_string ("coding-category-charset"));
10965   ASET (Vcoding_category_table, coding_category_sjis,
10966         intern_c_string ("coding-category-sjis"));
10967   ASET (Vcoding_category_table, coding_category_big5,
10968         intern_c_string ("coding-category-big5"));
10969   ASET (Vcoding_category_table, coding_category_ccl,
10970         intern_c_string ("coding-category-ccl"));
10971   ASET (Vcoding_category_table, coding_category_emacs_mule,
10972         intern_c_string ("coding-category-emacs-mule"));
10973   /* The following are NOT targets of code detection.  */
10974   ASET (Vcoding_category_table, coding_category_raw_text,
10975         intern_c_string ("coding-category-raw-text"));
10976   ASET (Vcoding_category_table, coding_category_undecided,
10977         intern_c_string ("coding-category-undecided"));
10978
10979   DEFSYM (Qinsufficient_source, "insufficient-source");
10980   DEFSYM (Qinvalid_source, "invalid-source");
10981   DEFSYM (Qinterrupted, "interrupted");
10982   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10983
10984   defsubr (&Scoding_system_p);
10985   defsubr (&Sread_coding_system);
10986   defsubr (&Sread_non_nil_coding_system);
10987   defsubr (&Scheck_coding_system);
10988   defsubr (&Sdetect_coding_region);
10989   defsubr (&Sdetect_coding_string);
10990   defsubr (&Sfind_coding_systems_region_internal);
10991   defsubr (&Sunencodable_char_position);
10992   defsubr (&Scheck_coding_systems_region);
10993   defsubr (&Sdecode_coding_region);
10994   defsubr (&Sencode_coding_region);
10995   defsubr (&Sdecode_coding_string);
10996   defsubr (&Sencode_coding_string);
10997   defsubr (&Sdecode_sjis_char);
10998   defsubr (&Sencode_sjis_char);
10999   defsubr (&Sdecode_big5_char);
11000   defsubr (&Sencode_big5_char);
11001   defsubr (&Sset_terminal_coding_system_internal);
11002   defsubr (&Sset_safe_terminal_coding_system_internal);
11003   defsubr (&Sterminal_coding_system);
11004   defsubr (&Sset_keyboard_coding_system_internal);
11005   defsubr (&Skeyboard_coding_system);
11006   defsubr (&Sfind_operation_coding_system);
11007   defsubr (&Sset_coding_system_priority);
11008   defsubr (&Sdefine_coding_system_internal);
11009   defsubr (&Sdefine_coding_system_alias);
11010   defsubr (&Scoding_system_put);
11011   defsubr (&Scoding_system_base);
11012   defsubr (&Scoding_system_plist);
11013   defsubr (&Scoding_system_aliases);
11014   defsubr (&Scoding_system_eol_type);
11015   defsubr (&Scoding_system_priority_list);
11016
11017   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11018                doc: /* List of coding systems.
11019
11020 Do not alter the value of this variable manually.  This variable should be
11021 updated by the functions `define-coding-system' and
11022 `define-coding-system-alias'.  */);
11023   Vcoding_system_list = Qnil;
11024
11025   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11026                doc: /* Alist of coding system names.
11027 Each element is one element list of coding system name.
11028 This variable is given to `completing-read' as COLLECTION argument.
11029
11030 Do not alter the value of this variable manually.  This variable should be
11031 updated by the functions `define-coding-system' and
11032 `define-coding-system-alias'.  */);
11033   Vcoding_system_alist = Qnil;
11034
11035   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11036                doc: /* List of coding-categories (symbols) ordered by priority.
11037
11038 On detecting a coding system, Emacs tries code detection algorithms
11039 associated with each coding-category one by one in this order.  When
11040 one algorithm agrees with a byte sequence of source text, the coding
11041 system bound to the corresponding coding-category is selected.
11042
11043 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11044   {
11045     int i;
11046
11047     Vcoding_category_list = Qnil;
11048     for (i = coding_category_max - 1; i >= 0; i--)
11049       Vcoding_category_list
11050         = Fcons (AREF (Vcoding_category_table, i),
11051                  Vcoding_category_list);
11052   }
11053
11054   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11055                doc: /* Specify the coding system for read operations.
11056 It is useful to bind this variable with `let', but do not set it globally.
11057 If the value is a coding system, it is used for decoding on read operation.
11058 If not, an appropriate element is used from one of the coding system alists.
11059 There are three such tables: `file-coding-system-alist',
11060 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11061   Vcoding_system_for_read = Qnil;
11062
11063   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11064                doc: /* Specify the coding system for write operations.
11065 Programs bind this variable with `let', but you should not set it globally.
11066 If the value is a coding system, it is used for encoding of output,
11067 when writing it to a file and when sending it to a file or subprocess.
11068
11069 If this does not specify a coding system, an appropriate element
11070 is used from one of the coding system alists.
11071 There are three such tables: `file-coding-system-alist',
11072 `process-coding-system-alist', and `network-coding-system-alist'.
11073 For output to files, if the above procedure does not specify a coding system,
11074 the value of `buffer-file-coding-system' is used.  */);
11075   Vcoding_system_for_write = Qnil;
11076
11077   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11078                doc: /*
11079 Coding system used in the latest file or process I/O.  */);
11080   Vlast_coding_system_used = Qnil;
11081
11082   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11083                doc: /*
11084 Error status of the last code conversion.
11085
11086 When an error was detected in the last code conversion, this variable
11087 is set to one of the following symbols.
11088   `insufficient-source'
11089   `inconsistent-eol'
11090   `invalid-source'
11091   `interrupted'
11092   `insufficient-memory'
11093 When no error was detected, the value doesn't change.  So, to check
11094 the error status of a code conversion by this variable, you must
11095 explicitly set this variable to nil before performing code
11096 conversion.  */);
11097   Vlast_code_conversion_error = Qnil;
11098
11099   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11100                doc: /*
11101 *Non-nil means always inhibit code conversion of end-of-line format.
11102 See info node `Coding Systems' and info node `Text and Binary' concerning
11103 such conversion.  */);
11104   inhibit_eol_conversion = 0;
11105
11106   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11107                doc: /*
11108 Non-nil means process buffer inherits coding system of process output.
11109 Bind it to t if the process output is to be treated as if it were a file
11110 read from some filesystem.  */);
11111   inherit_process_coding_system = 0;
11112
11113   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11114                doc: /*
11115 Alist to decide a coding system to use for a file I/O operation.
11116 The format is ((PATTERN . VAL) ...),
11117 where PATTERN is a regular expression matching a file name,
11118 VAL is a coding system, a cons of coding systems, or a function symbol.
11119 If VAL is a coding system, it is used for both decoding and encoding
11120 the file contents.
11121 If VAL is a cons of coding systems, the car part is used for decoding,
11122 and the cdr part is used for encoding.
11123 If VAL is a function symbol, the function must return a coding system
11124 or a cons of coding systems which are used as above.  The function is
11125 called with an argument that is a list of the arguments with which
11126 `find-operation-coding-system' was called.  If the function can't decide
11127 a coding system, it can return `undecided' so that the normal
11128 code-detection is performed.
11129
11130 See also the function `find-operation-coding-system'
11131 and the variable `auto-coding-alist'.  */);
11132   Vfile_coding_system_alist = Qnil;
11133
11134   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11135                doc: /*
11136 Alist to decide a coding system to use for a process I/O operation.
11137 The format is ((PATTERN . VAL) ...),
11138 where PATTERN is a regular expression matching a program name,
11139 VAL is a coding system, a cons of coding systems, or a function symbol.
11140 If VAL is a coding system, it is used for both decoding what received
11141 from the program and encoding what sent to the program.
11142 If VAL is a cons of coding systems, the car part is used for decoding,
11143 and the cdr part is used for encoding.
11144 If VAL is a function symbol, the function must return a coding system
11145 or a cons of coding systems which are used as above.
11146
11147 See also the function `find-operation-coding-system'.  */);
11148   Vprocess_coding_system_alist = Qnil;
11149
11150   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11151                doc: /*
11152 Alist to decide a coding system to use for a network I/O operation.
11153 The format is ((PATTERN . VAL) ...),
11154 where PATTERN is a regular expression matching a network service name
11155 or is a port number to connect to,
11156 VAL is a coding system, a cons of coding systems, or a function symbol.
11157 If VAL is a coding system, it is used for both decoding what received
11158 from the network stream and encoding what sent to the network stream.
11159 If VAL is a cons of coding systems, the car part is used for decoding,
11160 and the cdr part is used for encoding.
11161 If VAL is a function symbol, the function must return a coding system
11162 or a cons of coding systems which are used as above.
11163
11164 See also the function `find-operation-coding-system'.  */);
11165   Vnetwork_coding_system_alist = Qnil;
11166
11167   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11168                doc: /* Coding system to use with system messages.
11169 Also used for decoding keyboard input on X Window system.  */);
11170   Vlocale_coding_system = Qnil;
11171
11172   /* The eol mnemonics are reset in startup.el system-dependently.  */
11173   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11174                doc: /*
11175 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11176   eol_mnemonic_unix = build_pure_c_string (":");
11177
11178   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11179                doc: /*
11180 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11181   eol_mnemonic_dos = build_pure_c_string ("\\");
11182
11183   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11184                doc: /*
11185 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11186   eol_mnemonic_mac = build_pure_c_string ("/");
11187
11188   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11189                doc: /*
11190 *String displayed in mode line when end-of-line format is not yet determined.  */);
11191   eol_mnemonic_undecided = build_pure_c_string (":");
11192
11193   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11194                doc: /*
11195 *Non-nil enables character translation while encoding and decoding.  */);
11196   Venable_character_translation = Qt;
11197
11198   DEFVAR_LISP ("standard-translation-table-for-decode",
11199                Vstandard_translation_table_for_decode,
11200                doc: /* Table for translating characters while decoding.  */);
11201   Vstandard_translation_table_for_decode = Qnil;
11202
11203   DEFVAR_LISP ("standard-translation-table-for-encode",
11204                Vstandard_translation_table_for_encode,
11205                doc: /* Table for translating characters while encoding.  */);
11206   Vstandard_translation_table_for_encode = Qnil;
11207
11208   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11209                doc: /* Alist of charsets vs revision numbers.
11210 While encoding, if a charset (car part of an element) is found,
11211 designate it with the escape sequence identifying revision (cdr part
11212 of the element).  */);
11213   Vcharset_revision_table = Qnil;
11214
11215   DEFVAR_LISP ("default-process-coding-system",
11216                Vdefault_process_coding_system,
11217                doc: /* Cons of coding systems used for process I/O by default.
11218 The car part is used for decoding a process output,
11219 the cdr part is used for encoding a text to be sent to a process.  */);
11220   Vdefault_process_coding_system = Qnil;
11221
11222   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11223                doc: /*
11224 Table of extra Latin codes in the range 128..159 (inclusive).
11225 This is a vector of length 256.
11226 If Nth element is non-nil, the existence of code N in a file
11227 \(or output of subprocess) doesn't prevent it to be detected as
11228 a coding system of ISO 2022 variant which has a flag
11229 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11230 or reading output of a subprocess.
11231 Only 128th through 159th elements have a meaning.  */);
11232   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11233
11234   DEFVAR_LISP ("select-safe-coding-system-function",
11235                Vselect_safe_coding_system_function,
11236                doc: /*
11237 Function to call to select safe coding system for encoding a text.
11238
11239 If set, this function is called to force a user to select a proper
11240 coding system which can encode the text in the case that a default
11241 coding system used in each operation can't encode the text.  The
11242 function should take care that the buffer is not modified while
11243 the coding system is being selected.
11244
11245 The default value is `select-safe-coding-system' (which see).  */);
11246   Vselect_safe_coding_system_function = Qnil;
11247
11248   DEFVAR_BOOL ("coding-system-require-warning",
11249                coding_system_require_warning,
11250                doc: /* Internal use only.
11251 If non-nil, on writing a file, `select-safe-coding-system-function' is
11252 called even if `coding-system-for-write' is non-nil.  The command
11253 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11254   coding_system_require_warning = 0;
11255
11256
11257   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11258                inhibit_iso_escape_detection,
11259                doc: /*
11260 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11261
11262 When Emacs reads text, it tries to detect how the text is encoded.
11263 This code detection is sensitive to escape sequences.  If Emacs sees
11264 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11265 of the ISO2022 encodings, and decodes text by the corresponding coding
11266 system (e.g. `iso-2022-7bit').
11267
11268 However, there may be a case that you want to read escape sequences in
11269 a file as is.  In such a case, you can set this variable to non-nil.
11270 Then the code detection will ignore any escape sequences, and no text is
11271 detected as encoded in some ISO-2022 encoding.  The result is that all
11272 escape sequences become visible in a buffer.
11273
11274 The default value is nil, and it is strongly recommended not to change
11275 it.  That is because many Emacs Lisp source files that contain
11276 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11277 in Emacs's distribution, and they won't be decoded correctly on
11278 reading if you suppress escape sequence detection.
11279
11280 The other way to read escape sequences in a file without decoding is
11281 to explicitly specify some coding system that doesn't use ISO-2022
11282 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11283   inhibit_iso_escape_detection = 0;
11284
11285   DEFVAR_BOOL ("inhibit-null-byte-detection",
11286                inhibit_null_byte_detection,
11287                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11288 By default, Emacs treats it as binary data, and does not attempt to
11289 decode it.  The effect is as if you specified `no-conversion' for
11290 reading that text.
11291
11292 Set this to non-nil when a regular text happens to include null bytes.
11293 Examples are Index nodes of Info files and null-byte delimited output
11294 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11295 decode text as usual.  */);
11296   inhibit_null_byte_detection = 0;
11297
11298   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11299                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11300 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11301   disable_ascii_optimization = 0;
11302
11303   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11304                doc: /* Char table for translating self-inserting characters.
11305 This is applied to the result of input methods, not their input.
11306 See also `keyboard-translate-table'.
11307
11308 Use of this variable for character code unification was rendered
11309 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11310 internal character representation.  */);
11311     Vtranslation_table_for_input = Qnil;
11312
11313   {
11314     Lisp_Object args[coding_arg_undecided_max];
11315     Lisp_Object plist[16];
11316     int i;
11317
11318     for (i = 0; i < coding_arg_undecided_max; i++)
11319       args[i] = Qnil;
11320
11321     plist[0] = intern_c_string (":name");
11322     plist[1] = args[coding_arg_name] = Qno_conversion;
11323     plist[2] = intern_c_string (":mnemonic");
11324     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11325     plist[4] = intern_c_string (":coding-type");
11326     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11327     plist[6] = intern_c_string (":ascii-compatible-p");
11328     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11329     plist[8] = intern_c_string (":default-char");
11330     plist[9] = args[coding_arg_default_char] = make_number (0);
11331     plist[10] = intern_c_string (":for-unibyte");
11332     plist[11] = args[coding_arg_for_unibyte] = Qt;
11333     plist[12] = intern_c_string (":docstring");
11334     plist[13] = build_pure_c_string ("Do no conversion.\n\
11335 \n\
11336 When you visit a file with this coding, the file is read into a\n\
11337 unibyte buffer as is, thus each byte of a file is treated as a\n\
11338 character.");
11339     plist[14] = intern_c_string (":eol-type");
11340     plist[15] = args[coding_arg_eol_type] = Qunix;
11341     args[coding_arg_plist] = Flist (16, plist);
11342     Fdefine_coding_system_internal (coding_arg_max, args);
11343
11344     plist[1] = args[coding_arg_name] = Qundecided;
11345     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11346     plist[5] = args[coding_arg_coding_type] = Qundecided;
11347     /* This is already set.
11348        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11349     plist[8] = intern_c_string (":charset-list");
11350     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11351     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11352     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11353     plist[15] = args[coding_arg_eol_type] = Qnil;
11354     args[coding_arg_plist] = Flist (16, plist);
11355     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11356     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11357     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11358   }
11359
11360   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11361
11362   {
11363     int i;
11364
11365     for (i = 0; i < coding_category_max; i++)
11366       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11367   }
11368 #if defined (DOS_NT)
11369   system_eol_type = Qdos;
11370 #else
11371   system_eol_type = Qunix;
11372 #endif
11373   staticpro (&system_eol_type);
11374 }
11375
11376 char *
11377 emacs_strerror (int error_number)
11378 {
11379   char *str;
11380
11381   synchronize_system_messages_locale ();
11382   str = strerror (error_number);
11383
11384   if (! NILP (Vlocale_coding_system))
11385     {
11386       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11387                                                       Vlocale_coding_system,
11388                                                       0);
11389       str = SSDATA (dec);
11390     }
11391
11392   return str;
11393 }
11394
11395 #endif /* emacs */