src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2015 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   (CODING_ISO_INVOCATION (coding, plane) < 0 ? -1       \
 399    : CODING_ISO_DESIGNATION (coding, CODING_ISO_INVOCATION (coding, plane)))
 400 #define CODING_ISO_CMP_STATUS(coding)   \
 401   (&(coding)->spec.iso_2022.cmp_status)
 402 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 403   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 404 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 405   ((coding)->spec.iso_2022.embedded_utf_8)
 406
 407 /* Control characters of ISO2022.  */
 408                         /* code */      /* function */
 409 #define ISO_CODE_SO     0x0E            /* shift-out */
 410 #define ISO_CODE_SI     0x0F            /* shift-in */
 411 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 412 #define ISO_CODE_ESC    0x1B            /* escape */
 413 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 414 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 415 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 416
 417 /* All code (1-byte) of ISO2022 is classified into one of the
 418    followings.  */
 419 enum iso_code_class_type
 420   {
 421     ISO_control_0,              /* Control codes in the range
 422                                    0x00..0x1F and 0x7F, except for the
 423                                    following 5 codes.  */
 424     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 425     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 426     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 427     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 428     ISO_control_1,              /* Control codes in the range
 429                                    0x80..0x9F, except for the
 430                                    following 3 codes.  */
 431     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 432     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 433     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 434     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 435     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 436     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 437     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 438   };
 439
 440 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 441     `iso-flags' attribute of an iso2022 coding system.  */
 442
 443 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 444    instead of the correct short-form sequence (e.g. ESC $ A).  */
 445 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 446
 447 /* If set, reset graphic planes and registers at end-of-line to the
 448    initial state.  */
 449 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 450
 451 /* If set, reset graphic planes and registers before any control
 452    characters to the initial state.  */
 453 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 454
 455 /* If set, encode by 7-bit environment.  */
 456 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 457
 458 /* If set, use locking-shift function.  */
 459 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 460
 461 /* If set, use single-shift function.  Overwrite
 462    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 463 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 464
 465 /* If set, use designation escape sequence.  */
 466 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 467
 468 /* If set, produce revision number sequence.  */
 469 #define CODING_ISO_FLAG_REVISION        0x0080
 470
 471 /* If set, produce ISO6429's direction specifying sequence.  */
 472 #define CODING_ISO_FLAG_DIRECTION       0x0100
 473
 474 /* If set, assume designation states are reset at beginning of line on
 475    output.  */
 476 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 477
 478 /* If set, designation sequence should be placed at beginning of line
 479    on output.  */
 480 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 481
 482 /* If set, do not encode unsafe characters on output.  */
 483 #define CODING_ISO_FLAG_SAFE            0x0800
 484
 485 /* If set, extra latin codes (128..159) are accepted as a valid code
 486    on input.  */
 487 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 488
 489 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 490
 491 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 492
 493 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 494
 495 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 496
 497 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 498
 499 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 500
 501 /* A character to be produced on output if encoding of the original
 502    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 503 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 504
 505 /* UTF-8 section */
 506 #define CODING_UTF_8_BOM(coding)        \
 507   ((coding)->spec.utf_8_bom)
 508
 509 /* UTF-16 section */
 510 #define CODING_UTF_16_BOM(coding)       \
 511   ((coding)->spec.utf_16.bom)
 512
 513 #define CODING_UTF_16_ENDIAN(coding)    \
 514   ((coding)->spec.utf_16.endian)
 515
 516 #define CODING_UTF_16_SURROGATE(coding) \
 517   ((coding)->spec.utf_16.surrogate)
 518
 519
 520 /* CCL section */
 521 #define CODING_CCL_DECODER(coding)      \
 522   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 523 #define CODING_CCL_ENCODER(coding)      \
 524   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 525 #define CODING_CCL_VALIDS(coding)                                          \
 526   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 527
 528 /* Index for each coding category in `coding_categories' */
 529
 530 enum coding_category
 531   {
 532     coding_category_iso_7,
 533     coding_category_iso_7_tight,
 534     coding_category_iso_8_1,
 535     coding_category_iso_8_2,
 536     coding_category_iso_7_else,
 537     coding_category_iso_8_else,
 538     coding_category_utf_8_auto,
 539     coding_category_utf_8_nosig,
 540     coding_category_utf_8_sig,
 541     coding_category_utf_16_auto,
 542     coding_category_utf_16_be,
 543     coding_category_utf_16_le,
 544     coding_category_utf_16_be_nosig,
 545     coding_category_utf_16_le_nosig,
 546     coding_category_charset,
 547     coding_category_sjis,
 548     coding_category_big5,
 549     coding_category_ccl,
 550     coding_category_emacs_mule,
 551     /* All above are targets of code detection.  */
 552     coding_category_raw_text,
 553     coding_category_undecided,
 554     coding_category_max
 555   };
 556
 557 /* Definitions of flag bits used in detect_coding_XXXX.  */
 558 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 559 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 560 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 561 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 562 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 563 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 564 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 565 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 566 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 567 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 568 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 569 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 570 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 571 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 572 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 573 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 574 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 575 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 576 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 577 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 578
 579 /* This value is returned if detect_coding_mask () find nothing other
 580    than ASCII characters.  */
 581 #define CATEGORY_MASK_ANY               \
 582   (CATEGORY_MASK_ISO_7                  \
 583    | CATEGORY_MASK_ISO_7_TIGHT          \
 584    | CATEGORY_MASK_ISO_8_1              \
 585    | CATEGORY_MASK_ISO_8_2              \
 586    | CATEGORY_MASK_ISO_7_ELSE           \
 587    | CATEGORY_MASK_ISO_8_ELSE           \
 588    | CATEGORY_MASK_UTF_8_AUTO           \
 589    | CATEGORY_MASK_UTF_8_NOSIG          \
 590    | CATEGORY_MASK_UTF_8_SIG            \
 591    | CATEGORY_MASK_UTF_16_AUTO          \
 592    | CATEGORY_MASK_UTF_16_BE            \
 593    | CATEGORY_MASK_UTF_16_LE            \
 594    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 595    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 596    | CATEGORY_MASK_CHARSET              \
 597    | CATEGORY_MASK_SJIS                 \
 598    | CATEGORY_MASK_BIG5                 \
 599    | CATEGORY_MASK_CCL                  \
 600    | CATEGORY_MASK_EMACS_MULE)
 601
 602
 603 #define CATEGORY_MASK_ISO_7BIT \
 604   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 605
 606 #define CATEGORY_MASK_ISO_8BIT \
 607   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 608
 609 #define CATEGORY_MASK_ISO_ELSE \
 610   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 611
 612 #define CATEGORY_MASK_ISO_ESCAPE        \
 613   (CATEGORY_MASK_ISO_7                  \
 614    | CATEGORY_MASK_ISO_7_TIGHT          \
 615    | CATEGORY_MASK_ISO_7_ELSE           \
 616    | CATEGORY_MASK_ISO_8_ELSE)
 617
 618 #define CATEGORY_MASK_ISO       \
 619   (  CATEGORY_MASK_ISO_7BIT     \
 620      | CATEGORY_MASK_ISO_8BIT   \
 621      | CATEGORY_MASK_ISO_ELSE)
 622
 623 #define CATEGORY_MASK_UTF_16            \
 624   (CATEGORY_MASK_UTF_16_AUTO            \
 625    | CATEGORY_MASK_UTF_16_BE            \
 626    | CATEGORY_MASK_UTF_16_LE            \
 627    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 628    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 629
 630 #define CATEGORY_MASK_UTF_8     \
 631   (CATEGORY_MASK_UTF_8_AUTO     \
 632    | CATEGORY_MASK_UTF_8_NOSIG  \
 633    | CATEGORY_MASK_UTF_8_SIG)
 634
 635 /* Table of coding categories (Lisp symbols).  This variable is for
 636    internal use only.  */
 637 static Lisp_Object Vcoding_category_table;
 638
 639 /* Table of coding-categories ordered by priority.  */
 640 static enum coding_category coding_priorities[coding_category_max];
 641
 642 /* Nth element is a coding context for the coding system bound to the
 643    Nth coding category.  */
 644 static struct coding_system coding_categories[coding_category_max];
 645
 646 /*** Commonly used macros and functions ***/
 647
 648 #ifndef min
 649 #define min(a, b) ((a) < (b) ? (a) : (b))
 650 #endif
 651 #ifndef max
 652 #define max(a, b) ((a) > (b) ? (a) : (b))
 653 #endif
 654
 655 /* Encode a flag that can be nil, something else, or t as -1, 0, 1.  */
 656
 657 static int
 658 encode_inhibit_flag (Lisp_Object flag)
 659 {
 660   return NILP (flag) ? -1 : EQ (flag, Qt);
 661 }
 662
 663 /* True if the value of ENCODED_FLAG says a flag should be treated as set.
 664    1 means yes, -1 means no, 0 means ask the user variable VAR.  */
 665
 666 static bool
 667 inhibit_flag (int encoded_flag, bool var)
 668 {
 669   return 0 < encoded_flag + var;
 670 }
 671
 672 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 673   do {                                                  \
 674     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 675     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 676   } while (0)
 677
 678 static void
 679 CHECK_NATNUM_CAR (Lisp_Object x)
 680 {
 681   Lisp_Object tmp = XCAR (x);
 682   CHECK_NATNUM (tmp);
 683   XSETCAR (x, tmp);
 684 }
 685
 686 static void
 687 CHECK_NATNUM_CDR (Lisp_Object x)
 688 {
 689   Lisp_Object tmp = XCDR (x);
 690   CHECK_NATNUM (tmp);
 691   XSETCDR (x, tmp);
 692 }
 693
 694
 695 /* Safely get one byte from the source text pointed by SRC which ends
 696    at SRC_END, and set C to that byte.  If there are not enough bytes
 697    in the source, it jumps to 'no_more_source'.  If MULTIBYTEP,
 698    and a multibyte character is found at SRC, set C to the
 699    negative value of the character code.  The caller should declare
 700    and set these variables appropriately in advance:
 701         src, src_end, multibytep */
 702
 703 #define ONE_MORE_BYTE(c)                                \
 704   do {                                                  \
 705     if (src == src_end)                                 \
 706       {                                                 \
 707         if (src_base < src)                             \
 708           record_conversion_result                      \
 709             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 710         goto no_more_source;                            \
 711       }                                                 \
 712     c = *src++;                                         \
 713     if (multibytep && (c & 0x80))                       \
 714       {                                                 \
 715         if ((c & 0xFE) == 0xC0)                         \
 716           c = ((c & 1) << 6) | *src++;                  \
 717         else                                            \
 718           {                                             \
 719             src--;                                      \
 720             c = - string_char (src, &src, NULL);        \
 721             record_conversion_result                    \
 722               (coding, CODING_RESULT_INVALID_SRC);      \
 723           }                                             \
 724       }                                                 \
 725     consumed_chars++;                                   \
 726   } while (0)
 727
 728 /* Safely get two bytes from the source text pointed by SRC which ends
 729    at SRC_END, and set C1 and C2 to those bytes while skipping the
 730    heading multibyte characters.  If there are not enough bytes in the
 731    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 732    a multibyte character is found for C2, set C2 to the negative value
 733    of the character code.  The caller should declare and set these
 734    variables appropriately in advance:
 735         src, src_end, multibytep
 736    It is intended that this macro is used in detect_coding_utf_16.  */
 737
 738 #define TWO_MORE_BYTES(c1, c2)                          \
 739   do {                                                  \
 740     do {                                                \
 741       if (src == src_end)                               \
 742         goto no_more_source;                            \
 743       c1 = *src++;                                      \
 744       if (multibytep && (c1 & 0x80))                    \
 745         {                                               \
 746           if ((c1 & 0xFE) == 0xC0)                      \
 747             c1 = ((c1 & 1) << 6) | *src++;              \
 748           else                                          \
 749             {                                           \
 750               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 751               c1 = -1;                                  \
 752             }                                           \
 753         }                                               \
 754     } while (c1 < 0);                                   \
 755     if (src == src_end)                                 \
 756       goto no_more_source;                              \
 757     c2 = *src++;                                        \
 758     if (multibytep && (c2 & 0x80))                      \
 759       {                                                 \
 760         if ((c2 & 0xFE) == 0xC0)                        \
 761           c2 = ((c2 & 1) << 6) | *src++;                \
 762         else                                            \
 763           c2 = -1;                                      \
 764       }                                                 \
 765   } while (0)
 766
 767
 768 /* Store a byte C in the place pointed by DST and increment DST to the
 769    next free point, and increment PRODUCED_CHARS.  The caller should
 770    assure that C is 0..127, and declare and set the variable `dst'
 771    appropriately in advance.
 772 */
 773
 774
 775 #define EMIT_ONE_ASCII_BYTE(c)  \
 776   do {                          \
 777     produced_chars++;           \
 778     *dst++ = (c);               \
 779   } while (0)
 780
 781
 782 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 783
 784 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 785   do {                                  \
 786     produced_chars += 2;                \
 787     *dst++ = (c1), *dst++ = (c2);       \
 788   } while (0)
 789
 790
 791 /* Store a byte C in the place pointed by DST and increment DST to the
 792    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 793    store in an appropriate multibyte form.  The caller should
 794    declare and set the variables `dst' and `multibytep' appropriately
 795    in advance.  */
 796
 797 #define EMIT_ONE_BYTE(c)                \
 798   do {                                  \
 799     produced_chars++;                   \
 800     if (multibytep)                     \
 801       {                                 \
 802         unsigned ch = (c);              \
 803         if (ch >= 0x80)                 \
 804           ch = BYTE8_TO_CHAR (ch);      \
 805         CHAR_STRING_ADVANCE (ch, dst);  \
 806       }                                 \
 807     else                                \
 808       *dst++ = (c);                     \
 809   } while (0)
 810
 811
 812 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 813
 814 #define EMIT_TWO_BYTES(c1, c2)          \
 815   do {                                  \
 816     produced_chars += 2;                \
 817     if (multibytep)                     \
 818       {                                 \
 819         unsigned ch;                    \
 820                                         \
 821         ch = (c1);                      \
 822         if (ch >= 0x80)                 \
 823           ch = BYTE8_TO_CHAR (ch);      \
 824         CHAR_STRING_ADVANCE (ch, dst);  \
 825         ch = (c2);                      \
 826         if (ch >= 0x80)                 \
 827           ch = BYTE8_TO_CHAR (ch);      \
 828         CHAR_STRING_ADVANCE (ch, dst);  \
 829       }                                 \
 830     else                                \
 831       {                                 \
 832         *dst++ = (c1);                  \
 833         *dst++ = (c2);                  \
 834       }                                 \
 835   } while (0)
 836
 837
 838 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 839   do {                                  \
 840     EMIT_ONE_BYTE (c1);                 \
 841     EMIT_TWO_BYTES (c2, c3);            \
 842   } while (0)
 843
 844
 845 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 846   do {                                          \
 847     EMIT_TWO_BYTES (c1, c2);                    \
 848     EMIT_TWO_BYTES (c3, c4);                    \
 849   } while (0)
 850
 851
 852 static void
 853 record_conversion_result (struct coding_system *coding,
 854                           enum coding_result_code result)
 855 {
 856   coding->result = result;
 857   switch (result)
 858     {
 859     case CODING_RESULT_INSUFFICIENT_SRC:
 860       Vlast_code_conversion_error = Qinsufficient_source;
 861       break;
 862     case CODING_RESULT_INVALID_SRC:
 863       Vlast_code_conversion_error = Qinvalid_source;
 864       break;
 865     case CODING_RESULT_INTERRUPT:
 866       Vlast_code_conversion_error = Qinterrupted;
 867       break;
 868     case CODING_RESULT_INSUFFICIENT_DST:
 869       /* Don't record this error in Vlast_code_conversion_error
 870          because it happens just temporarily and is resolved when the
 871          whole conversion is finished.  */
 872       break;
 873     case CODING_RESULT_SUCCESS:
 874       break;
 875     default:
 876       Vlast_code_conversion_error = intern ("Unknown error");
 877     }
 878 }
 879
 880 /* These wrapper macros are used to preserve validity of pointers into
 881    buffer text across calls to decode_char, encode_char, etc, which
 882    could cause relocation of buffers if it loads a charset map,
 883    because loading a charset map allocates large structures.  */
 884
 885 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 886   do {                                                                       \
 887     ptrdiff_t offset;                                                        \
 888                                                                              \
 889     charset_map_loaded = 0;                                                  \
 890     c = DECODE_CHAR (charset, code);                                         \
 891     if (charset_map_loaded                                                   \
 892         && (offset = coding_change_source (coding)))                         \
 893       {                                                                      \
 894         src += offset;                                                       \
 895         src_base += offset;                                                  \
 896         src_end += offset;                                                   \
 897       }                                                                      \
 898   } while (0)
 899
 900 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 901   do {                                                                  \
 902     ptrdiff_t offset;                                                   \
 903                                                                         \
 904     charset_map_loaded = 0;                                             \
 905     code = ENCODE_CHAR (charset, c);                                    \
 906     if (charset_map_loaded                                              \
 907         && (offset = coding_change_destination (coding)))               \
 908       {                                                                 \
 909         dst += offset;                                                  \
 910         dst_end += offset;                                              \
 911       }                                                                 \
 912   } while (0)
 913
 914 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 915   do {                                                                  \
 916     ptrdiff_t offset;                                                   \
 917                                                                         \
 918     charset_map_loaded = 0;                                             \
 919     charset = char_charset (c, charset_list, code_return);              \
 920     if (charset_map_loaded                                              \
 921         && (offset = coding_change_destination (coding)))               \
 922       {                                                                 \
 923         dst += offset;                                                  \
 924         dst_end += offset;                                              \
 925       }                                                                 \
 926   } while (0)
 927
 928 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 929   do {                                                                  \
 930     ptrdiff_t offset;                                                   \
 931                                                                         \
 932     charset_map_loaded = 0;                                             \
 933     result = CHAR_CHARSET_P (c, charset);                               \
 934     if (charset_map_loaded                                              \
 935         && (offset = coding_change_destination (coding)))               \
 936       {                                                                 \
 937         dst += offset;                                                  \
 938         dst_end += offset;                                              \
 939       }                                                                 \
 940   } while (0)
 941
 942
 943 /* If there are at least BYTES length of room at dst, allocate memory
 944    for coding->destination and update dst and dst_end.  We don't have
 945    to take care of coding->source which will be relocated.  It is
 946    handled by calling coding_set_source in encode_coding.  */
 947
 948 #define ASSURE_DESTINATION(bytes)                               \
 949   do {                                                          \
 950     if (dst + (bytes) >= dst_end)                               \
 951       {                                                         \
 952         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 953                                                                 \
 954         dst = alloc_destination (coding, more_bytes, dst);      \
 955         dst_end = coding->destination + coding->dst_bytes;      \
 956       }                                                         \
 957   } while (0)
 958
 959
 960 /* Store multibyte form of the character C in P, and advance P to the
 961    end of the multibyte form.  This used to be like CHAR_STRING_ADVANCE
 962    without ever calling MAYBE_UNIFY_CHAR, but nowadays we don't call
 963    MAYBE_UNIFY_CHAR in CHAR_STRING_ADVANCE.  */
 964
 965 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 966
 967 /* Return the character code of character whose multibyte form is at
 968    P, and advance P to the end of the multibyte form.  This used to be
 969    like STRING_CHAR_ADVANCE without ever calling MAYBE_UNIFY_CHAR, but
 970    nowadays STRING_CHAR_ADVANCE doesn't call MAYBE_UNIFY_CHAR.  */
 971
 972 #define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 973
 974 /* Set coding->source from coding->src_object.  */
 975
 976 static void
 977 coding_set_source (struct coding_system *coding)
 978 {
 979   if (BUFFERP (coding->src_object))
 980     {
 981       struct buffer *buf = XBUFFER (coding->src_object);
 982
 983       if (coding->src_pos < 0)
 984         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 985       else
 986         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 987     }
 988   else if (STRINGP (coding->src_object))
 989     {
 990       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 991     }
 992   else
 993     {
 994       /* Otherwise, the source is C string and is never relocated
 995          automatically.  Thus we don't have to update anything.  */
 996     }
 997 }
 998
 999
1000 /* Set coding->source from coding->src_object, and return how many
1001    bytes coding->source was changed.  */
1002
1003 static ptrdiff_t
1004 coding_change_source (struct coding_system *coding)
1005 {
1006   const unsigned char *orig = coding->source;
1007   coding_set_source (coding);
1008   return coding->source - orig;
1009 }
1010
1011
1012 /* Set coding->destination from coding->dst_object.  */
1013
1014 static void
1015 coding_set_destination (struct coding_system *coding)
1016 {
1017   if (BUFFERP (coding->dst_object))
1018     {
1019       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1020         {
1021           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1022           coding->dst_bytes = (GAP_END_ADDR
1023                                - (coding->src_bytes - coding->consumed)
1024                                - coding->destination);
1025         }
1026       else
1027         {
1028           /* We are sure that coding->dst_pos_byte is before the gap
1029              of the buffer. */
1030           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1031                                  + coding->dst_pos_byte - BEG_BYTE);
1032           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1033                                - coding->destination);
1034         }
1035     }
1036   else
1037     {
1038       /* Otherwise, the destination is C string and is never relocated
1039          automatically.  Thus we don't have to update anything.  */
1040     }
1041 }
1042
1043
1044 /* Set coding->destination from coding->dst_object, and return how
1045    many bytes coding->destination was changed.  */
1046
1047 static ptrdiff_t
1048 coding_change_destination (struct coding_system *coding)
1049 {
1050   const unsigned char *orig = coding->destination;
1051   coding_set_destination (coding);
1052   return coding->destination - orig;
1053 }
1054
1055
1056 static void
1057 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1058 {
1059   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1060     string_overflow ();
1061   coding->destination = xrealloc (coding->destination,
1062                                   coding->dst_bytes + bytes);
1063   coding->dst_bytes += bytes;
1064 }
1065
1066 static void
1067 coding_alloc_by_making_gap (struct coding_system *coding,
1068                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1069 {
1070   if (EQ (coding->src_object, coding->dst_object))
1071     {
1072       /* The gap may contain the produced data at the head and not-yet
1073          consumed data at the tail.  To preserve those data, we at
1074          first make the gap size to zero, then increase the gap
1075          size.  */
1076       ptrdiff_t add = GAP_SIZE;
1077
1078       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1079       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1080       make_gap (bytes);
1081       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1082       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1083     }
1084   else
1085     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1086 }
1087
1088
1089 static unsigned char *
1090 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1091                    unsigned char *dst)
1092 {
1093   ptrdiff_t offset = dst - coding->destination;
1094
1095   if (BUFFERP (coding->dst_object))
1096     {
1097       struct buffer *buf = XBUFFER (coding->dst_object);
1098
1099       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1100     }
1101   else
1102     coding_alloc_by_realloc (coding, nbytes);
1103   coding_set_destination (coding);
1104   dst = coding->destination + offset;
1105   return dst;
1106 }
1107
1108 /** Macros for annotations.  */
1109
1110 /* An annotation data is stored in the array coding->charbuf in this
1111    format:
1112      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1113    LENGTH is the number of elements in the annotation.
1114    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1115    NCHARS is the number of characters in the text annotated.
1116
1117    The format of the following elements depend on ANNOTATION_MASK.
1118
1119    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1120    follows:
1121      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1122
1123    NBYTES is the number of bytes specified in the header part of
1124    old-style emacs-mule encoding, or 0 for the other kind of
1125    composition.
1126
1127    METHOD is one of enum composition_method.
1128
1129    Optional COMPOSITION-COMPONENTS are characters and composition
1130    rules.
1131
1132    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1133    follows.
1134
1135    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1136    recover from an invalid annotation, and should be skipped by
1137    produce_annotation.  */
1138
1139 /* Maximum length of the header of annotation data.  */
1140 #define MAX_ANNOTATION_LENGTH 5
1141
1142 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1143   do {                                                  \
1144     *(buf)++ = -(len);                                  \
1145     *(buf)++ = (mask);                                  \
1146     *(buf)++ = (nchars);                                \
1147     coding->annotated = 1;                              \
1148   } while (0);
1149
1150 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1151   do {                                                                      \
1152     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1153     *buf++ = nbytes;                                                        \
1154     *buf++ = method;                                                        \
1155   } while (0)
1156
1157
1158 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1159   do {                                                                  \
1160     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1161     *buf++ = id;                                                        \
1162   } while (0)
1163
1164
1165 /* Bitmasks for coding->eol_seen.  */
1166
1167 #define EOL_SEEN_NONE   0
1168 #define EOL_SEEN_LF     1
1169 #define EOL_SEEN_CR     2
1170 #define EOL_SEEN_CRLF   4
1171
1172 \f
1173 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1174
1175
1176
1177 \f
1178 /*** 3. UTF-8 ***/
1179
1180 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1181    Return true if a text is encoded in UTF-8.  */
1182
1183 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1184 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1185 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1186 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1187 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1188 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1189
1190 #define UTF_8_BOM_1 0xEF
1191 #define UTF_8_BOM_2 0xBB
1192 #define UTF_8_BOM_3 0xBF
1193
1194 /* Unlike the other detect_coding_XXX, this function counts the number
1195    of characters and checks the EOL format.  */
1196
1197 static bool
1198 detect_coding_utf_8 (struct coding_system *coding,
1199                      struct coding_detection_info *detect_info)
1200 {
1201   const unsigned char *src = coding->source, *src_base;
1202   const unsigned char *src_end = coding->source + coding->src_bytes;
1203   bool multibytep = coding->src_multibyte;
1204   ptrdiff_t consumed_chars = 0;
1205   bool bom_found = 0;
1206   ptrdiff_t nchars = coding->head_ascii;
1207   int eol_seen = coding->eol_seen;
1208
1209   detect_info->checked |= CATEGORY_MASK_UTF_8;
1210   /* A coding system of this category is always ASCII compatible.  */
1211   src += nchars;
1212
1213   if (src == coding->source     /* BOM should be at the head.  */
1214       && src + 3 < src_end      /* BOM is 3-byte long.  */
1215       && src[0] == UTF_8_BOM_1
1216       && src[1] == UTF_8_BOM_2
1217       && src[2] == UTF_8_BOM_3)
1218     {
1219       bom_found = 1;
1220       src += 3;
1221       nchars++;
1222     }
1223
1224   while (1)
1225     {
1226       int c, c1, c2, c3, c4;
1227
1228       src_base = src;
1229       ONE_MORE_BYTE (c);
1230       if (c < 0 || UTF_8_1_OCTET_P (c))
1231         {
1232           nchars++;
1233           if (c == '\r')
1234             {
1235               if (src < src_end && *src == '\n')
1236                 {
1237                   eol_seen |= EOL_SEEN_CRLF;
1238                   src++;
1239                   nchars++;
1240                 }
1241               else
1242                 eol_seen |= EOL_SEEN_CR;
1243             }
1244           else if (c == '\n')
1245             eol_seen |= EOL_SEEN_LF;
1246           continue;
1247         }
1248       ONE_MORE_BYTE (c1);
1249       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1250         break;
1251       if (UTF_8_2_OCTET_LEADING_P (c))
1252         {
1253           nchars++;
1254           continue;
1255         }
1256       ONE_MORE_BYTE (c2);
1257       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1258         break;
1259       if (UTF_8_3_OCTET_LEADING_P (c))
1260         {
1261           nchars++;
1262           continue;
1263         }
1264       ONE_MORE_BYTE (c3);
1265       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1266         break;
1267       if (UTF_8_4_OCTET_LEADING_P (c))
1268         {
1269           nchars++;
1270           continue;
1271         }
1272       ONE_MORE_BYTE (c4);
1273       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1274         break;
1275       if (UTF_8_5_OCTET_LEADING_P (c))
1276         {
1277           nchars++;
1278           continue;
1279         }
1280       break;
1281     }
1282   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1283   return 0;
1284
1285  no_more_source:
1286   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1287     {
1288       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1289       return 0;
1290     }
1291   if (bom_found)
1292     {
1293       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1294       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1295     }
1296   else
1297     {
1298       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1299       if (nchars < src_end - coding->source)
1300         /* The found characters are less than source bytes, which
1301            means that we found a valid non-ASCII characters.  */
1302         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1303     }
1304   coding->detected_utf8_bytes = src_base - coding->source;
1305   coding->detected_utf8_chars = nchars;
1306   return 1;
1307 }
1308
1309
1310 static void
1311 decode_coding_utf_8 (struct coding_system *coding)
1312 {
1313   const unsigned char *src = coding->source + coding->consumed;
1314   const unsigned char *src_end = coding->source + coding->src_bytes;
1315   const unsigned char *src_base;
1316   int *charbuf = coding->charbuf + coding->charbuf_used;
1317   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1318   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1319   bool multibytep = coding->src_multibyte;
1320   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1321   bool eol_dos
1322     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1323   int byte_after_cr = -1;
1324
1325   if (bom != utf_without_bom)
1326     {
1327       int c1, c2, c3;
1328
1329       src_base = src;
1330       ONE_MORE_BYTE (c1);
1331       if (! UTF_8_3_OCTET_LEADING_P (c1))
1332         src = src_base;
1333       else
1334         {
1335           ONE_MORE_BYTE (c2);
1336           if (! UTF_8_EXTRA_OCTET_P (c2))
1337             src = src_base;
1338           else
1339             {
1340               ONE_MORE_BYTE (c3);
1341               if (! UTF_8_EXTRA_OCTET_P (c3))
1342                 src = src_base;
1343               else
1344                 {
1345                   if ((c1 != UTF_8_BOM_1)
1346                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1347                     src = src_base;
1348                   else
1349                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1350                 }
1351             }
1352         }
1353     }
1354   CODING_UTF_8_BOM (coding) = utf_without_bom;
1355
1356   while (1)
1357     {
1358       int c, c1, c2, c3, c4, c5;
1359
1360       src_base = src;
1361       consumed_chars_base = consumed_chars;
1362
1363       if (charbuf >= charbuf_end)
1364         {
1365           if (byte_after_cr >= 0)
1366             src_base--;
1367           break;
1368         }
1369
1370       /* In the simple case, rapidly handle ordinary characters */
1371       if (multibytep && ! eol_dos
1372           && charbuf < charbuf_end - 6 && src < src_end - 6)
1373         {
1374           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1375             {
1376               c1 = *src;
1377               if (c1 & 0x80)
1378                 break;
1379               src++;
1380               consumed_chars++;
1381               *charbuf++ = c1;
1382
1383               c1 = *src;
1384               if (c1 & 0x80)
1385                 break;
1386               src++;
1387               consumed_chars++;
1388               *charbuf++ = c1;
1389
1390               c1 = *src;
1391               if (c1 & 0x80)
1392                 break;
1393               src++;
1394               consumed_chars++;
1395               *charbuf++ = c1;
1396
1397               c1 = *src;
1398               if (c1 & 0x80)
1399                 break;
1400               src++;
1401               consumed_chars++;
1402               *charbuf++ = c1;
1403             }
1404           /* If we handled at least one character, restart the main loop.  */
1405           if (src != src_base)
1406             continue;
1407         }
1408
1409       if (byte_after_cr >= 0)
1410         c1 = byte_after_cr, byte_after_cr = -1;
1411       else
1412         ONE_MORE_BYTE (c1);
1413       if (c1 < 0)
1414         {
1415           c = - c1;
1416         }
1417       else if (UTF_8_1_OCTET_P (c1))
1418         {
1419           if (eol_dos && c1 == '\r')
1420             ONE_MORE_BYTE (byte_after_cr);
1421           c = c1;
1422         }
1423       else
1424         {
1425           ONE_MORE_BYTE (c2);
1426           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1427             goto invalid_code;
1428           if (UTF_8_2_OCTET_LEADING_P (c1))
1429             {
1430               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1431               /* Reject overlong sequences here and below.  Encoders
1432                  producing them are incorrect, they can be misleading,
1433                  and they mess up read/write invariance.  */
1434               if (c < 128)
1435                 goto invalid_code;
1436             }
1437           else
1438             {
1439               ONE_MORE_BYTE (c3);
1440               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1441                 goto invalid_code;
1442               if (UTF_8_3_OCTET_LEADING_P (c1))
1443                 {
1444                   c = (((c1 & 0xF) << 12)
1445                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1446                   if (c < 0x800
1447                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1448                     goto invalid_code;
1449                 }
1450               else
1451                 {
1452                   ONE_MORE_BYTE (c4);
1453                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1454                     goto invalid_code;
1455                   if (UTF_8_4_OCTET_LEADING_P (c1))
1456                     {
1457                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1458                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1459                     if (c < 0x10000)
1460                       goto invalid_code;
1461                     }
1462                   else
1463                     {
1464                       ONE_MORE_BYTE (c5);
1465                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1466                         goto invalid_code;
1467                       if (UTF_8_5_OCTET_LEADING_P (c1))
1468                         {
1469                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1470                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1471                                | (c5 & 0x3F));
1472                           if ((c > MAX_CHAR) || (c < 0x200000))
1473                             goto invalid_code;
1474                         }
1475                       else
1476                         goto invalid_code;
1477                     }
1478                 }
1479             }
1480         }
1481
1482       *charbuf++ = c;
1483       continue;
1484
1485     invalid_code:
1486       src = src_base;
1487       consumed_chars = consumed_chars_base;
1488       ONE_MORE_BYTE (c);
1489       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1490       coding->errors++;
1491     }
1492
1493  no_more_source:
1494   coding->consumed_char += consumed_chars_base;
1495   coding->consumed = src_base - coding->source;
1496   coding->charbuf_used = charbuf - coding->charbuf;
1497 }
1498
1499
1500 static bool
1501 encode_coding_utf_8 (struct coding_system *coding)
1502 {
1503   bool multibytep = coding->dst_multibyte;
1504   int *charbuf = coding->charbuf;
1505   int *charbuf_end = charbuf + coding->charbuf_used;
1506   unsigned char *dst = coding->destination + coding->produced;
1507   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1508   ptrdiff_t produced_chars = 0;
1509   int c;
1510
1511   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1512     {
1513       ASSURE_DESTINATION (3);
1514       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1515       CODING_UTF_8_BOM (coding) = utf_without_bom;
1516     }
1517
1518   if (multibytep)
1519     {
1520       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1521
1522       while (charbuf < charbuf_end)
1523         {
1524           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1525
1526           ASSURE_DESTINATION (safe_room);
1527           c = *charbuf++;
1528           if (CHAR_BYTE8_P (c))
1529             {
1530               c = CHAR_TO_BYTE8 (c);
1531               EMIT_ONE_BYTE (c);
1532             }
1533           else
1534             {
1535               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1536               for (p = str; p < pend; p++)
1537                 EMIT_ONE_BYTE (*p);
1538             }
1539         }
1540     }
1541   else
1542     {
1543       int safe_room = MAX_MULTIBYTE_LENGTH;
1544
1545       while (charbuf < charbuf_end)
1546         {
1547           ASSURE_DESTINATION (safe_room);
1548           c = *charbuf++;
1549           if (CHAR_BYTE8_P (c))
1550             *dst++ = CHAR_TO_BYTE8 (c);
1551           else
1552             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1553         }
1554       produced_chars = dst - (coding->destination + coding->produced);
1555     }
1556   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1557   coding->produced_char += produced_chars;
1558   coding->produced = dst - coding->destination;
1559   return 0;
1560 }
1561
1562
1563 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1564    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1565
1566 #define UTF_16_HIGH_SURROGATE_P(val) \
1567   (((val) & 0xFC00) == 0xD800)
1568
1569 #define UTF_16_LOW_SURROGATE_P(val) \
1570   (((val) & 0xFC00) == 0xDC00)
1571
1572
1573 static bool
1574 detect_coding_utf_16 (struct coding_system *coding,
1575                       struct coding_detection_info *detect_info)
1576 {
1577   const unsigned char *src = coding->source;
1578   const unsigned char *src_end = coding->source + coding->src_bytes;
1579   bool multibytep = coding->src_multibyte;
1580   int c1, c2;
1581
1582   detect_info->checked |= CATEGORY_MASK_UTF_16;
1583   if (coding->mode & CODING_MODE_LAST_BLOCK
1584       && (coding->src_chars & 1))
1585     {
1586       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1587       return 0;
1588     }
1589
1590   TWO_MORE_BYTES (c1, c2);
1591   if ((c1 == 0xFF) && (c2 == 0xFE))
1592     {
1593       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1594                              | CATEGORY_MASK_UTF_16_AUTO);
1595       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1596                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1597                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1598     }
1599   else if ((c1 == 0xFE) && (c2 == 0xFF))
1600     {
1601       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1602                              | CATEGORY_MASK_UTF_16_AUTO);
1603       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1604                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1605                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1606     }
1607   else if (c2 < 0)
1608     {
1609       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1610       return 0;
1611     }
1612   else
1613     {
1614       /* We check the dispersion of Eth and Oth bytes where E is even and
1615          O is odd.  If both are high, we assume binary data.*/
1616       unsigned char e[256], o[256];
1617       unsigned e_num = 1, o_num = 1;
1618
1619       memset (e, 0, 256);
1620       memset (o, 0, 256);
1621       e[c1] = 1;
1622       o[c2] = 1;
1623
1624       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1625                                 |CATEGORY_MASK_UTF_16_BE
1626                                 | CATEGORY_MASK_UTF_16_LE);
1627
1628       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1629              != CATEGORY_MASK_UTF_16)
1630         {
1631           TWO_MORE_BYTES (c1, c2);
1632           if (c2 < 0)
1633             break;
1634           if (! e[c1])
1635             {
1636               e[c1] = 1;
1637               e_num++;
1638               if (e_num >= 128)
1639                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1640             }
1641           if (! o[c2])
1642             {
1643               o[c2] = 1;
1644               o_num++;
1645               if (o_num >= 128)
1646                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1647             }
1648         }
1649       return 0;
1650     }
1651
1652  no_more_source:
1653   return 1;
1654 }
1655
1656 static void
1657 decode_coding_utf_16 (struct coding_system *coding)
1658 {
1659   const unsigned char *src = coding->source + coding->consumed;
1660   const unsigned char *src_end = coding->source + coding->src_bytes;
1661   const unsigned char *src_base;
1662   int *charbuf = coding->charbuf + coding->charbuf_used;
1663   /* We may produces at most 3 chars in one loop.  */
1664   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1665   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1666   bool multibytep = coding->src_multibyte;
1667   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1668   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1669   int surrogate = CODING_UTF_16_SURROGATE (coding);
1670   bool eol_dos
1671     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1672   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1673
1674   if (bom == utf_with_bom)
1675     {
1676       int c, c1, c2;
1677
1678       src_base = src;
1679       ONE_MORE_BYTE (c1);
1680       ONE_MORE_BYTE (c2);
1681       c = (c1 << 8) | c2;
1682
1683       if (endian == utf_16_big_endian
1684           ? c != 0xFEFF : c != 0xFFFE)
1685         {
1686           /* The first two bytes are not BOM.  Treat them as bytes
1687              for a normal character.  */
1688           src = src_base;
1689           coding->errors++;
1690         }
1691       CODING_UTF_16_BOM (coding) = utf_without_bom;
1692     }
1693   else if (bom == utf_detect_bom)
1694     {
1695       /* We have already tried to detect BOM and failed in
1696          detect_coding.  */
1697       CODING_UTF_16_BOM (coding) = utf_without_bom;
1698     }
1699
1700   while (1)
1701     {
1702       int c, c1, c2;
1703
1704       src_base = src;
1705       consumed_chars_base = consumed_chars;
1706
1707       if (charbuf >= charbuf_end)
1708         {
1709           if (byte_after_cr1 >= 0)
1710             src_base -= 2;
1711           break;
1712         }
1713
1714       if (byte_after_cr1 >= 0)
1715         c1 = byte_after_cr1, byte_after_cr1 = -1;
1716       else
1717         ONE_MORE_BYTE (c1);
1718       if (c1 < 0)
1719         {
1720           *charbuf++ = -c1;
1721           continue;
1722         }
1723       if (byte_after_cr2 >= 0)
1724         c2 = byte_after_cr2, byte_after_cr2 = -1;
1725       else
1726         ONE_MORE_BYTE (c2);
1727       if (c2 < 0)
1728         {
1729           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1730           *charbuf++ = -c2;
1731           continue;
1732         }
1733       c = (endian == utf_16_big_endian
1734            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1735
1736       if (surrogate)
1737         {
1738           if (! UTF_16_LOW_SURROGATE_P (c))
1739             {
1740               if (endian == utf_16_big_endian)
1741                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1742               else
1743                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1744               *charbuf++ = c1;
1745               *charbuf++ = c2;
1746               coding->errors++;
1747               if (UTF_16_HIGH_SURROGATE_P (c))
1748                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1749               else
1750                 *charbuf++ = c;
1751             }
1752           else
1753             {
1754               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1755               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1756               *charbuf++ = 0x10000 + c;
1757             }
1758         }
1759       else
1760         {
1761           if (UTF_16_HIGH_SURROGATE_P (c))
1762             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1763           else
1764             {
1765               if (eol_dos && c == '\r')
1766                 {
1767                   ONE_MORE_BYTE (byte_after_cr1);
1768                   ONE_MORE_BYTE (byte_after_cr2);
1769                 }
1770               *charbuf++ = c;
1771             }
1772         }
1773     }
1774
1775  no_more_source:
1776   coding->consumed_char += consumed_chars_base;
1777   coding->consumed = src_base - coding->source;
1778   coding->charbuf_used = charbuf - coding->charbuf;
1779 }
1780
1781 static bool
1782 encode_coding_utf_16 (struct coding_system *coding)
1783 {
1784   bool multibytep = coding->dst_multibyte;
1785   int *charbuf = coding->charbuf;
1786   int *charbuf_end = charbuf + coding->charbuf_used;
1787   unsigned char *dst = coding->destination + coding->produced;
1788   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1789   int safe_room = 8;
1790   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1791   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1792   ptrdiff_t produced_chars = 0;
1793   int c;
1794
1795   if (bom != utf_without_bom)
1796     {
1797       ASSURE_DESTINATION (safe_room);
1798       if (big_endian)
1799         EMIT_TWO_BYTES (0xFE, 0xFF);
1800       else
1801         EMIT_TWO_BYTES (0xFF, 0xFE);
1802       CODING_UTF_16_BOM (coding) = utf_without_bom;
1803     }
1804
1805   while (charbuf < charbuf_end)
1806     {
1807       ASSURE_DESTINATION (safe_room);
1808       c = *charbuf++;
1809       if (c > MAX_UNICODE_CHAR)
1810         c = coding->default_char;
1811
1812       if (c < 0x10000)
1813         {
1814           if (big_endian)
1815             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1816           else
1817             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1818         }
1819       else
1820         {
1821           int c1, c2;
1822
1823           c -= 0x10000;
1824           c1 = (c >> 10) + 0xD800;
1825           c2 = (c & 0x3FF) + 0xDC00;
1826           if (big_endian)
1827             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1828           else
1829             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1830         }
1831     }
1832   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1833   coding->produced = dst - coding->destination;
1834   coding->produced_char += produced_chars;
1835   return 0;
1836 }
1837
1838 \f
1839 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1840
1841 /* Emacs' internal format for representation of multiple character
1842    sets is a kind of multi-byte encoding, i.e. characters are
1843    represented by variable-length sequences of one-byte codes.
1844
1845    ASCII characters and control characters (e.g. `tab', `newline') are
1846    represented by one-byte sequences which are their ASCII codes, in
1847    the range 0x00 through 0x7F.
1848
1849    8-bit characters of the range 0x80..0x9F are represented by
1850    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1851    code + 0x20).
1852
1853    8-bit characters of the range 0xA0..0xFF are represented by
1854    one-byte sequences which are their 8-bit code.
1855
1856    The other characters are represented by a sequence of `base
1857    leading-code', optional `extended leading-code', and one or two
1858    `position-code's.  The length of the sequence is determined by the
1859    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1860    whereas extended leading-code and position-code take the range 0xA0
1861    through 0xFF.  See `charset.h' for more details about leading-code
1862    and position-code.
1863
1864    --- CODE RANGE of Emacs' internal format ---
1865    character set        range
1866    -------------        -----
1867    ascii                0x00..0x7F
1868    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1869    eight-bit-graphic    0xA0..0xBF
1870    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1871    ---------------------------------------------
1872
1873    As this is the internal character representation, the format is
1874    usually not used externally (i.e. in a file or in a data sent to a
1875    process).  But, it is possible to have a text externally in this
1876    format (i.e. by encoding by the coding system `emacs-mule').
1877
1878    In that case, a sequence of one-byte codes has a slightly different
1879    form.
1880
1881    At first, all characters in eight-bit-control are represented by
1882    one-byte sequences which are their 8-bit code.
1883
1884    Next, character composition data are represented by the byte
1885    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1886    where,
1887         METHOD is 0xF2 plus one of composition method (enum
1888         composition_method),
1889
1890         BYTES is 0xA0 plus a byte length of this composition data,
1891
1892         CHARS is 0xA0 plus a number of characters composed by this
1893         data,
1894
1895         COMPONENTs are characters of multibyte form or composition
1896         rules encoded by two-byte of ASCII codes.
1897
1898    In addition, for backward compatibility, the following formats are
1899    also recognized as composition data on decoding.
1900
1901    0x80 MSEQ ...
1902    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1903
1904    Here,
1905         MSEQ is a multibyte form but in these special format:
1906           ASCII: 0xA0 ASCII_CODE+0x80,
1907           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1908         RULE is a one byte code of the range 0xA0..0xF0 that
1909         represents a composition rule.
1910   */
1911
1912 char emacs_mule_bytes[256];
1913
1914
1915 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1916    Return true if a text is encoded in 'emacs-mule'.  */
1917
1918 static bool
1919 detect_coding_emacs_mule (struct coding_system *coding,
1920                           struct coding_detection_info *detect_info)
1921 {
1922   const unsigned char *src = coding->source, *src_base;
1923   const unsigned char *src_end = coding->source + coding->src_bytes;
1924   bool multibytep = coding->src_multibyte;
1925   ptrdiff_t consumed_chars = 0;
1926   int c;
1927   int found = 0;
1928
1929   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1930   /* A coding system of this category is always ASCII compatible.  */
1931   src += coding->head_ascii;
1932
1933   while (1)
1934     {
1935       src_base = src;
1936       ONE_MORE_BYTE (c);
1937       if (c < 0)
1938         continue;
1939       if (c == 0x80)
1940         {
1941           /* Perhaps the start of composite character.  We simply skip
1942              it because analyzing it is too heavy for detecting.  But,
1943              at least, we check that the composite character
1944              constitutes of more than 4 bytes.  */
1945           const unsigned char *src_start;
1946
1947         repeat:
1948           src_start = src;
1949           do
1950             {
1951               ONE_MORE_BYTE (c);
1952             }
1953           while (c >= 0xA0);
1954
1955           if (src - src_start <= 4)
1956             break;
1957           found = CATEGORY_MASK_EMACS_MULE;
1958           if (c == 0x80)
1959             goto repeat;
1960         }
1961
1962       if (c < 0x80)
1963         {
1964           if (c < 0x20
1965               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1966             break;
1967         }
1968       else
1969         {
1970           int more_bytes = emacs_mule_bytes[c] - 1;
1971
1972           while (more_bytes > 0)
1973             {
1974               ONE_MORE_BYTE (c);
1975               if (c < 0xA0)
1976                 {
1977                   src--;        /* Unread the last byte.  */
1978                   break;
1979                 }
1980               more_bytes--;
1981             }
1982           if (more_bytes != 0)
1983             break;
1984           found = CATEGORY_MASK_EMACS_MULE;
1985         }
1986     }
1987   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1988   return 0;
1989
1990  no_more_source:
1991   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1992     {
1993       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1994       return 0;
1995     }
1996   detect_info->found |= found;
1997   return 1;
1998 }
1999
2000
2001 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2002    character.  If CMP_STATUS indicates that we must expect MSEQ or
2003    RULE described above, decode it and return the negative value of
2004    the decoded character or rule.  If an invalid byte is found, return
2005    -1.  If SRC is too short, return -2.  */
2006
2007 static int
2008 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2009                  int *nbytes, int *nchars, int *id,
2010                  struct composition_status *cmp_status)
2011 {
2012   const unsigned char *src_end = coding->source + coding->src_bytes;
2013   const unsigned char *src_base = src;
2014   bool multibytep = coding->src_multibyte;
2015   int charset_ID;
2016   unsigned code;
2017   int c;
2018   ptrdiff_t consumed_chars = 0;
2019   bool mseq_found = 0;
2020
2021   ONE_MORE_BYTE (c);
2022   if (c < 0)
2023     {
2024       c = -c;
2025       charset_ID = emacs_mule_charset[0];
2026     }
2027   else
2028     {
2029       if (c >= 0xA0)
2030         {
2031           if (cmp_status->state != COMPOSING_NO
2032               && cmp_status->old_form)
2033             {
2034               if (cmp_status->state == COMPOSING_CHAR)
2035                 {
2036                   if (c == 0xA0)
2037                     {
2038                       ONE_MORE_BYTE (c);
2039                       c -= 0x80;
2040                       if (c < 0)
2041                         goto invalid_code;
2042                     }
2043                   else
2044                     c -= 0x20;
2045                   mseq_found = 1;
2046                 }
2047               else
2048                 {
2049                   *nbytes = src - src_base;
2050                   *nchars = consumed_chars;
2051                   return -c;
2052                 }
2053             }
2054           else
2055             goto invalid_code;
2056         }
2057
2058       switch (emacs_mule_bytes[c])
2059         {
2060         case 2:
2061           if ((charset_ID = emacs_mule_charset[c]) < 0)
2062             goto invalid_code;
2063           ONE_MORE_BYTE (c);
2064           if (c < 0xA0)
2065             goto invalid_code;
2066           code = c & 0x7F;
2067           break;
2068
2069         case 3:
2070           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2071               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2072             {
2073               ONE_MORE_BYTE (c);
2074               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2075                 goto invalid_code;
2076               ONE_MORE_BYTE (c);
2077               if (c < 0xA0)
2078                 goto invalid_code;
2079               code = c & 0x7F;
2080             }
2081           else
2082             {
2083               if ((charset_ID = emacs_mule_charset[c]) < 0)
2084                 goto invalid_code;
2085               ONE_MORE_BYTE (c);
2086               if (c < 0xA0)
2087                 goto invalid_code;
2088               code = (c & 0x7F) << 8;
2089               ONE_MORE_BYTE (c);
2090               if (c < 0xA0)
2091                 goto invalid_code;
2092               code |= c & 0x7F;
2093             }
2094           break;
2095
2096         case 4:
2097           ONE_MORE_BYTE (c);
2098           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2099             goto invalid_code;
2100           ONE_MORE_BYTE (c);
2101           if (c < 0xA0)
2102             goto invalid_code;
2103           code = (c & 0x7F) << 8;
2104           ONE_MORE_BYTE (c);
2105           if (c < 0xA0)
2106             goto invalid_code;
2107           code |= c & 0x7F;
2108           break;
2109
2110         case 1:
2111           code = c;
2112           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2113           break;
2114
2115         default:
2116           emacs_abort ();
2117         }
2118       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2119                           CHARSET_FROM_ID (charset_ID), code, c);
2120       if (c < 0)
2121         goto invalid_code;
2122     }
2123   *nbytes = src - src_base;
2124   *nchars = consumed_chars;
2125   if (id)
2126     *id = charset_ID;
2127   return (mseq_found ? -c : c);
2128
2129  no_more_source:
2130   return -2;
2131
2132  invalid_code:
2133   return -1;
2134 }
2135
2136
2137 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2138
2139 /* Handle these composition sequence ('|': the end of header elements,
2140    BYTES and CHARS >= 0xA0):
2141
2142    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2143    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2144    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2145
2146    and these old form:
2147
2148    (4) relative composition: 0x80 | MSEQ ... MSEQ
2149    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2150
2151    When the starter 0x80 and the following header elements are found,
2152    this annotation header is produced.
2153
2154         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2155
2156    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2157    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2158
2159    Then, upon reading the following elements, these codes are produced
2160    until the composition end is found:
2161
2162    (1) CHAR ... CHAR
2163    (2) ALT ... ALT CHAR ... CHAR
2164    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2165    (4) CHAR ... CHAR
2166    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2167
2168    When the composition end is found, LENGTH and NCHARS in the
2169    annotation header is updated as below:
2170
2171    (1) LENGTH: unchanged, NCHARS: unchanged
2172    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2173    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2174    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2175    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2176
2177    If an error is found while composing, the annotation header is
2178    changed to the original composition header (plus filler -1s) as
2179    below:
2180
2181    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2182    (5)          [ 0x80 0xFF -1 -1- -1 ]
2183
2184    and the sequence [ -2 DECODED-RULE ] is changed to the original
2185    byte sequence as below:
2186         o the original byte sequence is B: [ B -1 ]
2187         o the original byte sequence is B1 B2: [ B1 B2 ]
2188
2189    Most of the routines are implemented by macros because many
2190    variables and labels in the caller decode_coding_emacs_mule must be
2191    accessible, and they are usually called just once (thus doesn't
2192    increase the size of compiled object).  */
2193
2194 /* Decode a composition rule represented by C as a component of
2195    composition sequence of Emacs 20 style.  Set RULE to the decoded
2196    rule. */
2197
2198 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2199   do {                                                  \
2200     int gref, nref;                                     \
2201                                                         \
2202     c -= 0xA0;                                          \
2203     if (c < 0 || c >= 81)                               \
2204       goto invalid_code;                                \
2205     gref = c / 9, nref = c % 9;                         \
2206     if (gref == 4) gref = 10;                           \
2207     if (nref == 4) nref = 10;                           \
2208     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2209   } while (0)
2210
2211
2212 /* Decode a composition rule represented by C and the following byte
2213    at SRC as a component of composition sequence of Emacs 21 style.
2214    Set RULE to the decoded rule.  */
2215
2216 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2217   do {                                                  \
2218     int gref, nref;                                     \
2219                                                         \
2220     gref = c - 0x20;                                    \
2221     if (gref < 0 || gref >= 81)                         \
2222       goto invalid_code;                                \
2223     ONE_MORE_BYTE (c);                                  \
2224     nref = c - 0x20;                                    \
2225     if (nref < 0 || nref >= 81)                         \
2226       goto invalid_code;                                \
2227     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2228   } while (0)
2229
2230
2231 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2232    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2233    byte length of this composition information, CHARS is the number of
2234    characters composed by this composition.  */
2235
2236 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2237   do {                                                                  \
2238     enum composition_method method = c - 0xF2;                          \
2239     int nbytes, nchars;                                                 \
2240                                                                         \
2241     ONE_MORE_BYTE (c);                                                  \
2242     if (c < 0)                                                          \
2243       goto invalid_code;                                                \
2244     nbytes = c - 0xA0;                                                  \
2245     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2246       goto invalid_code;                                                \
2247     ONE_MORE_BYTE (c);                                                  \
2248     nchars = c - 0xA0;                                                  \
2249     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2250       goto invalid_code;                                                \
2251     cmp_status->old_form = 0;                                           \
2252     cmp_status->method = method;                                        \
2253     if (method == COMPOSITION_RELATIVE)                                 \
2254       cmp_status->state = COMPOSING_CHAR;                               \
2255     else                                                                \
2256       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2257     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2258     cmp_status->nchars = nchars;                                        \
2259     cmp_status->ncomps = nbytes - 4;                                    \
2260     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2261   } while (0)
2262
2263
2264 /* Start of Emacs 20 style format for relative composition.  */
2265
2266 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2267   do {                                                          \
2268     cmp_status->old_form = 1;                                   \
2269     cmp_status->method = COMPOSITION_RELATIVE;                  \
2270     cmp_status->state = COMPOSING_CHAR;                         \
2271     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2272     cmp_status->nchars = cmp_status->ncomps = 0;                \
2273     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2274   } while (0)
2275
2276
2277 /* Start of Emacs 20 style format for rule-base composition.  */
2278
2279 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2280   do {                                                          \
2281     cmp_status->old_form = 1;                                   \
2282     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2283     cmp_status->state = COMPOSING_CHAR;                         \
2284     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2285     cmp_status->nchars = cmp_status->ncomps = 0;                \
2286     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2287   } while (0)
2288
2289
2290 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2291   do {                                                  \
2292     const unsigned char *current_src = src;             \
2293                                                         \
2294     ONE_MORE_BYTE (c);                                  \
2295     if (c < 0)                                          \
2296       goto invalid_code;                                \
2297     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2298         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2299       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2300     else if (c < 0xA0)                                  \
2301       goto invalid_code;                                \
2302     else if (c < 0xC0)                                  \
2303       {                                                 \
2304         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2305         /* Re-read C as a composition component.  */    \
2306         src = current_src;                              \
2307       }                                                 \
2308     else if (c == 0xFF)                                 \
2309       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2310     else                                                \
2311       goto invalid_code;                                \
2312   } while (0)
2313
2314 #define EMACS_MULE_COMPOSITION_END()                            \
2315   do {                                                          \
2316     int idx = - cmp_status->length;                             \
2317                                                                 \
2318     if (cmp_status->old_form)                                   \
2319       charbuf[idx + 2] = cmp_status->nchars;                    \
2320     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2321       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2322     cmp_status->state = COMPOSING_NO;                           \
2323   } while (0)
2324
2325
2326 static int
2327 emacs_mule_finish_composition (int *charbuf,
2328                                struct composition_status *cmp_status)
2329 {
2330   int idx = - cmp_status->length;
2331   int new_chars;
2332
2333   if (cmp_status->old_form && cmp_status->nchars > 0)
2334     {
2335       charbuf[idx + 2] = cmp_status->nchars;
2336       new_chars = 0;
2337       if (cmp_status->method == COMPOSITION_WITH_RULE
2338           && cmp_status->state == COMPOSING_CHAR)
2339         {
2340           /* The last rule was invalid.  */
2341           int rule = charbuf[-1] + 0xA0;
2342
2343           charbuf[-2] = BYTE8_TO_CHAR (rule);
2344           charbuf[-1] = -1;
2345           new_chars = 1;
2346         }
2347     }
2348   else
2349     {
2350       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2351
2352       if (cmp_status->method == COMPOSITION_WITH_RULE)
2353         {
2354           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2355           charbuf[idx++] = -3;
2356           charbuf[idx++] = 0;
2357           new_chars = 1;
2358         }
2359       else
2360         {
2361           int nchars = charbuf[idx + 1] + 0xA0;
2362           int nbytes = charbuf[idx + 2] + 0xA0;
2363
2364           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2365           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2366           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2367           charbuf[idx++] = -1;
2368           new_chars = 4;
2369         }
2370     }
2371   cmp_status->state = COMPOSING_NO;
2372   return new_chars;
2373 }
2374
2375 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2376   do {                                                                    \
2377     if (cmp_status->state != COMPOSING_NO)                                \
2378       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2379   } while (0)
2380
2381
2382 static void
2383 decode_coding_emacs_mule (struct coding_system *coding)
2384 {
2385   const unsigned char *src = coding->source + coding->consumed;
2386   const unsigned char *src_end = coding->source + coding->src_bytes;
2387   const unsigned char *src_base;
2388   int *charbuf = coding->charbuf + coding->charbuf_used;
2389   /* We may produce two annotations (charset and composition) in one
2390      loop and one more charset annotation at the end.  */
2391   int *charbuf_end
2392     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2393       /* We can produce up to 2 characters in a loop.  */
2394       - 1;
2395   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2396   bool multibytep = coding->src_multibyte;
2397   ptrdiff_t char_offset = coding->produced_char;
2398   ptrdiff_t last_offset = char_offset;
2399   int last_id = charset_ascii;
2400   bool eol_dos
2401     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2402   int byte_after_cr = -1;
2403   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2404
2405   if (cmp_status->state != COMPOSING_NO)
2406     {
2407       int i;
2408
2409       if (charbuf_end - charbuf < cmp_status->length)
2410         emacs_abort ();
2411       for (i = 0; i < cmp_status->length; i++)
2412         *charbuf++ = cmp_status->carryover[i];
2413       coding->annotated = 1;
2414     }
2415
2416   while (1)
2417     {
2418       int c, id IF_LINT (= 0);
2419
2420       src_base = src;
2421       consumed_chars_base = consumed_chars;
2422
2423       if (charbuf >= charbuf_end)
2424         {
2425           if (byte_after_cr >= 0)
2426             src_base--;
2427           break;
2428         }
2429
2430       if (byte_after_cr >= 0)
2431         c = byte_after_cr, byte_after_cr = -1;
2432       else
2433         ONE_MORE_BYTE (c);
2434
2435       if (c < 0 || c == 0x80)
2436         {
2437           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2438           if (c < 0)
2439             {
2440               *charbuf++ = -c;
2441               char_offset++;
2442             }
2443           else
2444             DECODE_EMACS_MULE_COMPOSITION_START ();
2445           continue;
2446         }
2447
2448       if (c < 0x80)
2449         {
2450           if (eol_dos && c == '\r')
2451             ONE_MORE_BYTE (byte_after_cr);
2452           id = charset_ascii;
2453           if (cmp_status->state != COMPOSING_NO)
2454             {
2455               if (cmp_status->old_form)
2456                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2457               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2458                 cmp_status->ncomps--;
2459             }
2460         }
2461       else
2462         {
2463           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2464           /* emacs_mule_char can load a charset map from a file, which
2465              allocates a large structure and might cause buffer text
2466              to be relocated as result.  Thus, we need to remember the
2467              original pointer to buffer text, and fix up all related
2468              pointers after the call.  */
2469           const unsigned char *orig = coding->source;
2470           ptrdiff_t offset;
2471
2472           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2473                                cmp_status);
2474           offset = coding->source - orig;
2475           if (offset)
2476             {
2477               src += offset;
2478               src_base += offset;
2479               src_end += offset;
2480             }
2481           if (c < 0)
2482             {
2483               if (c == -1)
2484                 goto invalid_code;
2485               if (c == -2)
2486                 break;
2487             }
2488           src = src_base + nbytes;
2489           consumed_chars = consumed_chars_base + nchars;
2490           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2491             cmp_status->ncomps -= nchars;
2492         }
2493
2494       /* Now if C >= 0, we found a normally encoded character, if C <
2495          0, we found an old-style composition component character or
2496          rule.  */
2497
2498       if (cmp_status->state == COMPOSING_NO)
2499         {
2500           if (last_id != id)
2501             {
2502               if (last_id != charset_ascii)
2503                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2504                                   last_id);
2505               last_id = id;
2506               last_offset = char_offset;
2507             }
2508           *charbuf++ = c;
2509           char_offset++;
2510         }
2511       else if (cmp_status->state == COMPOSING_CHAR)
2512         {
2513           if (cmp_status->old_form)
2514             {
2515               if (c >= 0)
2516                 {
2517                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2518                   *charbuf++ = c;
2519                   char_offset++;
2520                 }
2521               else
2522                 {
2523                   *charbuf++ = -c;
2524                   cmp_status->nchars++;
2525                   cmp_status->length++;
2526                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2527                     EMACS_MULE_COMPOSITION_END ();
2528                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2529                     cmp_status->state = COMPOSING_RULE;
2530                 }
2531             }
2532           else
2533             {
2534               *charbuf++ = c;
2535               cmp_status->length++;
2536               cmp_status->nchars--;
2537               if (cmp_status->nchars == 0)
2538                 EMACS_MULE_COMPOSITION_END ();
2539             }
2540         }
2541       else if (cmp_status->state == COMPOSING_RULE)
2542         {
2543           int rule;
2544
2545           if (c >= 0)
2546             {
2547               EMACS_MULE_COMPOSITION_END ();
2548               *charbuf++ = c;
2549               char_offset++;
2550             }
2551           else
2552             {
2553               c = -c;
2554               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2555               if (rule < 0)
2556                 goto invalid_code;
2557               *charbuf++ = -2;
2558               *charbuf++ = rule;
2559               cmp_status->length += 2;
2560               cmp_status->state = COMPOSING_CHAR;
2561             }
2562         }
2563       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2564         {
2565           *charbuf++ = c;
2566           cmp_status->length++;
2567           if (cmp_status->ncomps == 0)
2568             cmp_status->state = COMPOSING_CHAR;
2569           else if (cmp_status->ncomps > 0)
2570             {
2571               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2572                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2573             }
2574           else
2575             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2576         }
2577       else                      /* COMPOSING_COMPONENT_RULE */
2578         {
2579           int rule;
2580
2581           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2582           if (rule < 0)
2583             goto invalid_code;
2584           *charbuf++ = -2;
2585           *charbuf++ = rule;
2586           cmp_status->length += 2;
2587           cmp_status->ncomps--;
2588           if (cmp_status->ncomps > 0)
2589             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2590           else
2591             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2592         }
2593       continue;
2594
2595     invalid_code:
2596       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2597       src = src_base;
2598       consumed_chars = consumed_chars_base;
2599       ONE_MORE_BYTE (c);
2600       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2601       char_offset++;
2602       coding->errors++;
2603     }
2604
2605  no_more_source:
2606   if (cmp_status->state != COMPOSING_NO)
2607     {
2608       if (coding->mode & CODING_MODE_LAST_BLOCK)
2609         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2610       else
2611         {
2612           int i;
2613
2614           charbuf -= cmp_status->length;
2615           for (i = 0; i < cmp_status->length; i++)
2616             cmp_status->carryover[i] = charbuf[i];
2617         }
2618     }
2619   if (last_id != charset_ascii)
2620     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2621   coding->consumed_char += consumed_chars_base;
2622   coding->consumed = src_base - coding->source;
2623   coding->charbuf_used = charbuf - coding->charbuf;
2624 }
2625
2626
2627 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2628   do {                                          \
2629     if (id < 0xA0)                              \
2630       codes[0] = id, codes[1] = 0;              \
2631     else if (id < 0xE0)                         \
2632       codes[0] = 0x9A, codes[1] = id;           \
2633     else if (id < 0xF0)                         \
2634       codes[0] = 0x9B, codes[1] = id;           \
2635     else if (id < 0xF5)                         \
2636       codes[0] = 0x9C, codes[1] = id;           \
2637     else                                        \
2638       codes[0] = 0x9D, codes[1] = id;           \
2639   } while (0);
2640
2641
2642 static bool
2643 encode_coding_emacs_mule (struct coding_system *coding)
2644 {
2645   bool multibytep = coding->dst_multibyte;
2646   int *charbuf = coding->charbuf;
2647   int *charbuf_end = charbuf + coding->charbuf_used;
2648   unsigned char *dst = coding->destination + coding->produced;
2649   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2650   int safe_room = 8;
2651   ptrdiff_t produced_chars = 0;
2652   Lisp_Object attrs, charset_list;
2653   int c;
2654   int preferred_charset_id = -1;
2655
2656   CODING_GET_INFO (coding, attrs, charset_list);
2657   if (! EQ (charset_list, Vemacs_mule_charset_list))
2658     {
2659       charset_list = Vemacs_mule_charset_list;
2660       ASET (attrs, coding_attr_charset_list, charset_list);
2661     }
2662
2663   while (charbuf < charbuf_end)
2664     {
2665       ASSURE_DESTINATION (safe_room);
2666       c = *charbuf++;
2667
2668       if (c < 0)
2669         {
2670           /* Handle an annotation.  */
2671           switch (*charbuf)
2672             {
2673             case CODING_ANNOTATE_COMPOSITION_MASK:
2674               /* Not yet implemented.  */
2675               break;
2676             case CODING_ANNOTATE_CHARSET_MASK:
2677               preferred_charset_id = charbuf[3];
2678               if (preferred_charset_id >= 0
2679                   && NILP (Fmemq (make_number (preferred_charset_id),
2680                                   charset_list)))
2681                 preferred_charset_id = -1;
2682               break;
2683             default:
2684               emacs_abort ();
2685             }
2686           charbuf += -c - 1;
2687           continue;
2688         }
2689
2690       if (ASCII_CHAR_P (c))
2691         EMIT_ONE_ASCII_BYTE (c);
2692       else if (CHAR_BYTE8_P (c))
2693         {
2694           c = CHAR_TO_BYTE8 (c);
2695           EMIT_ONE_BYTE (c);
2696         }
2697       else
2698         {
2699           struct charset *charset;
2700           unsigned code;
2701           int dimension;
2702           int emacs_mule_id;
2703           unsigned char leading_codes[2];
2704
2705           if (preferred_charset_id >= 0)
2706             {
2707               bool result;
2708
2709               charset = CHARSET_FROM_ID (preferred_charset_id);
2710               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2711               if (result)
2712                 code = ENCODE_CHAR (charset, c);
2713               else
2714                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2715                                      &code, charset);
2716             }
2717           else
2718             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2719                                  &code, charset);
2720           if (! charset)
2721             {
2722               c = coding->default_char;
2723               if (ASCII_CHAR_P (c))
2724                 {
2725                   EMIT_ONE_ASCII_BYTE (c);
2726                   continue;
2727                 }
2728               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2729                                    &code, charset);
2730             }
2731           dimension = CHARSET_DIMENSION (charset);
2732           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2733           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2734           EMIT_ONE_BYTE (leading_codes[0]);
2735           if (leading_codes[1])
2736             EMIT_ONE_BYTE (leading_codes[1]);
2737           if (dimension == 1)
2738             EMIT_ONE_BYTE (code | 0x80);
2739           else
2740             {
2741               code |= 0x8080;
2742               EMIT_ONE_BYTE (code >> 8);
2743               EMIT_ONE_BYTE (code & 0xFF);
2744             }
2745         }
2746     }
2747   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2748   coding->produced_char += produced_chars;
2749   coding->produced = dst - coding->destination;
2750   return 0;
2751 }
2752
2753 \f
2754 /*** 7. ISO2022 handlers ***/
2755
2756 /* The following note describes the coding system ISO2022 briefly.
2757    Since the intention of this note is to help understand the
2758    functions in this file, some parts are NOT ACCURATE or are OVERLY
2759    SIMPLIFIED.  For thorough understanding, please refer to the
2760    original document of ISO2022.  This is equivalent to the standard
2761    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2762
2763    ISO2022 provides many mechanisms to encode several character sets
2764    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2765    is encoded using bytes less than 128.  This may make the encoded
2766    text a little bit longer, but the text passes more easily through
2767    several types of gateway, some of which strip off the MSB (Most
2768    Significant Bit).
2769
2770    There are two kinds of character sets: control character sets and
2771    graphic character sets.  The former contain control characters such
2772    as `newline' and `escape' to provide control functions (control
2773    functions are also provided by escape sequences).  The latter
2774    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2775    two control character sets and many graphic character sets.
2776
2777    Graphic character sets are classified into one of the following
2778    four classes, according to the number of bytes (DIMENSION) and
2779    number of characters in one dimension (CHARS) of the set:
2780    - DIMENSION1_CHARS94
2781    - DIMENSION1_CHARS96
2782    - DIMENSION2_CHARS94
2783    - DIMENSION2_CHARS96
2784
2785    In addition, each character set is assigned an identification tag,
2786    unique for each set, called the "final character" (denoted as <F>
2787    hereafter).  The <F> of each character set is decided by ECMA(*)
2788    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2789    (0x30..0x3F are for private use only).
2790
2791    Note (*): ECMA = European Computer Manufacturers Association
2792
2793    Here are examples of graphic character sets [NAME(<F>)]:
2794         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2795         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2796         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2797         o DIMENSION2_CHARS96 -- none for the moment
2798
2799    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2800         C0 [0x00..0x1F] -- control character plane 0
2801         GL [0x20..0x7F] -- graphic character plane 0
2802         C1 [0x80..0x9F] -- control character plane 1
2803         GR [0xA0..0xFF] -- graphic character plane 1
2804
2805    A control character set is directly designated and invoked to C0 or
2806    C1 by an escape sequence.  The most common case is that:
2807    - ISO646's  control character set is designated/invoked to C0, and
2808    - ISO6429's control character set is designated/invoked to C1,
2809    and usually these designations/invocations are omitted in encoded
2810    text.  In a 7-bit environment, only C0 can be used, and a control
2811    character for C1 is encoded by an appropriate escape sequence to
2812    fit into the environment.  All control characters for C1 are
2813    defined to have corresponding escape sequences.
2814
2815    A graphic character set is at first designated to one of four
2816    graphic registers (G0 through G3), then these graphic registers are
2817    invoked to GL or GR.  These designations and invocations can be
2818    done independently.  The most common case is that G0 is invoked to
2819    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2820    these invocations and designations are omitted in encoded text.
2821    In a 7-bit environment, only GL can be used.
2822
2823    When a graphic character set of CHARS94 is invoked to GL, codes
2824    0x20 and 0x7F of the GL area work as control characters SPACE and
2825    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2826    be used.
2827
2828    There are two ways of invocation: locking-shift and single-shift.
2829    With locking-shift, the invocation lasts until the next different
2830    invocation, whereas with single-shift, the invocation affects the
2831    following character only and doesn't affect the locking-shift
2832    state.  Invocations are done by the following control characters or
2833    escape sequences:
2834
2835    ----------------------------------------------------------------------
2836    abbrev  function                  cntrl escape seq   description
2837    ----------------------------------------------------------------------
2838    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2839    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2840    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2841    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2842    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2843    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2844    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2845    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2846    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2847    ----------------------------------------------------------------------
2848    (*) These are not used by any known coding system.
2849
2850    Control characters for these functions are defined by macros
2851    ISO_CODE_XXX in `coding.h'.
2852
2853    Designations are done by the following escape sequences:
2854    ----------------------------------------------------------------------
2855    escape sequence      description
2856    ----------------------------------------------------------------------
2857    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2858    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2859    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2860    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2861    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2862    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2863    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2864    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2865    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2866    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2867    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2868    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2869    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2870    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2871    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2872    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2873    ----------------------------------------------------------------------
2874
2875    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2876    of dimension 1, chars 94, and final character <F>, etc...
2877
2878    Note (*): Although these designations are not allowed in ISO2022,
2879    Emacs accepts them on decoding, and produces them on encoding
2880    CHARS96 character sets in a coding system which is characterized as
2881    7-bit environment, non-locking-shift, and non-single-shift.
2882
2883    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2884    '(' must be omitted.  We refer to this as "short-form" hereafter.
2885
2886    Now you may notice that there are a lot of ways of encoding the
2887    same multilingual text in ISO2022.  Actually, there exist many
2888    coding systems such as Compound Text (used in X11's inter client
2889    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2890    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2891    localized platforms), and all of these are variants of ISO2022.
2892
2893    In addition to the above, Emacs handles two more kinds of escape
2894    sequences: ISO6429's direction specification and Emacs' private
2895    sequence for specifying character composition.
2896
2897    ISO6429's direction specification takes the following form:
2898         o CSI ']'      -- end of the current direction
2899         o CSI '0' ']'  -- end of the current direction
2900         o CSI '1' ']'  -- start of left-to-right text
2901         o CSI '2' ']'  -- start of right-to-left text
2902    The control character CSI (0x9B: control sequence introducer) is
2903    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2904
2905    Character composition specification takes the following form:
2906         o ESC '0' -- start relative composition
2907         o ESC '1' -- end composition
2908         o ESC '2' -- start rule-base composition (*)
2909         o ESC '3' -- start relative composition with alternate chars  (**)
2910         o ESC '4' -- start rule-base composition with alternate chars  (**)
2911   Since these are not standard escape sequences of any ISO standard,
2912   the use of them with these meanings is restricted to Emacs only.
2913
2914   (*) This form is used only in Emacs 20.7 and older versions,
2915   but newer versions can safely decode it.
2916   (**) This form is used only in Emacs 21.1 and newer versions,
2917   and older versions can't decode it.
2918
2919   Here's a list of example usages of these composition escape
2920   sequences (categorized by `enum composition_method').
2921
2922   COMPOSITION_RELATIVE:
2923         ESC 0 CHAR [ CHAR ] ESC 1
2924   COMPOSITION_WITH_RULE:
2925         ESC 2 CHAR [ RULE CHAR ] ESC 1
2926   COMPOSITION_WITH_ALTCHARS:
2927         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2928   COMPOSITION_WITH_RULE_ALTCHARS:
2929         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2930
2931 static enum iso_code_class_type iso_code_class[256];
2932
2933 #define SAFE_CHARSET_P(coding, id)      \
2934   ((id) <= (coding)->max_charset_id     \
2935    && (coding)->safe_charsets[id] != 255)
2936
2937 static void
2938 setup_iso_safe_charsets (Lisp_Object attrs)
2939 {
2940   Lisp_Object charset_list, safe_charsets;
2941   Lisp_Object request;
2942   Lisp_Object reg_usage;
2943   Lisp_Object tail;
2944   EMACS_INT reg94, reg96;
2945   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2946   int max_charset_id;
2947
2948   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2949   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2950       && ! EQ (charset_list, Viso_2022_charset_list))
2951     {
2952       charset_list = Viso_2022_charset_list;
2953       ASET (attrs, coding_attr_charset_list, charset_list);
2954       ASET (attrs, coding_attr_safe_charsets, Qnil);
2955     }
2956
2957   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2958     return;
2959
2960   max_charset_id = 0;
2961   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2962     {
2963       int id = XINT (XCAR (tail));
2964       if (max_charset_id < id)
2965         max_charset_id = id;
2966     }
2967
2968   safe_charsets = make_uninit_string (max_charset_id + 1);
2969   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2970   request = AREF (attrs, coding_attr_iso_request);
2971   reg_usage = AREF (attrs, coding_attr_iso_usage);
2972   reg94 = XINT (XCAR (reg_usage));
2973   reg96 = XINT (XCDR (reg_usage));
2974
2975   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2976     {
2977       Lisp_Object id;
2978       Lisp_Object reg;
2979       struct charset *charset;
2980
2981       id = XCAR (tail);
2982       charset = CHARSET_FROM_ID (XINT (id));
2983       reg = Fcdr (Fassq (id, request));
2984       if (! NILP (reg))
2985         SSET (safe_charsets, XINT (id), XINT (reg));
2986       else if (charset->iso_chars_96)
2987         {
2988           if (reg96 < 4)
2989             SSET (safe_charsets, XINT (id), reg96);
2990         }
2991       else
2992         {
2993           if (reg94 < 4)
2994             SSET (safe_charsets, XINT (id), reg94);
2995         }
2996     }
2997   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2998 }
2999
3000
3001 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3002    Return true if a text is encoded in one of ISO-2022 based coding
3003    systems.  */
3004
3005 static bool
3006 detect_coding_iso_2022 (struct coding_system *coding,
3007                         struct coding_detection_info *detect_info)
3008 {
3009   const unsigned char *src = coding->source, *src_base = src;
3010   const unsigned char *src_end = coding->source + coding->src_bytes;
3011   bool multibytep = coding->src_multibyte;
3012   bool single_shifting = 0;
3013   int id;
3014   int c, c1;
3015   ptrdiff_t consumed_chars = 0;
3016   int i;
3017   int rejected = 0;
3018   int found = 0;
3019   int composition_count = -1;
3020
3021   detect_info->checked |= CATEGORY_MASK_ISO;
3022
3023   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3024     {
3025       struct coding_system *this = &(coding_categories[i]);
3026       Lisp_Object attrs, val;
3027
3028       if (this->id < 0)
3029         continue;
3030       attrs = CODING_ID_ATTRS (this->id);
3031       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3032           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3033         setup_iso_safe_charsets (attrs);
3034       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3035       this->max_charset_id = SCHARS (val) - 1;
3036       this->safe_charsets = SDATA (val);
3037     }
3038
3039   /* A coding system of this category is always ASCII compatible.  */
3040   src += coding->head_ascii;
3041
3042   while (rejected != CATEGORY_MASK_ISO)
3043     {
3044       src_base = src;
3045       ONE_MORE_BYTE (c);
3046       switch (c)
3047         {
3048         case ISO_CODE_ESC:
3049           if (inhibit_iso_escape_detection)
3050             break;
3051           single_shifting = 0;
3052           ONE_MORE_BYTE (c);
3053           if (c == 'N' || c == 'O')
3054             {
3055               /* ESC <Fe> for SS2 or SS3.  */
3056               single_shifting = 1;
3057               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3058             }
3059           else if (c == '1')
3060             {
3061               /* End of composition.  */
3062               if (composition_count < 0
3063                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3064                 /* Invalid */
3065                 break;
3066               composition_count = -1;
3067               found |= CATEGORY_MASK_ISO;
3068             }
3069           else if (c >= '0' && c <= '4')
3070             {
3071               /* ESC <Fp> for start/end composition.  */
3072               composition_count = 0;
3073             }
3074           else
3075             {
3076               if (c >= '(' && c <= '/')
3077                 {
3078                   /* Designation sequence for a charset of dimension 1.  */
3079                   ONE_MORE_BYTE (c1);
3080                   if (c1 < ' ' || c1 >= 0x80
3081                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3082                     {
3083                       /* Invalid designation sequence.  Just ignore.  */
3084                       if (c1 >= 0x80)
3085                         rejected |= (CATEGORY_MASK_ISO_7BIT
3086                                      | CATEGORY_MASK_ISO_7_ELSE);
3087                       break;
3088                     }
3089                 }
3090               else if (c == '$')
3091                 {
3092                   /* Designation sequence for a charset of dimension 2.  */
3093                   ONE_MORE_BYTE (c);
3094                   if (c >= '@' && c <= 'B')
3095                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3096                     id = iso_charset_table[1][0][c];
3097                   else if (c >= '(' && c <= '/')
3098                     {
3099                       ONE_MORE_BYTE (c1);
3100                       if (c1 < ' ' || c1 >= 0x80
3101                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3102                         {
3103                           /* Invalid designation sequence.  Just ignore.  */
3104                           if (c1 >= 0x80)
3105                             rejected |= (CATEGORY_MASK_ISO_7BIT
3106                                          | CATEGORY_MASK_ISO_7_ELSE);
3107                           break;
3108                         }
3109                     }
3110                   else
3111                     {
3112                       /* Invalid designation sequence.  Just ignore it.  */
3113                       if (c >= 0x80)
3114                         rejected |= (CATEGORY_MASK_ISO_7BIT
3115                                      | CATEGORY_MASK_ISO_7_ELSE);
3116                       break;
3117                     }
3118                 }
3119               else
3120                 {
3121                   /* Invalid escape sequence.  Just ignore it.  */
3122                   if (c >= 0x80)
3123                     rejected |= (CATEGORY_MASK_ISO_7BIT
3124                                  | CATEGORY_MASK_ISO_7_ELSE);
3125                   break;
3126                 }
3127
3128               /* We found a valid designation sequence for CHARSET.  */
3129               rejected |= CATEGORY_MASK_ISO_8BIT;
3130               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3131                                   id))
3132                 found |= CATEGORY_MASK_ISO_7;
3133               else
3134                 rejected |= CATEGORY_MASK_ISO_7;
3135               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3136                                   id))
3137                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3138               else
3139                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3140               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3141                                   id))
3142                 found |= CATEGORY_MASK_ISO_7_ELSE;
3143               else
3144                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3145               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3146                                   id))
3147                 found |= CATEGORY_MASK_ISO_8_ELSE;
3148               else
3149                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3150             }
3151           break;
3152
3153         case ISO_CODE_SO:
3154         case ISO_CODE_SI:
3155           /* Locking shift out/in.  */
3156           if (inhibit_iso_escape_detection)
3157             break;
3158           single_shifting = 0;
3159           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3160           break;
3161
3162         case ISO_CODE_CSI:
3163           /* Control sequence introducer.  */
3164           single_shifting = 0;
3165           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3166           found |= CATEGORY_MASK_ISO_8_ELSE;
3167           goto check_extra_latin;
3168
3169         case ISO_CODE_SS2:
3170         case ISO_CODE_SS3:
3171           /* Single shift.   */
3172           if (inhibit_iso_escape_detection)
3173             break;
3174           single_shifting = 0;
3175           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3176           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3177               & CODING_ISO_FLAG_SINGLE_SHIFT)
3178             {
3179               found |= CATEGORY_MASK_ISO_8_1;
3180               single_shifting = 1;
3181             }
3182           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3183               & CODING_ISO_FLAG_SINGLE_SHIFT)
3184             {
3185               found |= CATEGORY_MASK_ISO_8_2;
3186               single_shifting = 1;
3187             }
3188           if (single_shifting)
3189             break;
3190           goto check_extra_latin;
3191
3192         default:
3193           if (c < 0)
3194             continue;
3195           if (c < 0x80)
3196             {
3197               if (composition_count >= 0)
3198                 composition_count++;
3199               single_shifting = 0;
3200               break;
3201             }
3202           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3203           if (c >= 0xA0)
3204             {
3205               found |= CATEGORY_MASK_ISO_8_1;
3206               /* Check the length of succeeding codes of the range
3207                  0xA0..0FF.  If the byte length is even, we include
3208                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3209                  only when we are not single shifting.  */
3210               if (! single_shifting
3211                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3212                 {
3213                   ptrdiff_t len = 1;
3214                   while (src < src_end)
3215                     {
3216                       src_base = src;
3217                       ONE_MORE_BYTE (c);
3218                       if (c < 0xA0)
3219                         {
3220                           src = src_base;
3221                           break;
3222                         }
3223                       len++;
3224                     }
3225
3226                   if (len & 1 && src < src_end)
3227                     {
3228                       rejected |= CATEGORY_MASK_ISO_8_2;
3229                       if (composition_count >= 0)
3230                         composition_count += len;
3231                     }
3232                   else
3233                     {
3234                       found |= CATEGORY_MASK_ISO_8_2;
3235                       if (composition_count >= 0)
3236                         composition_count += len / 2;
3237                     }
3238                 }
3239               break;
3240             }
3241         check_extra_latin:
3242           if (! VECTORP (Vlatin_extra_code_table)
3243               || NILP (AREF (Vlatin_extra_code_table, c)))
3244             {
3245               rejected = CATEGORY_MASK_ISO;
3246               break;
3247             }
3248           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3249               & CODING_ISO_FLAG_LATIN_EXTRA)
3250             found |= CATEGORY_MASK_ISO_8_1;
3251           else
3252             rejected |= CATEGORY_MASK_ISO_8_1;
3253           rejected |= CATEGORY_MASK_ISO_8_2;
3254           break;
3255         }
3256     }
3257   detect_info->rejected |= CATEGORY_MASK_ISO;
3258   return 0;
3259
3260  no_more_source:
3261   detect_info->rejected |= rejected;
3262   detect_info->found |= (found & ~rejected);
3263   return 1;
3264 }
3265
3266
3267 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3268    escape sequence should be kept.  */
3269 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3270   do {                                                                  \
3271     int id, prev;                                                       \
3272                                                                         \
3273     if (final < '0' || final >= 128                                     \
3274         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3275         || !SAFE_CHARSET_P (coding, id))                                \
3276       {                                                                 \
3277         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3278         chars_96 = -1;                                                  \
3279         break;                                                          \
3280       }                                                                 \
3281     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3282     if (id == charset_jisx0201_roman)                                   \
3283       {                                                                 \
3284         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3285           id = charset_ascii;                                           \
3286       }                                                                 \
3287     else if (id == charset_jisx0208_1978)                               \
3288       {                                                                 \
3289         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3290           id = charset_jisx0208;                                        \
3291       }                                                                 \
3292     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3293     /* If there was an invalid designation to REG previously, and this  \
3294        designation is ASCII to REG, we should keep this designation     \
3295        sequence.  */                                                    \
3296     if (prev == -2 && id == charset_ascii)                              \
3297       chars_96 = -1;                                                    \
3298   } while (0)
3299
3300
3301 /* Handle these composition sequence (ALT: alternate char):
3302
3303    (1) relative composition: ESC 0 CHAR ... ESC 1
3304    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3305    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3306    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3307
3308    When the start sequence (ESC 0/2/3/4) is found, this annotation
3309    header is produced.
3310
3311         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3312
3313    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3314    produced until the end sequence (ESC 1) is found:
3315
3316    (1) CHAR ... CHAR
3317    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3318    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3319    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3320
3321    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3322    annotation header is updated as below:
3323
3324    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3325    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3326    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3327    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3328
3329    If an error is found while composing, the annotation header is
3330    changed to:
3331
3332         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3333
3334    and the sequence [ -2 DECODED-RULE ] is changed to the original
3335    byte sequence as below:
3336         o the original byte sequence is B: [ B -1 ]
3337         o the original byte sequence is B1 B2: [ B1 B2 ]
3338    and the sequence [ -1 -1 ] is changed to the original byte
3339    sequence:
3340         [ ESC '0' ]
3341 */
3342
3343 /* Decode a composition rule C1 and maybe one more byte from the
3344    source, and set RULE to the encoded composition rule.  If the rule
3345    is invalid, goto invalid_code.  */
3346
3347 #define DECODE_COMPOSITION_RULE(rule)                                   \
3348   do {                                                                  \
3349     rule = c1 - 32;                                                     \
3350     if (rule < 0)                                                       \
3351       goto invalid_code;                                                \
3352     if (rule < 81)              /* old format (before ver.21) */        \
3353       {                                                                 \
3354         int gref = (rule) / 9;                                          \
3355         int nref = (rule) % 9;                                          \
3356         if (gref == 4) gref = 10;                                       \
3357         if (nref == 4) nref = 10;                                       \
3358         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3359       }                                                                 \
3360     else                        /* new format (after ver.21) */         \
3361       {                                                                 \
3362         int b;                                                          \
3363                                                                         \
3364         ONE_MORE_BYTE (b);                                              \
3365         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3366           goto invalid_code;                                            \
3367         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3368         rule += 0x100;   /* Distinguish it from the old format.  */     \
3369       }                                                                 \
3370   } while (0)
3371
3372 #define ENCODE_COMPOSITION_RULE(rule)                           \
3373   do {                                                          \
3374     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3375                                                                 \
3376     if (rule < 0x100)           /* old format */                \
3377       {                                                         \
3378         if (gref == 10) gref = 4;                               \
3379         if (nref == 10) nref = 4;                               \
3380         charbuf[idx] = 32 + gref * 9 + nref;                    \
3381         charbuf[idx + 1] = -1;                                  \
3382         new_chars++;                                            \
3383       }                                                         \
3384     else                                /* new format */        \
3385       {                                                         \
3386         charbuf[idx] = 32 + 81 + gref;                          \
3387         charbuf[idx + 1] = 32 + nref;                           \
3388         new_chars += 2;                                         \
3389       }                                                         \
3390   } while (0)
3391
3392 /* Finish the current composition as invalid.  */
3393
3394 static int
3395 finish_composition (int *charbuf, struct composition_status *cmp_status)
3396 {
3397   int idx = - cmp_status->length;
3398   int new_chars;
3399
3400   /* Recover the original ESC sequence */
3401   charbuf[idx++] = ISO_CODE_ESC;
3402   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3403                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3404                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3405                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3406                     : '4');
3407   charbuf[idx++] = -2;
3408   charbuf[idx++] = 0;
3409   charbuf[idx++] = -1;
3410   new_chars = cmp_status->nchars;
3411   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3412     for (; idx < 0; idx++)
3413       {
3414         int elt = charbuf[idx];
3415
3416         if (elt == -2)
3417           {
3418             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3419             idx++;
3420           }
3421         else if (elt == -1)
3422           {
3423             charbuf[idx++] = ISO_CODE_ESC;
3424             charbuf[idx] = '0';
3425             new_chars += 2;
3426           }
3427       }
3428   cmp_status->state = COMPOSING_NO;
3429   return new_chars;
3430 }
3431
3432 /* If characters are under composition, finish the composition.  */
3433 #define MAYBE_FINISH_COMPOSITION()                              \
3434   do {                                                          \
3435     if (cmp_status->state != COMPOSING_NO)                      \
3436       char_offset += finish_composition (charbuf, cmp_status);  \
3437   } while (0)
3438
3439 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3440
3441    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3442    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3443    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3444    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3445
3446    Produce this annotation sequence now:
3447
3448    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3449 */
3450
3451 #define DECODE_COMPOSITION_START(c1)                                       \
3452   do {                                                                     \
3453     if (c1 == '0'                                                          \
3454         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3455              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3456             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3457                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3458       {                                                                    \
3459         *charbuf++ = -1;                                                   \
3460         *charbuf++= -1;                                                    \
3461         cmp_status->state = COMPOSING_CHAR;                                \
3462         cmp_status->length += 2;                                           \
3463       }                                                                    \
3464     else                                                                   \
3465       {                                                                    \
3466         MAYBE_FINISH_COMPOSITION ();                                       \
3467         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3468                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3469                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3470                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3471         cmp_status->state                                                  \
3472           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3473         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3474         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3475         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3476         coding->annotated = 1;                                             \
3477       }                                                                    \
3478   } while (0)
3479
3480
3481 /* Handle composition end sequence ESC 1.  */
3482
3483 #define DECODE_COMPOSITION_END()                                        \
3484   do {                                                                  \
3485     if (cmp_status->nchars == 0                                         \
3486         || ((cmp_status->state == COMPOSING_CHAR)                       \
3487             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3488       {                                                                 \
3489         MAYBE_FINISH_COMPOSITION ();                                    \
3490         goto invalid_code;                                              \
3491       }                                                                 \
3492     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3493       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3494     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3495       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3496     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3497     char_offset += cmp_status->nchars;                                  \
3498     cmp_status->state = COMPOSING_NO;                                   \
3499   } while (0)
3500
3501 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3502
3503 #define STORE_COMPOSITION_RULE(rule)    \
3504   do {                                  \
3505     *charbuf++ = -2;                    \
3506     *charbuf++ = rule;                  \
3507     cmp_status->length += 2;            \
3508     cmp_status->state--;                \
3509   } while (0)
3510
3511 /* Store a composed char or a component char C in charbuf, and update
3512    cmp_status.  */
3513
3514 #define STORE_COMPOSITION_CHAR(c)                                       \
3515   do {                                                                  \
3516     *charbuf++ = (c);                                                   \
3517     cmp_status->length++;                                               \
3518     if (cmp_status->state == COMPOSING_CHAR)                            \
3519       cmp_status->nchars++;                                             \
3520     else                                                                \
3521       cmp_status->ncomps++;                                             \
3522     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3523         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3524             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3525       cmp_status->state++;                                              \
3526   } while (0)
3527
3528
3529 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3530
3531 static void
3532 decode_coding_iso_2022 (struct coding_system *coding)
3533 {
3534   const unsigned char *src = coding->source + coding->consumed;
3535   const unsigned char *src_end = coding->source + coding->src_bytes;
3536   const unsigned char *src_base;
3537   int *charbuf = coding->charbuf + coding->charbuf_used;
3538   /* We may produce two annotations (charset and composition) in one
3539      loop and one more charset annotation at the end.  */
3540   int *charbuf_end
3541     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3542   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3543   bool multibytep = coding->src_multibyte;
3544   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3545   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3546   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3547   int charset_id_2, charset_id_3;
3548   struct charset *charset;
3549   int c;
3550   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3551   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3552   ptrdiff_t char_offset = coding->produced_char;
3553   ptrdiff_t last_offset = char_offset;
3554   int last_id = charset_ascii;
3555   bool eol_dos
3556     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3557   int byte_after_cr = -1;
3558   int i;
3559
3560   setup_iso_safe_charsets (attrs);
3561   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3562
3563   if (cmp_status->state != COMPOSING_NO)
3564     {
3565       if (charbuf_end - charbuf < cmp_status->length)
3566         emacs_abort ();
3567       for (i = 0; i < cmp_status->length; i++)
3568         *charbuf++ = cmp_status->carryover[i];
3569       coding->annotated = 1;
3570     }
3571
3572   while (1)
3573     {
3574       int c1, c2, c3;
3575
3576       src_base = src;
3577       consumed_chars_base = consumed_chars;
3578
3579       if (charbuf >= charbuf_end)
3580         {
3581           if (byte_after_cr >= 0)
3582             src_base--;
3583           break;
3584         }
3585
3586       if (byte_after_cr >= 0)
3587         c1 = byte_after_cr, byte_after_cr = -1;
3588       else
3589         ONE_MORE_BYTE (c1);
3590       if (c1 < 0)
3591         goto invalid_code;
3592
3593       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3594         {
3595           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3596           char_offset++;
3597           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3598           continue;
3599         }
3600
3601       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3602         {
3603           if (c1 == ISO_CODE_ESC)
3604             {
3605               if (src + 1 >= src_end)
3606                 goto no_more_source;
3607               *charbuf++ = ISO_CODE_ESC;
3608               char_offset++;
3609               if (src[0] == '%' && src[1] == '@')
3610                 {
3611                   src += 2;
3612                   consumed_chars += 2;
3613                   char_offset += 2;
3614                   /* We are sure charbuf can contain two more chars. */
3615                   *charbuf++ = '%';
3616                   *charbuf++ = '@';
3617                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3618                 }
3619             }
3620           else
3621             {
3622               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3623               char_offset++;
3624             }
3625           continue;
3626         }
3627
3628       if ((cmp_status->state == COMPOSING_RULE
3629            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3630           && c1 != ISO_CODE_ESC)
3631         {
3632           int rule;
3633
3634           DECODE_COMPOSITION_RULE (rule);
3635           STORE_COMPOSITION_RULE (rule);
3636           continue;
3637         }
3638
3639       /* We produce at most one character.  */
3640       switch (iso_code_class [c1])
3641         {
3642         case ISO_0x20_or_0x7F:
3643           if (charset_id_0 < 0
3644               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3645             /* This is SPACE or DEL.  */
3646             charset = CHARSET_FROM_ID (charset_ascii);
3647           else
3648             charset = CHARSET_FROM_ID (charset_id_0);
3649           break;
3650
3651         case ISO_graphic_plane_0:
3652           if (charset_id_0 < 0)
3653             charset = CHARSET_FROM_ID (charset_ascii);
3654           else
3655             charset = CHARSET_FROM_ID (charset_id_0);
3656           break;
3657
3658         case ISO_0xA0_or_0xFF:
3659           if (charset_id_1 < 0
3660               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3661               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3662             goto invalid_code;
3663           /* This is a graphic character, we fall down ... */
3664
3665         case ISO_graphic_plane_1:
3666           if (charset_id_1 < 0)
3667             goto invalid_code;
3668           charset = CHARSET_FROM_ID (charset_id_1);
3669           break;
3670
3671         case ISO_control_0:
3672           if (eol_dos && c1 == '\r')
3673             ONE_MORE_BYTE (byte_after_cr);
3674           MAYBE_FINISH_COMPOSITION ();
3675           charset = CHARSET_FROM_ID (charset_ascii);
3676           break;
3677
3678         case ISO_control_1:
3679           goto invalid_code;
3680
3681         case ISO_shift_out:
3682           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3683               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3684             goto invalid_code;
3685           CODING_ISO_INVOCATION (coding, 0) = 1;
3686           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3687           continue;
3688
3689         case ISO_shift_in:
3690           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3691             goto invalid_code;
3692           CODING_ISO_INVOCATION (coding, 0) = 0;
3693           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3694           continue;
3695
3696         case ISO_single_shift_2_7:
3697           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3698             goto invalid_code;
3699         case ISO_single_shift_2:
3700           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3701             goto invalid_code;
3702           /* SS2 is handled as an escape sequence of ESC 'N' */
3703           c1 = 'N';
3704           goto label_escape_sequence;
3705
3706         case ISO_single_shift_3:
3707           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3708             goto invalid_code;
3709           /* SS2 is handled as an escape sequence of ESC 'O' */
3710           c1 = 'O';
3711           goto label_escape_sequence;
3712
3713         case ISO_control_sequence_introducer:
3714           /* CSI is handled as an escape sequence of ESC '[' ...  */
3715           c1 = '[';
3716           goto label_escape_sequence;
3717
3718         case ISO_escape:
3719           ONE_MORE_BYTE (c1);
3720         label_escape_sequence:
3721           /* Escape sequences handled here are invocation,
3722              designation, direction specification, and character
3723              composition specification.  */
3724           switch (c1)
3725             {
3726             case '&':           /* revision of following character set */
3727               ONE_MORE_BYTE (c1);
3728               if (!(c1 >= '@' && c1 <= '~'))
3729                 goto invalid_code;
3730               ONE_MORE_BYTE (c1);
3731               if (c1 != ISO_CODE_ESC)
3732                 goto invalid_code;
3733               ONE_MORE_BYTE (c1);
3734               goto label_escape_sequence;
3735
3736             case '$':           /* designation of 2-byte character set */
3737               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3738                 goto invalid_code;
3739               {
3740                 int reg, chars96;
3741
3742                 ONE_MORE_BYTE (c1);
3743                 if (c1 >= '@' && c1 <= 'B')
3744                   {     /* designation of JISX0208.1978, GB2312.1980,
3745                            or JISX0208.1980 */
3746                     reg = 0, chars96 = 0;
3747                   }
3748                 else if (c1 >= 0x28 && c1 <= 0x2B)
3749                   { /* designation of DIMENSION2_CHARS94 character set */
3750                     reg = c1 - 0x28, chars96 = 0;
3751                     ONE_MORE_BYTE (c1);
3752                   }
3753                 else if (c1 >= 0x2C && c1 <= 0x2F)
3754                   { /* designation of DIMENSION2_CHARS96 character set */
3755                     reg = c1 - 0x2C, chars96 = 1;
3756                     ONE_MORE_BYTE (c1);
3757                   }
3758                 else
3759                   goto invalid_code;
3760                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3761                 /* We must update these variables now.  */
3762                 if (reg == 0)
3763                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3764                 else if (reg == 1)
3765                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3766                 if (chars96 < 0)
3767                   goto invalid_code;
3768               }
3769               continue;
3770
3771             case 'n':           /* invocation of locking-shift-2 */
3772               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3773                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3774                 goto invalid_code;
3775               CODING_ISO_INVOCATION (coding, 0) = 2;
3776               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3777               continue;
3778
3779             case 'o':           /* invocation of locking-shift-3 */
3780               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3781                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3782                 goto invalid_code;
3783               CODING_ISO_INVOCATION (coding, 0) = 3;
3784               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3785               continue;
3786
3787             case 'N':           /* invocation of single-shift-2 */
3788               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3789                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3790                 goto invalid_code;
3791               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3792               if (charset_id_2 < 0)
3793                 charset = CHARSET_FROM_ID (charset_ascii);
3794               else
3795                 charset = CHARSET_FROM_ID (charset_id_2);
3796               ONE_MORE_BYTE (c1);
3797               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3798                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3799                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3800                           ? c1 >= 0x80 : c1 < 0x80)))
3801                 goto invalid_code;
3802               break;
3803
3804             case 'O':           /* invocation of single-shift-3 */
3805               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3806                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3807                 goto invalid_code;
3808               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3809               if (charset_id_3 < 0)
3810                 charset = CHARSET_FROM_ID (charset_ascii);
3811               else
3812                 charset = CHARSET_FROM_ID (charset_id_3);
3813               ONE_MORE_BYTE (c1);
3814               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3815                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3816                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3817                           ? c1 >= 0x80 : c1 < 0x80)))
3818                 goto invalid_code;
3819               break;
3820
3821             case '0': case '2': case '3': case '4': /* start composition */
3822               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3823                 goto invalid_code;
3824               if (last_id != charset_ascii)
3825                 {
3826                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3827                   last_id = charset_ascii;
3828                   last_offset = char_offset;
3829                 }
3830               DECODE_COMPOSITION_START (c1);
3831               continue;
3832
3833             case '1':           /* end composition */
3834               if (cmp_status->state == COMPOSING_NO)
3835                 goto invalid_code;
3836               DECODE_COMPOSITION_END ();
3837               continue;
3838
3839             case '[':           /* specification of direction */
3840               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3841                 goto invalid_code;
3842               /* For the moment, nested direction is not supported.
3843                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3844                  left-to-right, and nonzero means right-to-left.  */
3845               ONE_MORE_BYTE (c1);
3846               switch (c1)
3847                 {
3848                 case ']':       /* end of the current direction */
3849                   coding->mode &= ~CODING_MODE_DIRECTION;
3850
3851                 case '0':       /* end of the current direction */
3852                 case '1':       /* start of left-to-right direction */
3853                   ONE_MORE_BYTE (c1);
3854                   if (c1 == ']')
3855                     coding->mode &= ~CODING_MODE_DIRECTION;
3856                   else
3857                     goto invalid_code;
3858                   break;
3859
3860                 case '2':       /* start of right-to-left direction */
3861                   ONE_MORE_BYTE (c1);
3862                   if (c1 == ']')
3863                     coding->mode |= CODING_MODE_DIRECTION;
3864                   else
3865                     goto invalid_code;
3866                   break;
3867
3868                 default:
3869                   goto invalid_code;
3870                 }
3871               continue;
3872
3873             case '%':
3874               ONE_MORE_BYTE (c1);
3875               if (c1 == '/')
3876                 {
3877                   /* CTEXT extended segment:
3878                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3879                      We keep these bytes as is for the moment.
3880                      They may be decoded by post-read-conversion.  */
3881                   int dim, M, L;
3882                   int size;
3883
3884                   ONE_MORE_BYTE (dim);
3885                   if (dim < '0' || dim > '4')
3886                     goto invalid_code;
3887                   ONE_MORE_BYTE (M);
3888                   if (M < 128)
3889                     goto invalid_code;
3890                   ONE_MORE_BYTE (L);
3891                   if (L < 128)
3892                     goto invalid_code;
3893                   size = ((M - 128) * 128) + (L - 128);
3894                   if (charbuf + 6 > charbuf_end)
3895                     goto break_loop;
3896                   *charbuf++ = ISO_CODE_ESC;
3897                   *charbuf++ = '%';
3898                   *charbuf++ = '/';
3899                   *charbuf++ = dim;
3900                   *charbuf++ = BYTE8_TO_CHAR (M);
3901                   *charbuf++ = BYTE8_TO_CHAR (L);
3902                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3903                 }
3904               else if (c1 == 'G')
3905                 {
3906                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3907                      ESC % G --UTF-8-BYTES-- ESC % @
3908                      We keep these bytes as is for the moment.
3909                      They may be decoded by post-read-conversion.  */
3910                   if (charbuf + 3 > charbuf_end)
3911                     goto break_loop;
3912                   *charbuf++ = ISO_CODE_ESC;
3913                   *charbuf++ = '%';
3914                   *charbuf++ = 'G';
3915                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3916                 }
3917               else
3918                 goto invalid_code;
3919               continue;
3920               break;
3921
3922             default:
3923               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3924                 goto invalid_code;
3925               {
3926                 int reg, chars96;
3927
3928                 if (c1 >= 0x28 && c1 <= 0x2B)
3929                   { /* designation of DIMENSION1_CHARS94 character set */
3930                     reg = c1 - 0x28, chars96 = 0;
3931                     ONE_MORE_BYTE (c1);
3932                   }
3933                 else if (c1 >= 0x2C && c1 <= 0x2F)
3934                   { /* designation of DIMENSION1_CHARS96 character set */
3935                     reg = c1 - 0x2C, chars96 = 1;
3936                     ONE_MORE_BYTE (c1);
3937                   }
3938                 else
3939                   goto invalid_code;
3940                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3941                 /* We must update these variables now.  */
3942                 if (reg == 0)
3943                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3944                 else if (reg == 1)
3945                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3946                 if (chars96 < 0)
3947                   goto invalid_code;
3948               }
3949               continue;
3950             }
3951           break;
3952
3953         default:
3954           emacs_abort ();
3955         }
3956
3957       if (cmp_status->state == COMPOSING_NO
3958           && charset->id != charset_ascii
3959           && last_id != charset->id)
3960         {
3961           if (last_id != charset_ascii)
3962             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3963           last_id = charset->id;
3964           last_offset = char_offset;
3965         }
3966
3967       /* Now we know CHARSET and 1st position code C1 of a character.
3968          Produce a decoded character while getting 2nd and 3rd
3969          position codes C2, C3 if necessary.  */
3970       if (CHARSET_DIMENSION (charset) > 1)
3971         {
3972           ONE_MORE_BYTE (c2);
3973           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3974               || ((c1 & 0x80) != (c2 & 0x80)))
3975             /* C2 is not in a valid range.  */
3976             goto invalid_code;
3977           if (CHARSET_DIMENSION (charset) == 2)
3978             c1 = (c1 << 8) | c2;
3979           else
3980             {
3981               ONE_MORE_BYTE (c3);
3982               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3983                   || ((c1 & 0x80) != (c3 & 0x80)))
3984                 /* C3 is not in a valid range.  */
3985                 goto invalid_code;
3986               c1 = (c1 << 16) | (c2 << 8) | c2;
3987             }
3988         }
3989       c1 &= 0x7F7F7F;
3990       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3991       if (c < 0)
3992         {
3993           MAYBE_FINISH_COMPOSITION ();
3994           for (; src_base < src; src_base++, char_offset++)
3995             {
3996               if (ASCII_BYTE_P (*src_base))
3997                 *charbuf++ = *src_base;
3998               else
3999                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4000             }
4001         }
4002       else if (cmp_status->state == COMPOSING_NO)
4003         {
4004           *charbuf++ = c;
4005           char_offset++;
4006         }
4007       else if ((cmp_status->state == COMPOSING_CHAR
4008                 ? cmp_status->nchars
4009                 : cmp_status->ncomps)
4010                >= MAX_COMPOSITION_COMPONENTS)
4011         {
4012           /* Too long composition.  */
4013           MAYBE_FINISH_COMPOSITION ();
4014           *charbuf++ = c;
4015           char_offset++;
4016         }
4017       else
4018         STORE_COMPOSITION_CHAR (c);
4019       continue;
4020
4021     invalid_code:
4022       MAYBE_FINISH_COMPOSITION ();
4023       src = src_base;
4024       consumed_chars = consumed_chars_base;
4025       ONE_MORE_BYTE (c);
4026       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4027       char_offset++;
4028       coding->errors++;
4029       /* Reset the invocation and designation status to the safest
4030          one; i.e. designate ASCII to the graphic register 0, and
4031          invoke that register to the graphic plane 0.  This typically
4032          helps the case that an designation sequence for ASCII "ESC (
4033          B" is somehow broken (e.g. broken by a newline).  */
4034       CODING_ISO_INVOCATION (coding, 0) = 0;
4035       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4036       charset_id_0 = charset_ascii;
4037       continue;
4038
4039     break_loop:
4040       break;
4041     }
4042
4043  no_more_source:
4044   if (cmp_status->state != COMPOSING_NO)
4045     {
4046       if (coding->mode & CODING_MODE_LAST_BLOCK)
4047         MAYBE_FINISH_COMPOSITION ();
4048       else
4049         {
4050           charbuf -= cmp_status->length;
4051           for (i = 0; i < cmp_status->length; i++)
4052             cmp_status->carryover[i] = charbuf[i];
4053         }
4054     }
4055   else if (last_id != charset_ascii)
4056     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4057   coding->consumed_char += consumed_chars_base;
4058   coding->consumed = src_base - coding->source;
4059   coding->charbuf_used = charbuf - coding->charbuf;
4060 }
4061
4062
4063 /* ISO2022 encoding stuff.  */
4064
4065 /*
4066    It is not enough to say just "ISO2022" on encoding, we have to
4067    specify more details.  In Emacs, each coding system of ISO2022
4068    variant has the following specifications:
4069         1. Initial designation to G0 thru G3.
4070         2. Allows short-form designation?
4071         3. ASCII should be designated to G0 before control characters?
4072         4. ASCII should be designated to G0 at end of line?
4073         5. 7-bit environment or 8-bit environment?
4074         6. Use locking-shift?
4075         7. Use Single-shift?
4076    And the following two are only for Japanese:
4077         8. Use ASCII in place of JIS0201-1976-Roman?
4078         9. Use JISX0208-1983 in place of JISX0208-1978?
4079    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4080    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4081    details.
4082 */
4083
4084 /* Produce codes (escape sequence) for designating CHARSET to graphic
4085    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4086    '@', 'A', or 'B' and the coding system CODING allows, produce
4087    designation sequence of short-form.  */
4088
4089 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4090   do {                                                                  \
4091     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4092     const char *intermediate_char_94 = "()*+";                          \
4093     const char *intermediate_char_96 = ",-./";                          \
4094     int revision = -1;                                                  \
4095                                                                         \
4096     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4097       revision = CHARSET_ISO_REVISION (charset);                        \
4098                                                                         \
4099     if (revision >= 0)                                                  \
4100       {                                                                 \
4101         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4102         EMIT_ONE_BYTE ('@' + revision);                                 \
4103       }                                                                 \
4104     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4105     if (CHARSET_DIMENSION (charset) == 1)                               \
4106       {                                                                 \
4107         int b;                                                          \
4108         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4109           b = intermediate_char_94[reg];                                \
4110         else                                                            \
4111           b = intermediate_char_96[reg];                                \
4112         EMIT_ONE_ASCII_BYTE (b);                                        \
4113       }                                                                 \
4114     else                                                                \
4115       {                                                                 \
4116         EMIT_ONE_ASCII_BYTE ('$');                                      \
4117         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4118           {                                                             \
4119             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4120                 || reg != 0                                             \
4121                 || final_char < '@' || final_char > 'B')                \
4122               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4123           }                                                             \
4124         else                                                            \
4125           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4126       }                                                                 \
4127     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4128                                                                         \
4129     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4130   } while (0)
4131
4132
4133 /* The following two macros produce codes (control character or escape
4134    sequence) for ISO2022 single-shift functions (single-shift-2 and
4135    single-shift-3).  */
4136
4137 #define ENCODE_SINGLE_SHIFT_2                                           \
4138   do {                                                                  \
4139     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4140       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4141     else                                                                \
4142       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4143     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4144   } while (0)
4145
4146
4147 #define ENCODE_SINGLE_SHIFT_3                                           \
4148   do {                                                                  \
4149     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4150       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4151     else                                                                \
4152       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4153     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4154   } while (0)
4155
4156
4157 /* The following four macros produce codes (control character or
4158    escape sequence) for ISO2022 locking-shift functions (shift-in,
4159    shift-out, locking-shift-2, and locking-shift-3).  */
4160
4161 #define ENCODE_SHIFT_IN                                 \
4162   do {                                                  \
4163     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4164     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4165   } while (0)
4166
4167
4168 #define ENCODE_SHIFT_OUT                                \
4169   do {                                                  \
4170     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4171     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4172   } while (0)
4173
4174
4175 #define ENCODE_LOCKING_SHIFT_2                          \
4176   do {                                                  \
4177     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4178     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4179   } while (0)
4180
4181
4182 #define ENCODE_LOCKING_SHIFT_3                          \
4183   do {                                                  \
4184     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4185     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4186   } while (0)
4187
4188
4189 /* Produce codes for a DIMENSION1 character whose character set is
4190    CHARSET and whose position-code is C1.  Designation and invocation
4191    sequences are also produced in advance if necessary.  */
4192
4193 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4194   do {                                                                  \
4195     int id = CHARSET_ID (charset);                                      \
4196                                                                         \
4197     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4198         && id == charset_ascii)                                         \
4199       {                                                                 \
4200         id = charset_jisx0201_roman;                                    \
4201         charset = CHARSET_FROM_ID (id);                                 \
4202       }                                                                 \
4203                                                                         \
4204     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4205       {                                                                 \
4206         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4207           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4208         else                                                            \
4209           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4210         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4211         break;                                                          \
4212       }                                                                 \
4213     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4214       {                                                                 \
4215         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4216         break;                                                          \
4217       }                                                                 \
4218     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4219       {                                                                 \
4220         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4221         break;                                                          \
4222       }                                                                 \
4223     else                                                                \
4224       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4225          must invoke it, or, at first, designate it to some graphic     \
4226          register.  Then repeat the loop to actually produce the        \
4227          character.  */                                                 \
4228       dst = encode_invocation_designation (charset, coding, dst,        \
4229                                            &produced_chars);            \
4230   } while (1)
4231
4232
4233 /* Produce codes for a DIMENSION2 character whose character set is
4234    CHARSET and whose position-codes are C1 and C2.  Designation and
4235    invocation codes are also produced in advance if necessary.  */
4236
4237 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4238   do {                                                                  \
4239     int id = CHARSET_ID (charset);                                      \
4240                                                                         \
4241     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4242         && id == charset_jisx0208)                                      \
4243       {                                                                 \
4244         id = charset_jisx0208_1978;                                     \
4245         charset = CHARSET_FROM_ID (id);                                 \
4246       }                                                                 \
4247                                                                         \
4248     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4249       {                                                                 \
4250         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4251           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4252         else                                                            \
4253           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4254         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4255         break;                                                          \
4256       }                                                                 \
4257     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4258       {                                                                 \
4259         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4260         break;                                                          \
4261       }                                                                 \
4262     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4263       {                                                                 \
4264         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4265         break;                                                          \
4266       }                                                                 \
4267     else                                                                \
4268       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4269          must invoke it, or, at first, designate it to some graphic     \
4270          register.  Then repeat the loop to actually produce the        \
4271          character.  */                                                 \
4272       dst = encode_invocation_designation (charset, coding, dst,        \
4273                                            &produced_chars);            \
4274   } while (1)
4275
4276
4277 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4278   do {                                                                     \
4279     unsigned code;                                                         \
4280     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4281                                                                            \
4282     if (CHARSET_DIMENSION (charset) == 1)                                  \
4283       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4284     else                                                                   \
4285       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4286   } while (0)
4287
4288
4289 /* Produce designation and invocation codes at a place pointed by DST
4290    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4291    Return new DST.  */
4292
4293 static unsigned char *
4294 encode_invocation_designation (struct charset *charset,
4295                                struct coding_system *coding,
4296                                unsigned char *dst, ptrdiff_t *p_nchars)
4297 {
4298   bool multibytep = coding->dst_multibyte;
4299   ptrdiff_t produced_chars = *p_nchars;
4300   int reg;                      /* graphic register number */
4301   int id = CHARSET_ID (charset);
4302
4303   /* At first, check designations.  */
4304   for (reg = 0; reg < 4; reg++)
4305     if (id == CODING_ISO_DESIGNATION (coding, reg))
4306       break;
4307
4308   if (reg >= 4)
4309     {
4310       /* CHARSET is not yet designated to any graphic registers.  */
4311       /* At first check the requested designation.  */
4312       reg = CODING_ISO_REQUEST (coding, id);
4313       if (reg < 0)
4314         /* Since CHARSET requests no special designation, designate it
4315            to graphic register 0.  */
4316         reg = 0;
4317
4318       ENCODE_DESIGNATION (charset, reg, coding);
4319     }
4320
4321   if (CODING_ISO_INVOCATION (coding, 0) != reg
4322       && CODING_ISO_INVOCATION (coding, 1) != reg)
4323     {
4324       /* Since the graphic register REG is not invoked to any graphic
4325          planes, invoke it to graphic plane 0.  */
4326       switch (reg)
4327         {
4328         case 0:                 /* graphic register 0 */
4329           ENCODE_SHIFT_IN;
4330           break;
4331
4332         case 1:                 /* graphic register 1 */
4333           ENCODE_SHIFT_OUT;
4334           break;
4335
4336         case 2:                 /* graphic register 2 */
4337           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4338             ENCODE_SINGLE_SHIFT_2;
4339           else
4340             ENCODE_LOCKING_SHIFT_2;
4341           break;
4342
4343         case 3:                 /* graphic register 3 */
4344           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4345             ENCODE_SINGLE_SHIFT_3;
4346           else
4347             ENCODE_LOCKING_SHIFT_3;
4348           break;
4349         }
4350     }
4351
4352   *p_nchars = produced_chars;
4353   return dst;
4354 }
4355
4356
4357 /* Produce codes for designation and invocation to reset the graphic
4358    planes and registers to initial state.  */
4359 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4360   do {                                                                  \
4361     int reg;                                                            \
4362     struct charset *charset;                                            \
4363                                                                         \
4364     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4365       ENCODE_SHIFT_IN;                                                  \
4366     for (reg = 0; reg < 4; reg++)                                       \
4367       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4368           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4369               != CODING_ISO_INITIAL (coding, reg)))                     \
4370         {                                                               \
4371           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4372           ENCODE_DESIGNATION (charset, reg, coding);                    \
4373         }                                                               \
4374   } while (0)
4375
4376
4377 /* Produce designation sequences of charsets in the line started from
4378    CHARBUF to a place pointed by DST, and return the number of
4379    produced bytes.  DST should not directly point a buffer text area
4380    which may be relocated by char_charset call.
4381
4382    If the current block ends before any end-of-line, we may fail to
4383    find all the necessary designations.  */
4384
4385 static ptrdiff_t
4386 encode_designation_at_bol (struct coding_system *coding,
4387                            int *charbuf, int *charbuf_end,
4388                            unsigned char *dst)
4389 {
4390   unsigned char *orig = dst;
4391   struct charset *charset;
4392   /* Table of charsets to be designated to each graphic register.  */
4393   int r[4];
4394   int c, found = 0, reg;
4395   ptrdiff_t produced_chars = 0;
4396   bool multibytep = coding->dst_multibyte;
4397   Lisp_Object attrs;
4398   Lisp_Object charset_list;
4399
4400   attrs = CODING_ID_ATTRS (coding->id);
4401   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4402   if (EQ (charset_list, Qiso_2022))
4403     charset_list = Viso_2022_charset_list;
4404
4405   for (reg = 0; reg < 4; reg++)
4406     r[reg] = -1;
4407
4408   while (charbuf < charbuf_end && found < 4)
4409     {
4410       int id;
4411
4412       c = *charbuf++;
4413       if (c == '\n')
4414         break;
4415       charset = char_charset (c, charset_list, NULL);
4416       id = CHARSET_ID (charset);
4417       reg = CODING_ISO_REQUEST (coding, id);
4418       if (reg >= 0 && r[reg] < 0)
4419         {
4420           found++;
4421           r[reg] = id;
4422         }
4423     }
4424
4425   if (found)
4426     {
4427       for (reg = 0; reg < 4; reg++)
4428         if (r[reg] >= 0
4429             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4430           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4431     }
4432
4433   return dst - orig;
4434 }
4435
4436 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4437
4438 static bool
4439 encode_coding_iso_2022 (struct coding_system *coding)
4440 {
4441   bool multibytep = coding->dst_multibyte;
4442   int *charbuf = coding->charbuf;
4443   int *charbuf_end = charbuf + coding->charbuf_used;
4444   unsigned char *dst = coding->destination + coding->produced;
4445   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4446   int safe_room = 16;
4447   bool bol_designation
4448     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4449        && CODING_ISO_BOL (coding));
4450   ptrdiff_t produced_chars = 0;
4451   Lisp_Object attrs, eol_type, charset_list;
4452   bool ascii_compatible;
4453   int c;
4454   int preferred_charset_id = -1;
4455
4456   CODING_GET_INFO (coding, attrs, charset_list);
4457   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4458   if (VECTORP (eol_type))
4459     eol_type = Qunix;
4460
4461   setup_iso_safe_charsets (attrs);
4462   /* Charset list may have been changed.  */
4463   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4464   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4465
4466   ascii_compatible
4467     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4468        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4469                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4470
4471   while (charbuf < charbuf_end)
4472     {
4473       ASSURE_DESTINATION (safe_room);
4474
4475       if (bol_designation)
4476         {
4477           /* We have to produce designation sequences if any now.  */
4478           unsigned char desig_buf[16];
4479           ptrdiff_t nbytes;
4480           ptrdiff_t offset;
4481
4482           charset_map_loaded = 0;
4483           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4484                                               desig_buf);
4485           if (charset_map_loaded
4486               && (offset = coding_change_destination (coding)))
4487             {
4488               dst += offset;
4489               dst_end += offset;
4490             }
4491           memcpy (dst, desig_buf, nbytes);
4492           dst += nbytes;
4493           /* We are sure that designation sequences are all ASCII bytes.  */
4494           produced_chars += nbytes;
4495           bol_designation = 0;
4496           ASSURE_DESTINATION (safe_room);
4497         }
4498
4499       c = *charbuf++;
4500
4501       if (c < 0)
4502         {
4503           /* Handle an annotation.  */
4504           switch (*charbuf)
4505             {
4506             case CODING_ANNOTATE_COMPOSITION_MASK:
4507               /* Not yet implemented.  */
4508               break;
4509             case CODING_ANNOTATE_CHARSET_MASK:
4510               preferred_charset_id = charbuf[2];
4511               if (preferred_charset_id >= 0
4512                   && NILP (Fmemq (make_number (preferred_charset_id),
4513                                   charset_list)))
4514                 preferred_charset_id = -1;
4515               break;
4516             default:
4517               emacs_abort ();
4518             }
4519           charbuf += -c - 1;
4520           continue;
4521         }
4522
4523       /* Now encode the character C.  */
4524       if (c < 0x20 || c == 0x7F)
4525         {
4526           if (c == '\n'
4527               || (c == '\r' && EQ (eol_type, Qmac)))
4528             {
4529               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4530                 ENCODE_RESET_PLANE_AND_REGISTER ();
4531               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4532                 {
4533                   int i;
4534
4535                   for (i = 0; i < 4; i++)
4536                     CODING_ISO_DESIGNATION (coding, i)
4537                       = CODING_ISO_INITIAL (coding, i);
4538                 }
4539               bol_designation = ((CODING_ISO_FLAGS (coding)
4540                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4541                                  != 0);
4542             }
4543           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4544             ENCODE_RESET_PLANE_AND_REGISTER ();
4545           EMIT_ONE_ASCII_BYTE (c);
4546         }
4547       else if (ASCII_CHAR_P (c))
4548         {
4549           if (ascii_compatible)
4550             EMIT_ONE_ASCII_BYTE (c);
4551           else
4552             {
4553               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4554               ENCODE_ISO_CHARACTER (charset, c);
4555             }
4556         }
4557       else if (CHAR_BYTE8_P (c))
4558         {
4559           c = CHAR_TO_BYTE8 (c);
4560           EMIT_ONE_BYTE (c);
4561         }
4562       else
4563         {
4564           struct charset *charset;
4565
4566           if (preferred_charset_id >= 0)
4567             {
4568               bool result;
4569
4570               charset = CHARSET_FROM_ID (preferred_charset_id);
4571               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4572               if (! result)
4573                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4574                                      NULL, charset);
4575             }
4576           else
4577             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4578                                  NULL, charset);
4579           if (!charset)
4580             {
4581               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4582                 {
4583                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4584                   charset = CHARSET_FROM_ID (charset_ascii);
4585                 }
4586               else
4587                 {
4588                   c = coding->default_char;
4589                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4590                                        charset_list, NULL, charset);
4591                 }
4592             }
4593           ENCODE_ISO_CHARACTER (charset, c);
4594         }
4595     }
4596
4597   if (coding->mode & CODING_MODE_LAST_BLOCK
4598       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4599     {
4600       ASSURE_DESTINATION (safe_room);
4601       ENCODE_RESET_PLANE_AND_REGISTER ();
4602     }
4603   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4604   CODING_ISO_BOL (coding) = bol_designation;
4605   coding->produced_char += produced_chars;
4606   coding->produced = dst - coding->destination;
4607   return 0;
4608 }
4609
4610 \f
4611 /*** 8,9. SJIS and BIG5 handlers ***/
4612
4613 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4614    quite widely.  So, for the moment, Emacs supports them in the bare
4615    C code.  But, in the future, they may be supported only by CCL.  */
4616
4617 /* SJIS is a coding system encoding three character sets: ASCII, right
4618    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4619    as is.  A character of charset katakana-jisx0201 is encoded by
4620    "position-code + 0x80".  A character of charset japanese-jisx0208
4621    is encoded in 2-byte but two position-codes are divided and shifted
4622    so that it fit in the range below.
4623
4624    --- CODE RANGE of SJIS ---
4625    (character set)      (range)
4626    ASCII                0x00 .. 0x7F
4627    KATAKANA-JISX0201    0xA0 .. 0xDF
4628    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4629             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4630    -------------------------------
4631
4632 */
4633
4634 /* BIG5 is a coding system encoding two character sets: ASCII and
4635    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4636    character set and is encoded in two-byte.
4637
4638    --- CODE RANGE of BIG5 ---
4639    (character set)      (range)
4640    ASCII                0x00 .. 0x7F
4641    Big5 (1st byte)      0xA1 .. 0xFE
4642         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4643    --------------------------
4644
4645   */
4646
4647 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4648    Return true if a text is encoded in SJIS.  */
4649
4650 static bool
4651 detect_coding_sjis (struct coding_system *coding,
4652                     struct coding_detection_info *detect_info)
4653 {
4654   const unsigned char *src = coding->source, *src_base;
4655   const unsigned char *src_end = coding->source + coding->src_bytes;
4656   bool multibytep = coding->src_multibyte;
4657   ptrdiff_t consumed_chars = 0;
4658   int found = 0;
4659   int c;
4660   Lisp_Object attrs, charset_list;
4661   int max_first_byte_of_2_byte_code;
4662
4663   CODING_GET_INFO (coding, attrs, charset_list);
4664   max_first_byte_of_2_byte_code
4665     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4666
4667   detect_info->checked |= CATEGORY_MASK_SJIS;
4668   /* A coding system of this category is always ASCII compatible.  */
4669   src += coding->head_ascii;
4670
4671   while (1)
4672     {
4673       src_base = src;
4674       ONE_MORE_BYTE (c);
4675       if (c < 0x80)
4676         continue;
4677       if ((c >= 0x81 && c <= 0x9F)
4678           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4679         {
4680           ONE_MORE_BYTE (c);
4681           if (c < 0x40 || c == 0x7F || c > 0xFC)
4682             break;
4683           found = CATEGORY_MASK_SJIS;
4684         }
4685       else if (c >= 0xA0 && c < 0xE0)
4686         found = CATEGORY_MASK_SJIS;
4687       else
4688         break;
4689     }
4690   detect_info->rejected |= CATEGORY_MASK_SJIS;
4691   return 0;
4692
4693  no_more_source:
4694   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4695     {
4696       detect_info->rejected |= CATEGORY_MASK_SJIS;
4697       return 0;
4698     }
4699   detect_info->found |= found;
4700   return 1;
4701 }
4702
4703 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4704    Return true if a text is encoded in BIG5.  */
4705
4706 static bool
4707 detect_coding_big5 (struct coding_system *coding,
4708                     struct coding_detection_info *detect_info)
4709 {
4710   const unsigned char *src = coding->source, *src_base;
4711   const unsigned char *src_end = coding->source + coding->src_bytes;
4712   bool multibytep = coding->src_multibyte;
4713   ptrdiff_t consumed_chars = 0;
4714   int found = 0;
4715   int c;
4716
4717   detect_info->checked |= CATEGORY_MASK_BIG5;
4718   /* A coding system of this category is always ASCII compatible.  */
4719   src += coding->head_ascii;
4720
4721   while (1)
4722     {
4723       src_base = src;
4724       ONE_MORE_BYTE (c);
4725       if (c < 0x80)
4726         continue;
4727       if (c >= 0xA1)
4728         {
4729           ONE_MORE_BYTE (c);
4730           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4731             return 0;
4732           found = CATEGORY_MASK_BIG5;
4733         }
4734       else
4735         break;
4736     }
4737   detect_info->rejected |= CATEGORY_MASK_BIG5;
4738   return 0;
4739
4740  no_more_source:
4741   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4742     {
4743       detect_info->rejected |= CATEGORY_MASK_BIG5;
4744       return 0;
4745     }
4746   detect_info->found |= found;
4747   return 1;
4748 }
4749
4750 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4751
4752 static void
4753 decode_coding_sjis (struct coding_system *coding)
4754 {
4755   const unsigned char *src = coding->source + coding->consumed;
4756   const unsigned char *src_end = coding->source + coding->src_bytes;
4757   const unsigned char *src_base;
4758   int *charbuf = coding->charbuf + coding->charbuf_used;
4759   /* We may produce one charset annotation in one loop and one more at
4760      the end.  */
4761   int *charbuf_end
4762     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4763   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4764   bool multibytep = coding->src_multibyte;
4765   struct charset *charset_roman, *charset_kanji, *charset_kana;
4766   struct charset *charset_kanji2;
4767   Lisp_Object attrs, charset_list, val;
4768   ptrdiff_t char_offset = coding->produced_char;
4769   ptrdiff_t last_offset = char_offset;
4770   int last_id = charset_ascii;
4771   bool eol_dos
4772     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4773   int byte_after_cr = -1;
4774
4775   CODING_GET_INFO (coding, attrs, charset_list);
4776
4777   val = charset_list;
4778   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4779   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4780   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4781   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4782
4783   while (1)
4784     {
4785       int c, c1;
4786       struct charset *charset;
4787
4788       src_base = src;
4789       consumed_chars_base = consumed_chars;
4790
4791       if (charbuf >= charbuf_end)
4792         {
4793           if (byte_after_cr >= 0)
4794             src_base--;
4795           break;
4796         }
4797
4798       if (byte_after_cr >= 0)
4799         c = byte_after_cr, byte_after_cr = -1;
4800       else
4801         ONE_MORE_BYTE (c);
4802       if (c < 0)
4803         goto invalid_code;
4804       if (c < 0x80)
4805         {
4806           if (eol_dos && c == '\r')
4807             ONE_MORE_BYTE (byte_after_cr);
4808           charset = charset_roman;
4809         }
4810       else if (c == 0x80 || c == 0xA0)
4811         goto invalid_code;
4812       else if (c >= 0xA1 && c <= 0xDF)
4813         {
4814           /* SJIS -> JISX0201-Kana */
4815           c &= 0x7F;
4816           charset = charset_kana;
4817         }
4818       else if (c <= 0xEF)
4819         {
4820           /* SJIS -> JISX0208 */
4821           ONE_MORE_BYTE (c1);
4822           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4823             goto invalid_code;
4824           c = (c << 8) | c1;
4825           SJIS_TO_JIS (c);
4826           charset = charset_kanji;
4827         }
4828       else if (c <= 0xFC && charset_kanji2)
4829         {
4830           /* SJIS -> JISX0213-2 */
4831           ONE_MORE_BYTE (c1);
4832           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4833             goto invalid_code;
4834           c = (c << 8) | c1;
4835           SJIS_TO_JIS2 (c);
4836           charset = charset_kanji2;
4837         }
4838       else
4839         goto invalid_code;
4840       if (charset->id != charset_ascii
4841           && last_id != charset->id)
4842         {
4843           if (last_id != charset_ascii)
4844             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4845           last_id = charset->id;
4846           last_offset = char_offset;
4847         }
4848       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4849       *charbuf++ = c;
4850       char_offset++;
4851       continue;
4852
4853     invalid_code:
4854       src = src_base;
4855       consumed_chars = consumed_chars_base;
4856       ONE_MORE_BYTE (c);
4857       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4858       char_offset++;
4859       coding->errors++;
4860     }
4861
4862  no_more_source:
4863   if (last_id != charset_ascii)
4864     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4865   coding->consumed_char += consumed_chars_base;
4866   coding->consumed = src_base - coding->source;
4867   coding->charbuf_used = charbuf - coding->charbuf;
4868 }
4869
4870 static void
4871 decode_coding_big5 (struct coding_system *coding)
4872 {
4873   const unsigned char *src = coding->source + coding->consumed;
4874   const unsigned char *src_end = coding->source + coding->src_bytes;
4875   const unsigned char *src_base;
4876   int *charbuf = coding->charbuf + coding->charbuf_used;
4877   /* We may produce one charset annotation in one loop and one more at
4878      the end.  */
4879   int *charbuf_end
4880     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4881   ptrdiff_t consumed_chars = 0, consumed_chars_base;
4882   bool multibytep = coding->src_multibyte;
4883   struct charset *charset_roman, *charset_big5;
4884   Lisp_Object attrs, charset_list, val;
4885   ptrdiff_t char_offset = coding->produced_char;
4886   ptrdiff_t last_offset = char_offset;
4887   int last_id = charset_ascii;
4888   bool eol_dos
4889     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4890   int byte_after_cr = -1;
4891
4892   CODING_GET_INFO (coding, attrs, charset_list);
4893   val = charset_list;
4894   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4895   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4896
4897   while (1)
4898     {
4899       int c, c1;
4900       struct charset *charset;
4901
4902       src_base = src;
4903       consumed_chars_base = consumed_chars;
4904
4905       if (charbuf >= charbuf_end)
4906         {
4907           if (byte_after_cr >= 0)
4908             src_base--;
4909           break;
4910         }
4911
4912       if (byte_after_cr >= 0)
4913         c = byte_after_cr, byte_after_cr = -1;
4914       else
4915         ONE_MORE_BYTE (c);
4916
4917       if (c < 0)
4918         goto invalid_code;
4919       if (c < 0x80)
4920         {
4921           if (eol_dos && c == '\r')
4922             ONE_MORE_BYTE (byte_after_cr);
4923           charset = charset_roman;
4924         }
4925       else
4926         {
4927           /* BIG5 -> Big5 */
4928           if (c < 0xA1 || c > 0xFE)
4929             goto invalid_code;
4930           ONE_MORE_BYTE (c1);
4931           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4932             goto invalid_code;
4933           c = c << 8 | c1;
4934           charset = charset_big5;
4935         }
4936       if (charset->id != charset_ascii
4937           && last_id != charset->id)
4938         {
4939           if (last_id != charset_ascii)
4940             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4941           last_id = charset->id;
4942           last_offset = char_offset;
4943         }
4944       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4945       *charbuf++ = c;
4946       char_offset++;
4947       continue;
4948
4949     invalid_code:
4950       src = src_base;
4951       consumed_chars = consumed_chars_base;
4952       ONE_MORE_BYTE (c);
4953       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4954       char_offset++;
4955       coding->errors++;
4956     }
4957
4958  no_more_source:
4959   if (last_id != charset_ascii)
4960     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4961   coding->consumed_char += consumed_chars_base;
4962   coding->consumed = src_base - coding->source;
4963   coding->charbuf_used = charbuf - coding->charbuf;
4964 }
4965
4966 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4967    This function can encode charsets `ascii', `katakana-jisx0201',
4968    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4969    are sure that all these charsets are registered as official charset
4970    (i.e. do not have extended leading-codes).  Characters of other
4971    charsets are produced without any encoding.  */
4972
4973 static bool
4974 encode_coding_sjis (struct coding_system *coding)
4975 {
4976   bool multibytep = coding->dst_multibyte;
4977   int *charbuf = coding->charbuf;
4978   int *charbuf_end = charbuf + coding->charbuf_used;
4979   unsigned char *dst = coding->destination + coding->produced;
4980   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4981   int safe_room = 4;
4982   ptrdiff_t produced_chars = 0;
4983   Lisp_Object attrs, charset_list, val;
4984   bool ascii_compatible;
4985   struct charset *charset_kanji, *charset_kana;
4986   struct charset *charset_kanji2;
4987   int c;
4988
4989   CODING_GET_INFO (coding, attrs, charset_list);
4990   val = XCDR (charset_list);
4991   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4992   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4993   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4994
4995   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4996
4997   while (charbuf < charbuf_end)
4998     {
4999       ASSURE_DESTINATION (safe_room);
5000       c = *charbuf++;
5001       /* Now encode the character C.  */
5002       if (ASCII_CHAR_P (c) && ascii_compatible)
5003         EMIT_ONE_ASCII_BYTE (c);
5004       else if (CHAR_BYTE8_P (c))
5005         {
5006           c = CHAR_TO_BYTE8 (c);
5007           EMIT_ONE_BYTE (c);
5008         }
5009       else
5010         {
5011           unsigned code;
5012           struct charset *charset;
5013           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5014                                &code, charset);
5015
5016           if (!charset)
5017             {
5018               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5019                 {
5020                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5021                   charset = CHARSET_FROM_ID (charset_ascii);
5022                 }
5023               else
5024                 {
5025                   c = coding->default_char;
5026                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5027                                        charset_list, &code, charset);
5028                 }
5029             }
5030           if (code == CHARSET_INVALID_CODE (charset))
5031             emacs_abort ();
5032           if (charset == charset_kanji)
5033             {
5034               int c1, c2;
5035               JIS_TO_SJIS (code);
5036               c1 = code >> 8, c2 = code & 0xFF;
5037               EMIT_TWO_BYTES (c1, c2);
5038             }
5039           else if (charset == charset_kana)
5040             EMIT_ONE_BYTE (code | 0x80);
5041           else if (charset_kanji2 && charset == charset_kanji2)
5042             {
5043               int c1, c2;
5044
5045               c1 = code >> 8;
5046               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5047                   || c1 == 0x28
5048                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5049                 {
5050                   JIS_TO_SJIS2 (code);
5051                   c1 = code >> 8, c2 = code & 0xFF;
5052                   EMIT_TWO_BYTES (c1, c2);
5053                 }
5054               else
5055                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5056             }
5057           else
5058             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5059         }
5060     }
5061   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5062   coding->produced_char += produced_chars;
5063   coding->produced = dst - coding->destination;
5064   return 0;
5065 }
5066
5067 static bool
5068 encode_coding_big5 (struct coding_system *coding)
5069 {
5070   bool multibytep = coding->dst_multibyte;
5071   int *charbuf = coding->charbuf;
5072   int *charbuf_end = charbuf + coding->charbuf_used;
5073   unsigned char *dst = coding->destination + coding->produced;
5074   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5075   int safe_room = 4;
5076   ptrdiff_t produced_chars = 0;
5077   Lisp_Object attrs, charset_list, val;
5078   bool ascii_compatible;
5079   struct charset *charset_big5;
5080   int c;
5081
5082   CODING_GET_INFO (coding, attrs, charset_list);
5083   val = XCDR (charset_list);
5084   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5085   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5086
5087   while (charbuf < charbuf_end)
5088     {
5089       ASSURE_DESTINATION (safe_room);
5090       c = *charbuf++;
5091       /* Now encode the character C.  */
5092       if (ASCII_CHAR_P (c) && ascii_compatible)
5093         EMIT_ONE_ASCII_BYTE (c);
5094       else if (CHAR_BYTE8_P (c))
5095         {
5096           c = CHAR_TO_BYTE8 (c);
5097           EMIT_ONE_BYTE (c);
5098         }
5099       else
5100         {
5101           unsigned code;
5102           struct charset *charset;
5103           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5104                                &code, charset);
5105
5106           if (! charset)
5107             {
5108               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5109                 {
5110                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5111                   charset = CHARSET_FROM_ID (charset_ascii);
5112                 }
5113               else
5114                 {
5115                   c = coding->default_char;
5116                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5117                                        charset_list, &code, charset);
5118                 }
5119             }
5120           if (code == CHARSET_INVALID_CODE (charset))
5121             emacs_abort ();
5122           if (charset == charset_big5)
5123             {
5124               int c1, c2;
5125
5126               c1 = code >> 8, c2 = code & 0xFF;
5127               EMIT_TWO_BYTES (c1, c2);
5128             }
5129           else
5130             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5131         }
5132     }
5133   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5134   coding->produced_char += produced_chars;
5135   coding->produced = dst - coding->destination;
5136   return 0;
5137 }
5138
5139 \f
5140 /*** 10. CCL handlers ***/
5141
5142 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5143    Return true if a text is encoded in a coding system of which
5144    encoder/decoder are written in CCL program.  */
5145
5146 static bool
5147 detect_coding_ccl (struct coding_system *coding,
5148                    struct coding_detection_info *detect_info)
5149 {
5150   const unsigned char *src = coding->source, *src_base;
5151   const unsigned char *src_end = coding->source + coding->src_bytes;
5152   bool multibytep = coding->src_multibyte;
5153   ptrdiff_t consumed_chars = 0;
5154   int found = 0;
5155   unsigned char *valids;
5156   ptrdiff_t head_ascii = coding->head_ascii;
5157   Lisp_Object attrs;
5158
5159   detect_info->checked |= CATEGORY_MASK_CCL;
5160
5161   coding = &coding_categories[coding_category_ccl];
5162   valids = CODING_CCL_VALIDS (coding);
5163   attrs = CODING_ID_ATTRS (coding->id);
5164   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5165     src += head_ascii;
5166
5167   while (1)
5168     {
5169       int c;
5170
5171       src_base = src;
5172       ONE_MORE_BYTE (c);
5173       if (c < 0 || ! valids[c])
5174         break;
5175       if ((valids[c] > 1))
5176         found = CATEGORY_MASK_CCL;
5177     }
5178   detect_info->rejected |= CATEGORY_MASK_CCL;
5179   return 0;
5180
5181  no_more_source:
5182   detect_info->found |= found;
5183   return 1;
5184 }
5185
5186 static void
5187 decode_coding_ccl (struct coding_system *coding)
5188 {
5189   const unsigned char *src = coding->source + coding->consumed;
5190   const unsigned char *src_end = coding->source + coding->src_bytes;
5191   int *charbuf = coding->charbuf + coding->charbuf_used;
5192   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5193   ptrdiff_t consumed_chars = 0;
5194   bool multibytep = coding->src_multibyte;
5195   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5196   int source_charbuf[1024];
5197   int source_byteidx[1025];
5198   Lisp_Object attrs, charset_list;
5199
5200   CODING_GET_INFO (coding, attrs, charset_list);
5201
5202   while (1)
5203     {
5204       const unsigned char *p = src;
5205       ptrdiff_t offset;
5206       int i = 0;
5207
5208       if (multibytep)
5209         {
5210           while (i < 1024 && p < src_end)
5211             {
5212               source_byteidx[i] = p - src;
5213               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5214             }
5215           source_byteidx[i] = p - src;
5216         }
5217       else
5218         while (i < 1024 && p < src_end)
5219           source_charbuf[i++] = *p++;
5220
5221       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5222         ccl->last_block = true;
5223       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5224       charset_map_loaded = 0;
5225       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5226                   charset_list);
5227       if (charset_map_loaded
5228           && (offset = coding_change_source (coding)))
5229         {
5230           p += offset;
5231           src += offset;
5232           src_end += offset;
5233         }
5234       charbuf += ccl->produced;
5235       if (multibytep)
5236         src += source_byteidx[ccl->consumed];
5237       else
5238         src += ccl->consumed;
5239       consumed_chars += ccl->consumed;
5240       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5241         break;
5242     }
5243
5244   switch (ccl->status)
5245     {
5246     case CCL_STAT_SUSPEND_BY_SRC:
5247       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5248       break;
5249     case CCL_STAT_SUSPEND_BY_DST:
5250       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5251       break;
5252     case CCL_STAT_QUIT:
5253     case CCL_STAT_INVALID_CMD:
5254       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5255       break;
5256     default:
5257       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5258       break;
5259     }
5260   coding->consumed_char += consumed_chars;
5261   coding->consumed = src - coding->source;
5262   coding->charbuf_used = charbuf - coding->charbuf;
5263 }
5264
5265 static bool
5266 encode_coding_ccl (struct coding_system *coding)
5267 {
5268   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5269   bool multibytep = coding->dst_multibyte;
5270   int *charbuf = coding->charbuf;
5271   int *charbuf_end = charbuf + coding->charbuf_used;
5272   unsigned char *dst = coding->destination + coding->produced;
5273   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5274   int destination_charbuf[1024];
5275   ptrdiff_t produced_chars = 0;
5276   int i;
5277   Lisp_Object attrs, charset_list;
5278
5279   CODING_GET_INFO (coding, attrs, charset_list);
5280   if (coding->consumed_char == coding->src_chars
5281       && coding->mode & CODING_MODE_LAST_BLOCK)
5282     ccl->last_block = true;
5283
5284   do
5285     {
5286       ptrdiff_t offset;
5287
5288       /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
5289       charset_map_loaded = 0;
5290       ccl_driver (ccl, charbuf, destination_charbuf,
5291                   charbuf_end - charbuf, 1024, charset_list);
5292       if (charset_map_loaded
5293           && (offset = coding_change_destination (coding)))
5294         dst += offset;
5295       if (multibytep)
5296         {
5297           ASSURE_DESTINATION (ccl->produced * 2);
5298           for (i = 0; i < ccl->produced; i++)
5299             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5300         }
5301       else
5302         {
5303           ASSURE_DESTINATION (ccl->produced);
5304           for (i = 0; i < ccl->produced; i++)
5305             *dst++ = destination_charbuf[i] & 0xFF;
5306           produced_chars += ccl->produced;
5307         }
5308       charbuf += ccl->consumed;
5309       if (ccl->status == CCL_STAT_QUIT
5310           || ccl->status == CCL_STAT_INVALID_CMD)
5311         break;
5312     }
5313   while (charbuf < charbuf_end);
5314
5315   switch (ccl->status)
5316     {
5317     case CCL_STAT_SUSPEND_BY_SRC:
5318       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5319       break;
5320     case CCL_STAT_SUSPEND_BY_DST:
5321       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5322       break;
5323     case CCL_STAT_QUIT:
5324     case CCL_STAT_INVALID_CMD:
5325       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5326       break;
5327     default:
5328       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5329       break;
5330     }
5331
5332   coding->produced_char += produced_chars;
5333   coding->produced = dst - coding->destination;
5334   return 0;
5335 }
5336
5337 \f
5338 /*** 10, 11. no-conversion handlers ***/
5339
5340 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5341
5342 static void
5343 decode_coding_raw_text (struct coding_system *coding)
5344 {
5345   bool eol_dos
5346     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5347
5348   coding->chars_at_source = 1;
5349   coding->consumed_char = coding->src_chars;
5350   coding->consumed = coding->src_bytes;
5351   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5352     {
5353       coding->consumed_char--;
5354       coding->consumed--;
5355       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5356     }
5357   else
5358     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5359 }
5360
5361 static bool
5362 encode_coding_raw_text (struct coding_system *coding)
5363 {
5364   bool multibytep = coding->dst_multibyte;
5365   int *charbuf = coding->charbuf;
5366   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5367   unsigned char *dst = coding->destination + coding->produced;
5368   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5369   ptrdiff_t produced_chars = 0;
5370   int c;
5371
5372   if (multibytep)
5373     {
5374       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5375
5376       if (coding->src_multibyte)
5377         while (charbuf < charbuf_end)
5378           {
5379             ASSURE_DESTINATION (safe_room);
5380             c = *charbuf++;
5381             if (ASCII_CHAR_P (c))
5382               EMIT_ONE_ASCII_BYTE (c);
5383             else if (CHAR_BYTE8_P (c))
5384               {
5385                 c = CHAR_TO_BYTE8 (c);
5386                 EMIT_ONE_BYTE (c);
5387               }
5388             else
5389               {
5390                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5391
5392                 CHAR_STRING_ADVANCE (c, p1);
5393                 do
5394                   {
5395                     EMIT_ONE_BYTE (*p0);
5396                     p0++;
5397                   }
5398                 while (p0 < p1);
5399               }
5400           }
5401       else
5402         while (charbuf < charbuf_end)
5403           {
5404             ASSURE_DESTINATION (safe_room);
5405             c = *charbuf++;
5406             EMIT_ONE_BYTE (c);
5407           }
5408     }
5409   else
5410     {
5411       if (coding->src_multibyte)
5412         {
5413           int safe_room = MAX_MULTIBYTE_LENGTH;
5414
5415           while (charbuf < charbuf_end)
5416             {
5417               ASSURE_DESTINATION (safe_room);
5418               c = *charbuf++;
5419               if (ASCII_CHAR_P (c))
5420                 *dst++ = c;
5421               else if (CHAR_BYTE8_P (c))
5422                 *dst++ = CHAR_TO_BYTE8 (c);
5423               else
5424                 CHAR_STRING_ADVANCE (c, dst);
5425             }
5426         }
5427       else
5428         {
5429           ASSURE_DESTINATION (charbuf_end - charbuf);
5430           while (charbuf < charbuf_end && dst < dst_end)
5431             *dst++ = *charbuf++;
5432         }
5433       produced_chars = dst - (coding->destination + coding->produced);
5434     }
5435   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5436   coding->produced_char += produced_chars;
5437   coding->produced = dst - coding->destination;
5438   return 0;
5439 }
5440
5441 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5442    Return true if a text is encoded in a charset-based coding system.  */
5443
5444 static bool
5445 detect_coding_charset (struct coding_system *coding,
5446                        struct coding_detection_info *detect_info)
5447 {
5448   const unsigned char *src = coding->source, *src_base;
5449   const unsigned char *src_end = coding->source + coding->src_bytes;
5450   bool multibytep = coding->src_multibyte;
5451   ptrdiff_t consumed_chars = 0;
5452   Lisp_Object attrs, valids, name;
5453   int found = 0;
5454   ptrdiff_t head_ascii = coding->head_ascii;
5455   bool check_latin_extra = 0;
5456
5457   detect_info->checked |= CATEGORY_MASK_CHARSET;
5458
5459   coding = &coding_categories[coding_category_charset];
5460   attrs = CODING_ID_ATTRS (coding->id);
5461   valids = AREF (attrs, coding_attr_charset_valids);
5462   name = CODING_ID_NAME (coding->id);
5463   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5464                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5465       || strncmp (SSDATA (SYMBOL_NAME (name)),
5466                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5467     check_latin_extra = 1;
5468
5469   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5470     src += head_ascii;
5471
5472   while (1)
5473     {
5474       int c;
5475       Lisp_Object val;
5476       struct charset *charset;
5477       int dim, idx;
5478
5479       src_base = src;
5480       ONE_MORE_BYTE (c);
5481       if (c < 0)
5482         continue;
5483       val = AREF (valids, c);
5484       if (NILP (val))
5485         break;
5486       if (c >= 0x80)
5487         {
5488           if (c < 0xA0
5489               && check_latin_extra
5490               && (!VECTORP (Vlatin_extra_code_table)
5491                   || NILP (AREF (Vlatin_extra_code_table, c))))
5492             break;
5493           found = CATEGORY_MASK_CHARSET;
5494         }
5495       if (INTEGERP (val))
5496         {
5497           charset = CHARSET_FROM_ID (XFASTINT (val));
5498           dim = CHARSET_DIMENSION (charset);
5499           for (idx = 1; idx < dim; idx++)
5500             {
5501               if (src == src_end)
5502                 goto too_short;
5503               ONE_MORE_BYTE (c);
5504               if (c < charset->code_space[(dim - 1 - idx) * 4]
5505                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5506                 break;
5507             }
5508           if (idx < dim)
5509             break;
5510         }
5511       else
5512         {
5513           idx = 1;
5514           for (; CONSP (val); val = XCDR (val))
5515             {
5516               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5517               dim = CHARSET_DIMENSION (charset);
5518               while (idx < dim)
5519                 {
5520                   if (src == src_end)
5521                     goto too_short;
5522                   ONE_MORE_BYTE (c);
5523                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5524                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5525                     break;
5526                   idx++;
5527                 }
5528               if (idx == dim)
5529                 {
5530                   val = Qnil;
5531                   break;
5532                 }
5533             }
5534           if (CONSP (val))
5535             break;
5536         }
5537     }
5538  too_short:
5539   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5540   return 0;
5541
5542  no_more_source:
5543   detect_info->found |= found;
5544   return 1;
5545 }
5546
5547 static void
5548 decode_coding_charset (struct coding_system *coding)
5549 {
5550   const unsigned char *src = coding->source + coding->consumed;
5551   const unsigned char *src_end = coding->source + coding->src_bytes;
5552   const unsigned char *src_base;
5553   int *charbuf = coding->charbuf + coding->charbuf_used;
5554   /* We may produce one charset annotation in one loop and one more at
5555      the end.  */
5556   int *charbuf_end
5557     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5558   ptrdiff_t consumed_chars = 0, consumed_chars_base;
5559   bool multibytep = coding->src_multibyte;
5560   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
5561   Lisp_Object valids;
5562   ptrdiff_t char_offset = coding->produced_char;
5563   ptrdiff_t last_offset = char_offset;
5564   int last_id = charset_ascii;
5565   bool eol_dos
5566     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5567   int byte_after_cr = -1;
5568
5569   valids = AREF (attrs, coding_attr_charset_valids);
5570
5571   while (1)
5572     {
5573       int c;
5574       Lisp_Object val;
5575       struct charset *charset;
5576       int dim;
5577       int len = 1;
5578       unsigned code;
5579
5580       src_base = src;
5581       consumed_chars_base = consumed_chars;
5582
5583       if (charbuf >= charbuf_end)
5584         {
5585           if (byte_after_cr >= 0)
5586             src_base--;
5587           break;
5588         }
5589
5590       if (byte_after_cr >= 0)
5591         {
5592           c = byte_after_cr;
5593           byte_after_cr = -1;
5594         }
5595       else
5596         {
5597           ONE_MORE_BYTE (c);
5598           if (eol_dos && c == '\r')
5599             ONE_MORE_BYTE (byte_after_cr);
5600         }
5601       if (c < 0)
5602         goto invalid_code;
5603       code = c;
5604
5605       val = AREF (valids, c);
5606       if (! INTEGERP (val) && ! CONSP (val))
5607         goto invalid_code;
5608       if (INTEGERP (val))
5609         {
5610           charset = CHARSET_FROM_ID (XFASTINT (val));
5611           dim = CHARSET_DIMENSION (charset);
5612           while (len < dim)
5613             {
5614               ONE_MORE_BYTE (c);
5615               code = (code << 8) | c;
5616               len++;
5617             }
5618           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5619                               charset, code, c);
5620         }
5621       else
5622         {
5623           /* VAL is a list of charset IDs.  It is assured that the
5624              list is sorted by charset dimensions (smaller one
5625              comes first).  */
5626           while (CONSP (val))
5627             {
5628               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5629               dim = CHARSET_DIMENSION (charset);
5630               while (len < dim)
5631                 {
5632                   ONE_MORE_BYTE (c);
5633                   code = (code << 8) | c;
5634                   len++;
5635                 }
5636               CODING_DECODE_CHAR (coding, src, src_base,
5637                                   src_end, charset, code, c);
5638               if (c >= 0)
5639                 break;
5640               val = XCDR (val);
5641             }
5642         }
5643       if (c < 0)
5644         goto invalid_code;
5645       if (charset->id != charset_ascii
5646           && last_id != charset->id)
5647         {
5648           if (last_id != charset_ascii)
5649             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5650           last_id = charset->id;
5651           last_offset = char_offset;
5652         }
5653
5654       *charbuf++ = c;
5655       char_offset++;
5656       continue;
5657
5658     invalid_code:
5659       src = src_base;
5660       consumed_chars = consumed_chars_base;
5661       ONE_MORE_BYTE (c);
5662       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5663       char_offset++;
5664       coding->errors++;
5665     }
5666
5667  no_more_source:
5668   if (last_id != charset_ascii)
5669     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5670   coding->consumed_char += consumed_chars_base;
5671   coding->consumed = src_base - coding->source;
5672   coding->charbuf_used = charbuf - coding->charbuf;
5673 }
5674
5675 static bool
5676 encode_coding_charset (struct coding_system *coding)
5677 {
5678   bool multibytep = coding->dst_multibyte;
5679   int *charbuf = coding->charbuf;
5680   int *charbuf_end = charbuf + coding->charbuf_used;
5681   unsigned char *dst = coding->destination + coding->produced;
5682   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5683   int safe_room = MAX_MULTIBYTE_LENGTH;
5684   ptrdiff_t produced_chars = 0;
5685   Lisp_Object attrs, charset_list;
5686   bool ascii_compatible;
5687   int c;
5688
5689   CODING_GET_INFO (coding, attrs, charset_list);
5690   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5691
5692   while (charbuf < charbuf_end)
5693     {
5694       struct charset *charset;
5695       unsigned code;
5696
5697       ASSURE_DESTINATION (safe_room);
5698       c = *charbuf++;
5699       if (ascii_compatible && ASCII_CHAR_P (c))
5700         EMIT_ONE_ASCII_BYTE (c);
5701       else if (CHAR_BYTE8_P (c))
5702         {
5703           c = CHAR_TO_BYTE8 (c);
5704           EMIT_ONE_BYTE (c);
5705         }
5706       else
5707         {
5708           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5709                                &code, charset);
5710
5711           if (charset)
5712             {
5713               if (CHARSET_DIMENSION (charset) == 1)
5714                 EMIT_ONE_BYTE (code);
5715               else if (CHARSET_DIMENSION (charset) == 2)
5716                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5717               else if (CHARSET_DIMENSION (charset) == 3)
5718                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5719               else
5720                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5721                                  (code >> 8) & 0xFF, code & 0xFF);
5722             }
5723           else
5724             {
5725               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5726                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5727               else
5728                 c = coding->default_char;
5729               EMIT_ONE_BYTE (c);
5730             }
5731         }
5732     }
5733
5734   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5735   coding->produced_char += produced_chars;
5736   coding->produced = dst - coding->destination;
5737   return 0;
5738 }
5739
5740 \f
5741 /*** 7. C library functions ***/
5742
5743 /* Setup coding context CODING from information about CODING_SYSTEM.
5744    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5745    CODING_SYSTEM is invalid, signal an error.  */
5746
5747 void
5748 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5749 {
5750   Lisp_Object attrs;
5751   Lisp_Object eol_type;
5752   Lisp_Object coding_type;
5753   Lisp_Object val;
5754
5755   if (NILP (coding_system))
5756     coding_system = Qundecided;
5757
5758   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5759
5760   attrs = CODING_ID_ATTRS (coding->id);
5761   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5762
5763   coding->mode = 0;
5764   if (VECTORP (eol_type))
5765     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5766                             | CODING_REQUIRE_DETECTION_MASK);
5767   else if (! EQ (eol_type, Qunix))
5768     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5769                             | CODING_REQUIRE_ENCODING_MASK);
5770   else
5771     coding->common_flags = 0;
5772   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5773     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5774   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5775     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5776   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5777     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5778
5779   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5780   coding->max_charset_id = SCHARS (val) - 1;
5781   coding->safe_charsets = SDATA (val);
5782   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5783   coding->carryover_bytes = 0;
5784   coding->raw_destination = 0;
5785
5786   coding_type = CODING_ATTR_TYPE (attrs);
5787   if (EQ (coding_type, Qundecided))
5788     {
5789       coding->detector = NULL;
5790       coding->decoder = decode_coding_raw_text;
5791       coding->encoder = encode_coding_raw_text;
5792       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5793       coding->spec.undecided.inhibit_nbd
5794         = (encode_inhibit_flag
5795            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5796       coding->spec.undecided.inhibit_ied
5797         = (encode_inhibit_flag
5798            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5799       coding->spec.undecided.prefer_utf_8
5800         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5801     }
5802   else if (EQ (coding_type, Qiso_2022))
5803     {
5804       int i;
5805       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5806
5807       /* Invoke graphic register 0 to plane 0.  */
5808       CODING_ISO_INVOCATION (coding, 0) = 0;
5809       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5810       CODING_ISO_INVOCATION (coding, 1)
5811         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5812       /* Setup the initial status of designation.  */
5813       for (i = 0; i < 4; i++)
5814         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5815       /* Not single shifting initially.  */
5816       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5817       /* Beginning of buffer should also be regarded as bol. */
5818       CODING_ISO_BOL (coding) = 1;
5819       coding->detector = detect_coding_iso_2022;
5820       coding->decoder = decode_coding_iso_2022;
5821       coding->encoder = encode_coding_iso_2022;
5822       if (flags & CODING_ISO_FLAG_SAFE)
5823         coding->mode |= CODING_MODE_SAFE_ENCODING;
5824       coding->common_flags
5825         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5826             | CODING_REQUIRE_FLUSHING_MASK);
5827       if (flags & CODING_ISO_FLAG_COMPOSITION)
5828         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5829       if (flags & CODING_ISO_FLAG_DESIGNATION)
5830         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5831       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5832         {
5833           setup_iso_safe_charsets (attrs);
5834           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5835           coding->max_charset_id = SCHARS (val) - 1;
5836           coding->safe_charsets = SDATA (val);
5837         }
5838       CODING_ISO_FLAGS (coding) = flags;
5839       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5840       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5841       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5842       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5843     }
5844   else if (EQ (coding_type, Qcharset))
5845     {
5846       coding->detector = detect_coding_charset;
5847       coding->decoder = decode_coding_charset;
5848       coding->encoder = encode_coding_charset;
5849       coding->common_flags
5850         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5851     }
5852   else if (EQ (coding_type, Qutf_8))
5853     {
5854       val = AREF (attrs, coding_attr_utf_bom);
5855       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5856                                    : EQ (val, Qt) ? utf_with_bom
5857                                    : utf_without_bom);
5858       coding->detector = detect_coding_utf_8;
5859       coding->decoder = decode_coding_utf_8;
5860       coding->encoder = encode_coding_utf_8;
5861       coding->common_flags
5862         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5863       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5864         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5865     }
5866   else if (EQ (coding_type, Qutf_16))
5867     {
5868       val = AREF (attrs, coding_attr_utf_bom);
5869       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5870                                     : EQ (val, Qt) ? utf_with_bom
5871                                     : utf_without_bom);
5872       val = AREF (attrs, coding_attr_utf_16_endian);
5873       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5874                                        : utf_16_little_endian);
5875       CODING_UTF_16_SURROGATE (coding) = 0;
5876       coding->detector = detect_coding_utf_16;
5877       coding->decoder = decode_coding_utf_16;
5878       coding->encoder = encode_coding_utf_16;
5879       coding->common_flags
5880         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5881       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5882         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5883     }
5884   else if (EQ (coding_type, Qccl))
5885     {
5886       coding->detector = detect_coding_ccl;
5887       coding->decoder = decode_coding_ccl;
5888       coding->encoder = encode_coding_ccl;
5889       coding->common_flags
5890         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5891             | CODING_REQUIRE_FLUSHING_MASK);
5892     }
5893   else if (EQ (coding_type, Qemacs_mule))
5894     {
5895       coding->detector = detect_coding_emacs_mule;
5896       coding->decoder = decode_coding_emacs_mule;
5897       coding->encoder = encode_coding_emacs_mule;
5898       coding->common_flags
5899         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5900       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5901           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5902         {
5903           Lisp_Object tail, safe_charsets;
5904           int max_charset_id = 0;
5905
5906           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5907                tail = XCDR (tail))
5908             if (max_charset_id < XFASTINT (XCAR (tail)))
5909               max_charset_id = XFASTINT (XCAR (tail));
5910           safe_charsets = make_uninit_string (max_charset_id + 1);
5911           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5912           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5913                tail = XCDR (tail))
5914             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5915           coding->max_charset_id = max_charset_id;
5916           coding->safe_charsets = SDATA (safe_charsets);
5917         }
5918       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5919       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5920     }
5921   else if (EQ (coding_type, Qshift_jis))
5922     {
5923       coding->detector = detect_coding_sjis;
5924       coding->decoder = decode_coding_sjis;
5925       coding->encoder = encode_coding_sjis;
5926       coding->common_flags
5927         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5928     }
5929   else if (EQ (coding_type, Qbig5))
5930     {
5931       coding->detector = detect_coding_big5;
5932       coding->decoder = decode_coding_big5;
5933       coding->encoder = encode_coding_big5;
5934       coding->common_flags
5935         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5936     }
5937   else                          /* EQ (coding_type, Qraw_text) */
5938     {
5939       coding->detector = NULL;
5940       coding->decoder = decode_coding_raw_text;
5941       coding->encoder = encode_coding_raw_text;
5942       if (! EQ (eol_type, Qunix))
5943         {
5944           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5945           if (! VECTORP (eol_type))
5946             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5947         }
5948
5949     }
5950
5951   return;
5952 }
5953
5954 /* Return a list of charsets supported by CODING.  */
5955
5956 Lisp_Object
5957 coding_charset_list (struct coding_system *coding)
5958 {
5959   Lisp_Object attrs, charset_list;
5960
5961   CODING_GET_INFO (coding, attrs, charset_list);
5962   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5963     {
5964       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5965
5966       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5967         charset_list = Viso_2022_charset_list;
5968     }
5969   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5970     {
5971       charset_list = Vemacs_mule_charset_list;
5972     }
5973   return charset_list;
5974 }
5975
5976
5977 /* Return a list of charsets supported by CODING-SYSTEM.  */
5978
5979 Lisp_Object
5980 coding_system_charset_list (Lisp_Object coding_system)
5981 {
5982   ptrdiff_t id;
5983   Lisp_Object attrs, charset_list;
5984
5985   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5986   attrs = CODING_ID_ATTRS (id);
5987
5988   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5989     {
5990       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5991
5992       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5993         charset_list = Viso_2022_charset_list;
5994       else
5995         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5996     }
5997   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5998     {
5999       charset_list = Vemacs_mule_charset_list;
6000     }
6001   else
6002     {
6003       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6004     }
6005   return charset_list;
6006 }
6007
6008
6009 /* Return raw-text or one of its subsidiaries that has the same
6010    eol_type as CODING-SYSTEM.  */
6011
6012 Lisp_Object
6013 raw_text_coding_system (Lisp_Object coding_system)
6014 {
6015   Lisp_Object spec, attrs;
6016   Lisp_Object eol_type, raw_text_eol_type;
6017
6018   if (NILP (coding_system))
6019     return Qraw_text;
6020   spec = CODING_SYSTEM_SPEC (coding_system);
6021   attrs = AREF (spec, 0);
6022
6023   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6024     return coding_system;
6025
6026   eol_type = AREF (spec, 2);
6027   if (VECTORP (eol_type))
6028     return Qraw_text;
6029   spec = CODING_SYSTEM_SPEC (Qraw_text);
6030   raw_text_eol_type = AREF (spec, 2);
6031   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6032           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6033           : AREF (raw_text_eol_type, 2));
6034 }
6035
6036
6037 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6038    the subsidiary that has the same eol-spec as PARENT (if it is not
6039    nil and specifies end-of-line format) or the system's setting
6040    (system_eol_type).  */
6041
6042 Lisp_Object
6043 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6044 {
6045   Lisp_Object spec, eol_type;
6046
6047   if (NILP (coding_system))
6048     coding_system = Qraw_text;
6049   spec = CODING_SYSTEM_SPEC (coding_system);
6050   eol_type = AREF (spec, 2);
6051   if (VECTORP (eol_type))
6052     {
6053       Lisp_Object parent_eol_type;
6054
6055       if (! NILP (parent))
6056         {
6057           Lisp_Object parent_spec;
6058
6059           parent_spec = CODING_SYSTEM_SPEC (parent);
6060           parent_eol_type = AREF (parent_spec, 2);
6061           if (VECTORP (parent_eol_type))
6062             parent_eol_type = system_eol_type;
6063         }
6064       else
6065         parent_eol_type = system_eol_type;
6066       if (EQ (parent_eol_type, Qunix))
6067         coding_system = AREF (eol_type, 0);
6068       else if (EQ (parent_eol_type, Qdos))
6069         coding_system = AREF (eol_type, 1);
6070       else if (EQ (parent_eol_type, Qmac))
6071         coding_system = AREF (eol_type, 2);
6072     }
6073   return coding_system;
6074 }
6075
6076
6077 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6078    decided for writing to a process.  If not, complement them, and
6079    return a new coding system.  */
6080
6081 Lisp_Object
6082 complement_process_encoding_system (Lisp_Object coding_system)
6083 {
6084   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6085   Lisp_Object spec, attrs;
6086   int i;
6087
6088   for (i = 0; i < 3; i++)
6089     {
6090       if (i == 1)
6091         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6092       else if (i == 2)
6093         coding_system = preferred_coding_system ();
6094       spec = CODING_SYSTEM_SPEC (coding_system);
6095       if (NILP (spec))
6096         continue;
6097       attrs = AREF (spec, 0);
6098       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6099         coding_base = CODING_ATTR_BASE_NAME (attrs);
6100       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6101         eol_base = coding_system;
6102       if (! NILP (coding_base) && ! NILP (eol_base))
6103         break;
6104     }
6105
6106   if (i > 0)
6107     /* The original CODING_SYSTEM didn't specify text-conversion or
6108        eol-conversion.  Be sure that we return a fully complemented
6109        coding system.  */
6110     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6111   return coding_system;
6112 }
6113
6114
6115 /* Emacs has a mechanism to automatically detect a coding system if it
6116    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6117    it's impossible to distinguish some coding systems accurately
6118    because they use the same range of codes.  So, at first, coding
6119    systems are categorized into 7, those are:
6120
6121    o coding-category-emacs-mule
6122
6123         The category for a coding system which has the same code range
6124         as Emacs' internal format.  Assigned the coding-system (Lisp
6125         symbol) `emacs-mule' by default.
6126
6127    o coding-category-sjis
6128
6129         The category for a coding system which has the same code range
6130         as SJIS.  Assigned the coding-system (Lisp
6131         symbol) `japanese-shift-jis' by default.
6132
6133    o coding-category-iso-7
6134
6135         The category for a coding system which has the same code range
6136         as ISO2022 of 7-bit environment.  This doesn't use any locking
6137         shift and single shift functions.  This can encode/decode all
6138         charsets.  Assigned the coding-system (Lisp symbol)
6139         `iso-2022-7bit' by default.
6140
6141    o coding-category-iso-7-tight
6142
6143         Same as coding-category-iso-7 except that this can
6144         encode/decode only the specified charsets.
6145
6146    o coding-category-iso-8-1
6147
6148         The category for a coding system which has the same code range
6149         as ISO2022 of 8-bit environment and graphic plane 1 used only
6150         for DIMENSION1 charset.  This doesn't use any locking shift
6151         and single shift functions.  Assigned the coding-system (Lisp
6152         symbol) `iso-latin-1' by default.
6153
6154    o coding-category-iso-8-2
6155
6156         The category for a coding system which has the same code range
6157         as ISO2022 of 8-bit environment and graphic plane 1 used only
6158         for DIMENSION2 charset.  This doesn't use any locking shift
6159         and single shift functions.  Assigned the coding-system (Lisp
6160         symbol) `japanese-iso-8bit' by default.
6161
6162    o coding-category-iso-7-else
6163
6164         The category for a coding system which has the same code range
6165         as ISO2022 of 7-bit environment but uses locking shift or
6166         single shift functions.  Assigned the coding-system (Lisp
6167         symbol) `iso-2022-7bit-lock' by default.
6168
6169    o coding-category-iso-8-else
6170
6171         The category for a coding system which has the same code range
6172         as ISO2022 of 8-bit environment but uses locking shift or
6173         single shift functions.  Assigned the coding-system (Lisp
6174         symbol) `iso-2022-8bit-ss2' by default.
6175
6176    o coding-category-big5
6177
6178         The category for a coding system which has the same code range
6179         as BIG5.  Assigned the coding-system (Lisp symbol)
6180         `cn-big5' by default.
6181
6182    o coding-category-utf-8
6183
6184         The category for a coding system which has the same code range
6185         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6186         symbol) `utf-8' by default.
6187
6188    o coding-category-utf-16-be
6189
6190         The category for a coding system in which a text has an
6191         Unicode signature (cf. Unicode Standard) in the order of BIG
6192         endian at the head.  Assigned the coding-system (Lisp symbol)
6193         `utf-16-be' by default.
6194
6195    o coding-category-utf-16-le
6196
6197         The category for a coding system in which a text has an
6198         Unicode signature (cf. Unicode Standard) in the order of
6199         LITTLE endian at the head.  Assigned the coding-system (Lisp
6200         symbol) `utf-16-le' by default.
6201
6202    o coding-category-ccl
6203
6204         The category for a coding system of which encoder/decoder is
6205         written in CCL programs.  The default value is nil, i.e., no
6206         coding system is assigned.
6207
6208    o coding-category-binary
6209
6210         The category for a coding system not categorized in any of the
6211         above.  Assigned the coding-system (Lisp symbol)
6212         `no-conversion' by default.
6213
6214    Each of them is a Lisp symbol and the value is an actual
6215    `coding-system's (this is also a Lisp symbol) assigned by a user.
6216    What Emacs does actually is to detect a category of coding system.
6217    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6218    decide only one possible category, it selects a category of the
6219    highest priority.  Priorities of categories are also specified by a
6220    user in a Lisp variable `coding-category-list'.
6221
6222 */
6223
6224 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6225                                            int eol_seen);
6226
6227
6228 /* Return the number of ASCII characters at the head of the source.
6229    By side effects, set coding->head_ascii and update
6230    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6231    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6232    reliable only when all the source bytes are ASCII.  */
6233
6234 static ptrdiff_t
6235 check_ascii (struct coding_system *coding)
6236 {
6237   const unsigned char *src, *end;
6238   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6239   int eol_seen = coding->eol_seen;
6240
6241   coding_set_source (coding);
6242   src = coding->source;
6243   end = src + coding->src_bytes;
6244
6245   if (inhibit_eol_conversion
6246       || SYMBOLP (eol_type))
6247     {
6248       /* We don't have to check EOL format.  */
6249       while (src < end && !( *src & 0x80))
6250         {
6251           if (*src++ == '\n')
6252             eol_seen |= EOL_SEEN_LF;
6253         }
6254     }
6255   else
6256     {
6257       end--;                /* We look ahead one byte for "CR LF".  */
6258       while (src < end)
6259         {
6260           int c = *src;
6261
6262           if (c & 0x80)
6263             break;
6264           src++;
6265           if (c == '\r')
6266             {
6267               if (*src == '\n')
6268                 {
6269                   eol_seen |= EOL_SEEN_CRLF;
6270                   src++;
6271                 }
6272               else
6273                 eol_seen |= EOL_SEEN_CR;
6274             }
6275           else if (c == '\n')
6276             eol_seen |= EOL_SEEN_LF;
6277         }
6278       if (src == end)
6279         {
6280           int c = *src;
6281
6282           /* All bytes but the last one C are ASCII.  */
6283           if (! (c & 0x80))
6284             {
6285               if (c == '\r')
6286                 eol_seen |= EOL_SEEN_CR;
6287               else if (c  == '\n')
6288                 eol_seen |= EOL_SEEN_LF;
6289               src++;
6290             }
6291         }
6292     }
6293   coding->head_ascii = src - coding->source;
6294   coding->eol_seen = eol_seen;
6295   return (coding->head_ascii);
6296 }
6297
6298
6299 /* Return the number of characters at the source if all the bytes are
6300    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6301    effects, update coding->eol_seen.  The value of coding->eol_seen is
6302    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6303    the value is reliable only when all the source bytes are valid
6304    UTF-8.  */
6305
6306 static ptrdiff_t
6307 check_utf_8 (struct coding_system *coding)
6308 {
6309   const unsigned char *src, *end;
6310   int eol_seen;
6311   ptrdiff_t nchars = coding->head_ascii;
6312
6313   if (coding->head_ascii < 0)
6314     check_ascii (coding);
6315   else
6316     coding_set_source (coding);
6317   src = coding->source + coding->head_ascii;
6318   /* We look ahead one byte for CR LF.  */
6319   end = coding->source + coding->src_bytes - 1;
6320   eol_seen = coding->eol_seen;
6321   while (src < end)
6322     {
6323       int c = *src;
6324
6325       if (UTF_8_1_OCTET_P (*src))
6326         {
6327           src++;
6328           if (c < 0x20)
6329             {
6330               if (c == '\r')
6331                 {
6332                   if (*src == '\n')
6333                     {
6334                       eol_seen |= EOL_SEEN_CRLF;
6335                       src++;
6336                       nchars++;
6337                     }
6338                   else
6339                     eol_seen |= EOL_SEEN_CR;
6340                 }
6341               else if (c == '\n')
6342                 eol_seen |= EOL_SEEN_LF;
6343             }
6344         }
6345       else if (UTF_8_2_OCTET_LEADING_P (c))
6346         {
6347           if (c < 0xC2          /* overlong sequence */
6348               || src + 1 >= end
6349               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6350             return -1;
6351           src += 2;
6352         }
6353       else if (UTF_8_3_OCTET_LEADING_P (c))
6354         {
6355           if (src + 2 >= end
6356               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6357                     && UTF_8_EXTRA_OCTET_P (src[2])))
6358             return -1;
6359           c = (((c & 0xF) << 12)
6360                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6361           if (c < 0x800                       /* overlong sequence */
6362               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6363             return -1;
6364           src += 3;
6365         }
6366       else if (UTF_8_4_OCTET_LEADING_P (c))
6367         {
6368           if (src + 3 >= end
6369               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6370                     && UTF_8_EXTRA_OCTET_P (src[2])
6371                     && UTF_8_EXTRA_OCTET_P (src[3])))
6372             return -1;
6373           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6374                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6375           if (c < 0x10000       /* overlong sequence */
6376               || c >= 0x110000) /* non-Unicode character  */
6377             return -1;
6378           src += 4;
6379         }
6380       else
6381         return -1;
6382       nchars++;
6383     }
6384
6385   if (src == end)
6386     {
6387       if (! UTF_8_1_OCTET_P (*src))
6388         return -1;
6389       nchars++;
6390       if (*src == '\r')
6391         eol_seen |= EOL_SEEN_CR;
6392       else if (*src  == '\n')
6393         eol_seen |= EOL_SEEN_LF;
6394     }
6395   coding->eol_seen = eol_seen;
6396   return nchars;
6397 }
6398
6399
6400 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6401    SOURCE is encoded.  If CATEGORY is one of
6402    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6403    two-byte, else they are encoded by one-byte.
6404
6405    Return one of EOL_SEEN_XXX.  */
6406
6407 #define MAX_EOL_CHECK_COUNT 3
6408
6409 static int
6410 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6411             enum coding_category category)
6412 {
6413   const unsigned char *src = source, *src_end = src + src_bytes;
6414   unsigned char c;
6415   int total  = 0;
6416   int eol_seen = EOL_SEEN_NONE;
6417
6418   if ((1 << category) & CATEGORY_MASK_UTF_16)
6419     {
6420       bool msb = category == (coding_category_utf_16_le
6421                               | coding_category_utf_16_le_nosig);
6422       bool lsb = !msb;
6423
6424       while (src + 1 < src_end)
6425         {
6426           c = src[lsb];
6427           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6428             {
6429               int this_eol;
6430
6431               if (c == '\n')
6432                 this_eol = EOL_SEEN_LF;
6433               else if (src + 3 >= src_end
6434                        || src[msb + 2] != 0
6435                        || src[lsb + 2] != '\n')
6436                 this_eol = EOL_SEEN_CR;
6437               else
6438                 {
6439                   this_eol = EOL_SEEN_CRLF;
6440                   src += 2;
6441                 }
6442
6443               if (eol_seen == EOL_SEEN_NONE)
6444                 /* This is the first end-of-line.  */
6445                 eol_seen = this_eol;
6446               else if (eol_seen != this_eol)
6447                 {
6448                   /* The found type is different from what found before.
6449                      Allow for stray ^M characters in DOS EOL files.  */
6450                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6451                       || (eol_seen == EOL_SEEN_CRLF
6452                           && this_eol == EOL_SEEN_CR))
6453                     eol_seen = EOL_SEEN_CRLF;
6454                   else
6455                     {
6456                       eol_seen = EOL_SEEN_LF;
6457                       break;
6458                     }
6459                 }
6460               if (++total == MAX_EOL_CHECK_COUNT)
6461                 break;
6462             }
6463           src += 2;
6464         }
6465     }
6466   else
6467     while (src < src_end)
6468       {
6469         c = *src++;
6470         if (c == '\n' || c == '\r')
6471           {
6472             int this_eol;
6473
6474             if (c == '\n')
6475               this_eol = EOL_SEEN_LF;
6476             else if (src >= src_end || *src != '\n')
6477               this_eol = EOL_SEEN_CR;
6478             else
6479               this_eol = EOL_SEEN_CRLF, src++;
6480
6481             if (eol_seen == EOL_SEEN_NONE)
6482               /* This is the first end-of-line.  */
6483               eol_seen = this_eol;
6484             else if (eol_seen != this_eol)
6485               {
6486                 /* The found type is different from what found before.
6487                    Allow for stray ^M characters in DOS EOL files.  */
6488                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6489                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6490                   eol_seen = EOL_SEEN_CRLF;
6491                 else
6492                   {
6493                     eol_seen = EOL_SEEN_LF;
6494                     break;
6495                   }
6496               }
6497             if (++total == MAX_EOL_CHECK_COUNT)
6498               break;
6499           }
6500       }
6501   return eol_seen;
6502 }
6503
6504
6505 static Lisp_Object
6506 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6507 {
6508   Lisp_Object eol_type;
6509
6510   eol_type = CODING_ID_EOL_TYPE (coding->id);
6511   if (! VECTORP (eol_type))
6512     /* Already adjusted.  */
6513     return eol_type;
6514   if (eol_seen & EOL_SEEN_LF)
6515     {
6516       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6517       eol_type = Qunix;
6518     }
6519   else if (eol_seen & EOL_SEEN_CRLF)
6520     {
6521       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6522       eol_type = Qdos;
6523     }
6524   else if (eol_seen & EOL_SEEN_CR)
6525     {
6526       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6527       eol_type = Qmac;
6528     }
6529   return eol_type;
6530 }
6531
6532 /* Detect how a text specified in CODING is encoded.  If a coding
6533    system is detected, update fields of CODING by the detected coding
6534    system.  */
6535
6536 static void
6537 detect_coding (struct coding_system *coding)
6538 {
6539   const unsigned char *src, *src_end;
6540   unsigned int saved_mode = coding->mode;
6541   Lisp_Object found = Qnil;
6542   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6543
6544   coding->consumed = coding->consumed_char = 0;
6545   coding->produced = coding->produced_char = 0;
6546   coding_set_source (coding);
6547
6548   src_end = coding->source + coding->src_bytes;
6549
6550   coding->eol_seen = EOL_SEEN_NONE;
6551   /* If we have not yet decided the text encoding type, detect it
6552      now.  */
6553   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6554     {
6555       int c, i;
6556       struct coding_detection_info detect_info;
6557       bool null_byte_found = 0, eight_bit_found = 0;
6558       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6559                                        inhibit_null_byte_detection);
6560       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6561                                        inhibit_iso_escape_detection);
6562       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6563
6564       coding->head_ascii = 0;
6565       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6566       for (src = coding->source; src < src_end; src++)
6567         {
6568           c = *src;
6569           if (c & 0x80)
6570             {
6571               eight_bit_found = 1;
6572               if (null_byte_found)
6573                 break;
6574             }
6575           else if (c < 0x20)
6576             {
6577               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6578                   && ! inhibit_ied
6579                   && ! detect_info.checked)
6580                 {
6581                   if (detect_coding_iso_2022 (coding, &detect_info))
6582                     {
6583                       /* We have scanned the whole data.  */
6584                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6585                         {
6586                           /* We didn't find an 8-bit code.  We may
6587                              have found a null-byte, but it's very
6588                              rare that a binary file conforms to
6589                              ISO-2022.  */
6590                           src = src_end;
6591                           coding->head_ascii = src - coding->source;
6592                         }
6593                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6594                       break;
6595                     }
6596                 }
6597               else if (! c && !inhibit_nbd)
6598                 {
6599                   null_byte_found = 1;
6600                   if (eight_bit_found)
6601                     break;
6602                 }
6603               else if (! disable_ascii_optimization
6604                        && ! inhibit_eol_conversion)
6605                 {
6606                   if (c == '\r')
6607                     {
6608                       if (src < src_end && src[1] == '\n')
6609                         {
6610                           coding->eol_seen |= EOL_SEEN_CRLF;
6611                           src++;
6612                           if (! eight_bit_found)
6613                             coding->head_ascii++;
6614                         }
6615                       else
6616                         coding->eol_seen |= EOL_SEEN_CR;
6617                     }
6618                   else if (c == '\n')
6619                     {
6620                       coding->eol_seen |= EOL_SEEN_LF;
6621                     }
6622                 }
6623
6624               if (! eight_bit_found)
6625                 coding->head_ascii++;
6626             }
6627           else if (! eight_bit_found)
6628             coding->head_ascii++;
6629         }
6630
6631       if (null_byte_found || eight_bit_found
6632           || coding->head_ascii < coding->src_bytes
6633           || detect_info.found)
6634         {
6635           enum coding_category category;
6636           struct coding_system *this;
6637
6638           if (coding->head_ascii == coding->src_bytes)
6639             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6640             for (i = 0; i < coding_category_raw_text; i++)
6641               {
6642                 category = coding_priorities[i];
6643                 this = coding_categories + category;
6644                 if (detect_info.found & (1 << category))
6645                   break;
6646               }
6647           else
6648             {
6649               if (null_byte_found)
6650                 {
6651                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6652                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6653                 }
6654               else if (prefer_utf_8
6655                        && detect_coding_utf_8 (coding, &detect_info))
6656                 {
6657                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6658                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6659                 }
6660               for (i = 0; i < coding_category_raw_text; i++)
6661                 {
6662                   category = coding_priorities[i];
6663                   this = coding_categories + category;
6664                   /* Some of this->detector (e.g. detect_coding_sjis)
6665                      require this information.  */
6666                   coding->id = this->id;
6667                   if (this->id < 0)
6668                     {
6669                       /* No coding system of this category is defined.  */
6670                       detect_info.rejected |= (1 << category);
6671                     }
6672                   else if (category >= coding_category_raw_text)
6673                     continue;
6674                   else if (detect_info.checked & (1 << category))
6675                     {
6676                       if (detect_info.found & (1 << category))
6677                         break;
6678                     }
6679                   else if ((*(this->detector)) (coding, &detect_info)
6680                            && detect_info.found & (1 << category))
6681                     break;
6682                 }
6683             }
6684
6685           if (i < coding_category_raw_text)
6686             {
6687               if (category == coding_category_utf_8_auto)
6688                 {
6689                   Lisp_Object coding_systems;
6690
6691                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6692                                          coding_attr_utf_bom);
6693                   if (CONSP (coding_systems))
6694                     {
6695                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6696                         found = XCAR (coding_systems);
6697                       else
6698                         found = XCDR (coding_systems);
6699                     }
6700                   else
6701                     found = CODING_ID_NAME (this->id);
6702                 }
6703               else if (category == coding_category_utf_16_auto)
6704                 {
6705                   Lisp_Object coding_systems;
6706
6707                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6708                                          coding_attr_utf_bom);
6709                   if (CONSP (coding_systems))
6710                     {
6711                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6712                         found = XCAR (coding_systems);
6713                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6714                         found = XCDR (coding_systems);
6715                     }
6716                   else
6717                     found = CODING_ID_NAME (this->id);
6718                 }
6719               else
6720                 found = CODING_ID_NAME (this->id);
6721             }
6722           else if (null_byte_found)
6723             found = Qno_conversion;
6724           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6725                    == CATEGORY_MASK_ANY)
6726             found = Qraw_text;
6727           else if (detect_info.rejected)
6728             for (i = 0; i < coding_category_raw_text; i++)
6729               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6730                 {
6731                   this = coding_categories + coding_priorities[i];
6732                   found = CODING_ID_NAME (this->id);
6733                   break;
6734                 }
6735         }
6736     }
6737   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6738            == coding_category_utf_8_auto)
6739     {
6740       Lisp_Object coding_systems;
6741       struct coding_detection_info detect_info;
6742
6743       coding_systems
6744         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6745       detect_info.found = detect_info.rejected = 0;
6746       if (check_ascii (coding) == coding->src_bytes)
6747         {
6748           if (CONSP (coding_systems))
6749             found = XCDR (coding_systems);
6750         }
6751       else
6752         {
6753           if (CONSP (coding_systems)
6754               && detect_coding_utf_8 (coding, &detect_info))
6755             {
6756               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6757                 found = XCAR (coding_systems);
6758               else
6759                 found = XCDR (coding_systems);
6760             }
6761         }
6762     }
6763   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6764            == coding_category_utf_16_auto)
6765     {
6766       Lisp_Object coding_systems;
6767       struct coding_detection_info detect_info;
6768
6769       coding_systems
6770         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6771       detect_info.found = detect_info.rejected = 0;
6772       coding->head_ascii = 0;
6773       if (CONSP (coding_systems)
6774           && detect_coding_utf_16 (coding, &detect_info))
6775         {
6776           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6777             found = XCAR (coding_systems);
6778           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6779             found = XCDR (coding_systems);
6780         }
6781     }
6782
6783   if (! NILP (found))
6784     {
6785       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6786                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6787                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6788                            : EOL_SEEN_LF);
6789
6790       setup_coding_system (found, coding);
6791       if (specified_eol != EOL_SEEN_NONE)
6792         adjust_coding_eol_type (coding, specified_eol);
6793     }
6794
6795   coding->mode = saved_mode;
6796 }
6797
6798
6799 static void
6800 decode_eol (struct coding_system *coding)
6801 {
6802   Lisp_Object eol_type;
6803   unsigned char *p, *pbeg, *pend;
6804
6805   eol_type = CODING_ID_EOL_TYPE (coding->id);
6806   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6807     return;
6808
6809   if (NILP (coding->dst_object))
6810     pbeg = coding->destination;
6811   else
6812     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6813   pend = pbeg + coding->produced;
6814
6815   if (VECTORP (eol_type))
6816     {
6817       int eol_seen = EOL_SEEN_NONE;
6818
6819       for (p = pbeg; p < pend; p++)
6820         {
6821           if (*p == '\n')
6822             eol_seen |= EOL_SEEN_LF;
6823           else if (*p == '\r')
6824             {
6825               if (p + 1 < pend && *(p + 1) == '\n')
6826                 {
6827                   eol_seen |= EOL_SEEN_CRLF;
6828                   p++;
6829                 }
6830               else
6831                 eol_seen |= EOL_SEEN_CR;
6832             }
6833         }
6834       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6835       if ((eol_seen & EOL_SEEN_CRLF) != 0
6836           && (eol_seen & EOL_SEEN_CR) != 0
6837           && (eol_seen & EOL_SEEN_LF) == 0)
6838         eol_seen = EOL_SEEN_CRLF;
6839       else if (eol_seen != EOL_SEEN_NONE
6840           && eol_seen != EOL_SEEN_LF
6841           && eol_seen != EOL_SEEN_CRLF
6842           && eol_seen != EOL_SEEN_CR)
6843         eol_seen = EOL_SEEN_LF;
6844       if (eol_seen != EOL_SEEN_NONE)
6845         eol_type = adjust_coding_eol_type (coding, eol_seen);
6846     }
6847
6848   if (EQ (eol_type, Qmac))
6849     {
6850       for (p = pbeg; p < pend; p++)
6851         if (*p == '\r')
6852           *p = '\n';
6853     }
6854   else if (EQ (eol_type, Qdos))
6855     {
6856       ptrdiff_t n = 0;
6857
6858       if (NILP (coding->dst_object))
6859         {
6860           /* Start deleting '\r' from the tail to minimize the memory
6861              movement.  */
6862           for (p = pend - 2; p >= pbeg; p--)
6863             if (*p == '\r')
6864               {
6865                 memmove (p, p + 1, pend-- - p - 1);
6866                 n++;
6867               }
6868         }
6869       else
6870         {
6871           ptrdiff_t pos_byte = coding->dst_pos_byte;
6872           ptrdiff_t pos = coding->dst_pos;
6873           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6874
6875           while (pos < pos_end)
6876             {
6877               p = BYTE_POS_ADDR (pos_byte);
6878               if (*p == '\r' && p[1] == '\n')
6879                 {
6880                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6881                   n++;
6882                   pos_end--;
6883                 }
6884               pos++;
6885               if (coding->dst_multibyte)
6886                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6887               else
6888                 pos_byte++;
6889             }
6890         }
6891       coding->produced -= n;
6892       coding->produced_char -= n;
6893     }
6894 }
6895
6896
6897 /* Return a translation table (or list of them) from coding system
6898    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6899    not ENCODEP). */
6900
6901 static Lisp_Object
6902 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6903 {
6904   Lisp_Object standard, translation_table;
6905   Lisp_Object val;
6906
6907   if (NILP (Venable_character_translation))
6908     {
6909       if (max_lookup)
6910         *max_lookup = 0;
6911       return Qnil;
6912     }
6913   if (encodep)
6914     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6915       standard = Vstandard_translation_table_for_encode;
6916   else
6917     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6918       standard = Vstandard_translation_table_for_decode;
6919   if (NILP (translation_table))
6920     translation_table = standard;
6921   else
6922     {
6923       if (SYMBOLP (translation_table))
6924         translation_table = Fget (translation_table, Qtranslation_table);
6925       else if (CONSP (translation_table))
6926         {
6927           translation_table = Fcopy_sequence (translation_table);
6928           for (val = translation_table; CONSP (val); val = XCDR (val))
6929             if (SYMBOLP (XCAR (val)))
6930               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6931         }
6932       if (CHAR_TABLE_P (standard))
6933         {
6934           if (CONSP (translation_table))
6935             translation_table = nconc2 (translation_table, list1 (standard));
6936           else
6937             translation_table = list2 (translation_table, standard);
6938         }
6939     }
6940
6941   if (max_lookup)
6942     {
6943       *max_lookup = 1;
6944       if (CHAR_TABLE_P (translation_table)
6945           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6946         {
6947           val = XCHAR_TABLE (translation_table)->extras[1];
6948           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6949             *max_lookup = XFASTINT (val);
6950         }
6951       else if (CONSP (translation_table))
6952         {
6953           Lisp_Object tail;
6954
6955           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6956             if (CHAR_TABLE_P (XCAR (tail))
6957                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6958               {
6959                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6960                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6961                   *max_lookup = XFASTINT (tailval);
6962               }
6963         }
6964     }
6965   return translation_table;
6966 }
6967
6968 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6969   do {                                                          \
6970     trans = Qnil;                                               \
6971     if (CHAR_TABLE_P (table))                                   \
6972       {                                                         \
6973         trans = CHAR_TABLE_REF (table, c);                      \
6974         if (CHARACTERP (trans))                                 \
6975           c = XFASTINT (trans), trans = Qnil;                   \
6976       }                                                         \
6977     else if (CONSP (table))                                     \
6978       {                                                         \
6979         Lisp_Object tail;                                       \
6980                                                                 \
6981         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6982           if (CHAR_TABLE_P (XCAR (tail)))                       \
6983             {                                                   \
6984               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6985               if (CHARACTERP (trans))                           \
6986                 c = XFASTINT (trans), trans = Qnil;             \
6987               else if (! NILP (trans))                          \
6988                 break;                                          \
6989             }                                                   \
6990       }                                                         \
6991   } while (0)
6992
6993
6994 /* Return a translation of character(s) at BUF according to TRANS.
6995    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6996    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6997    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6998    translation is found, and Qnil if not found..
6999    If BUF is too short to lookup characters in FROM, return Qt.  */
7000
7001 static Lisp_Object
7002 get_translation (Lisp_Object trans, int *buf, int *buf_end)
7003 {
7004
7005   if (INTEGERP (trans))
7006     return trans;
7007   for (; CONSP (trans); trans = XCDR (trans))
7008     {
7009       Lisp_Object val = XCAR (trans);
7010       Lisp_Object from = XCAR (val);
7011       ptrdiff_t len = ASIZE (from);
7012       ptrdiff_t i;
7013
7014       for (i = 0; i < len; i++)
7015         {
7016           if (buf + i == buf_end)
7017             return Qt;
7018           if (XINT (AREF (from, i)) != buf[i])
7019             break;
7020         }
7021       if (i == len)
7022         return val;
7023     }
7024   return Qnil;
7025 }
7026
7027
7028 static int
7029 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7030                bool last_block)
7031 {
7032   unsigned char *dst = coding->destination + coding->produced;
7033   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7034   ptrdiff_t produced;
7035   ptrdiff_t produced_chars = 0;
7036   int carryover = 0;
7037
7038   if (! coding->chars_at_source)
7039     {
7040       /* Source characters are in coding->charbuf.  */
7041       int *buf = coding->charbuf;
7042       int *buf_end = buf + coding->charbuf_used;
7043
7044       if (EQ (coding->src_object, coding->dst_object))
7045         {
7046           coding_set_source (coding);
7047           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7048         }
7049
7050       while (buf < buf_end)
7051         {
7052           int c = *buf;
7053           ptrdiff_t i;
7054
7055           if (c >= 0)
7056             {
7057               ptrdiff_t from_nchars = 1, to_nchars = 1;
7058               Lisp_Object trans = Qnil;
7059
7060               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7061               if (! NILP (trans))
7062                 {
7063                   trans = get_translation (trans, buf, buf_end);
7064                   if (INTEGERP (trans))
7065                     c = XINT (trans);
7066                   else if (CONSP (trans))
7067                     {
7068                       from_nchars = ASIZE (XCAR (trans));
7069                       trans = XCDR (trans);
7070                       if (INTEGERP (trans))
7071                         c = XINT (trans);
7072                       else
7073                         {
7074                           to_nchars = ASIZE (trans);
7075                           c = XINT (AREF (trans, 0));
7076                         }
7077                     }
7078                   else if (EQ (trans, Qt) && ! last_block)
7079                     break;
7080                 }
7081
7082               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7083                 {
7084                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7085                        / MAX_MULTIBYTE_LENGTH)
7086                       < to_nchars)
7087                     memory_full (SIZE_MAX);
7088                   dst = alloc_destination (coding,
7089                                            buf_end - buf
7090                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7091                                            dst);
7092                   if (EQ (coding->src_object, coding->dst_object))
7093                     {
7094                       coding_set_source (coding);
7095                       dst_end = (((unsigned char *) coding->source)
7096                                  + coding->consumed);
7097                     }
7098                   else
7099                     dst_end = coding->destination + coding->dst_bytes;
7100                 }
7101
7102               for (i = 0; i < to_nchars; i++)
7103                 {
7104                   if (i > 0)
7105                     c = XINT (AREF (trans, i));
7106                   if (coding->dst_multibyte
7107                       || ! CHAR_BYTE8_P (c))
7108                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7109                   else
7110                     *dst++ = CHAR_TO_BYTE8 (c);
7111                 }
7112               produced_chars += to_nchars;
7113               buf += from_nchars;
7114             }
7115           else
7116             /* This is an annotation datum.  (-C) is the length.  */
7117             buf += -c;
7118         }
7119       carryover = buf_end - buf;
7120     }
7121   else
7122     {
7123       /* Source characters are at coding->source.  */
7124       const unsigned char *src = coding->source;
7125       const unsigned char *src_end = src + coding->consumed;
7126
7127       if (EQ (coding->dst_object, coding->src_object))
7128         dst_end = (unsigned char *) src;
7129       if (coding->src_multibyte != coding->dst_multibyte)
7130         {
7131           if (coding->src_multibyte)
7132             {
7133               bool multibytep = 1;
7134               ptrdiff_t consumed_chars = 0;
7135
7136               while (1)
7137                 {
7138                   const unsigned char *src_base = src;
7139                   int c;
7140
7141                   ONE_MORE_BYTE (c);
7142                   if (dst == dst_end)
7143                     {
7144                       if (EQ (coding->src_object, coding->dst_object))
7145                         dst_end = (unsigned char *) src;
7146                       if (dst == dst_end)
7147                         {
7148                           ptrdiff_t offset = src - coding->source;
7149
7150                           dst = alloc_destination (coding, src_end - src + 1,
7151                                                    dst);
7152                           dst_end = coding->destination + coding->dst_bytes;
7153                           coding_set_source (coding);
7154                           src = coding->source + offset;
7155                           src_end = coding->source + coding->consumed;
7156                           if (EQ (coding->src_object, coding->dst_object))
7157                             dst_end = (unsigned char *) src;
7158                         }
7159                     }
7160                   *dst++ = c;
7161                   produced_chars++;
7162                 }
7163             no_more_source:
7164               ;
7165             }
7166           else
7167             while (src < src_end)
7168               {
7169                 bool multibytep = 1;
7170                 int c = *src++;
7171
7172                 if (dst >= dst_end - 1)
7173                   {
7174                     if (EQ (coding->src_object, coding->dst_object))
7175                       dst_end = (unsigned char *) src;
7176                     if (dst >= dst_end - 1)
7177                       {
7178                         ptrdiff_t offset = src - coding->source;
7179                         ptrdiff_t more_bytes;
7180
7181                         if (EQ (coding->src_object, coding->dst_object))
7182                           more_bytes = ((src_end - src) / 2) + 2;
7183                         else
7184                           more_bytes = src_end - src + 2;
7185                         dst = alloc_destination (coding, more_bytes, dst);
7186                         dst_end = coding->destination + coding->dst_bytes;
7187                         coding_set_source (coding);
7188                         src = coding->source + offset;
7189                         src_end = coding->source + coding->consumed;
7190                         if (EQ (coding->src_object, coding->dst_object))
7191                           dst_end = (unsigned char *) src;
7192                       }
7193                   }
7194                 EMIT_ONE_BYTE (c);
7195               }
7196         }
7197       else
7198         {
7199           if (!EQ (coding->src_object, coding->dst_object))
7200             {
7201               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7202
7203               if (require > 0)
7204                 {
7205                   ptrdiff_t offset = src - coding->source;
7206
7207                   dst = alloc_destination (coding, require, dst);
7208                   coding_set_source (coding);
7209                   src = coding->source + offset;
7210                   src_end = coding->source + coding->consumed;
7211                 }
7212             }
7213           produced_chars = coding->consumed_char;
7214           while (src < src_end)
7215             *dst++ = *src++;
7216         }
7217     }
7218
7219   produced = dst - (coding->destination + coding->produced);
7220   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7221     insert_from_gap (produced_chars, produced, 0);
7222   coding->produced += produced;
7223   coding->produced_char += produced_chars;
7224   return carryover;
7225 }
7226
7227 /* Compose text in CODING->object according to the annotation data at
7228    CHARBUF.  CHARBUF is an array:
7229      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7230  */
7231
7232 static void
7233 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7234 {
7235   int len;
7236   ptrdiff_t to;
7237   enum composition_method method;
7238   Lisp_Object components;
7239
7240   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7241   to = pos + charbuf[2];
7242   method = (enum composition_method) (charbuf[4]);
7243
7244   if (method == COMPOSITION_RELATIVE)
7245     components = Qnil;
7246   else
7247     {
7248       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7249       int i, j;
7250
7251       if (method == COMPOSITION_WITH_RULE)
7252         len = charbuf[2] * 3 - 2;
7253       charbuf += MAX_ANNOTATION_LENGTH;
7254       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7255       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7256         {
7257           if (charbuf[i] >= 0)
7258             args[j] = make_number (charbuf[i]);
7259           else
7260             {
7261               i++;
7262               args[j] = make_number (charbuf[i] % 0x100);
7263             }
7264         }
7265       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7266     }
7267   compose_text (pos, to, components, Qnil, coding->dst_object);
7268 }
7269
7270
7271 /* Put `charset' property on text in CODING->object according to
7272    the annotation data at CHARBUF.  CHARBUF is an array:
7273      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7274  */
7275
7276 static void
7277 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7278 {
7279   ptrdiff_t from = pos - charbuf[2];
7280   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7281
7282   Fput_text_property (make_number (from), make_number (pos),
7283                       Qcharset, CHARSET_NAME (charset),
7284                       coding->dst_object);
7285 }
7286
7287
7288 #define CHARBUF_SIZE 0x4000
7289
7290 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
7291   do {                                                                  \
7292     coding->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));        \
7293     coding->charbuf_size = CHARBUF_SIZE;                                \
7294   } while (0)
7295
7296
7297 static void
7298 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7299 {
7300   int *charbuf = coding->charbuf;
7301   int *charbuf_end = charbuf + coding->charbuf_used;
7302
7303   if (NILP (coding->dst_object))
7304     return;
7305
7306   while (charbuf < charbuf_end)
7307     {
7308       if (*charbuf >= 0)
7309         pos++, charbuf++;
7310       else
7311         {
7312           int len = -*charbuf;
7313
7314           if (len > 2)
7315             switch (charbuf[1])
7316               {
7317               case CODING_ANNOTATE_COMPOSITION_MASK:
7318                 produce_composition (coding, charbuf, pos);
7319                 break;
7320               case CODING_ANNOTATE_CHARSET_MASK:
7321                 produce_charset (coding, charbuf, pos);
7322                 break;
7323               }
7324           charbuf += len;
7325         }
7326     }
7327 }
7328
7329 /* Decode the data at CODING->src_object into CODING->dst_object.
7330    CODING->src_object is a buffer, a string, or nil.
7331    CODING->dst_object is a buffer.
7332
7333    If CODING->src_object is a buffer, it must be the current buffer.
7334    In this case, if CODING->src_pos is positive, it is a position of
7335    the source text in the buffer, otherwise, the source text is in the
7336    gap area of the buffer, and CODING->src_pos specifies the offset of
7337    the text from GPT (which must be the same as PT).  If this is the
7338    same buffer as CODING->dst_object, CODING->src_pos must be
7339    negative.
7340
7341    If CODING->src_object is a string, CODING->src_pos is an index to
7342    that string.
7343
7344    If CODING->src_object is nil, CODING->source must already point to
7345    the non-relocatable memory area.  In this case, CODING->src_pos is
7346    an offset from CODING->source.
7347
7348    The decoded data is inserted at the current point of the buffer
7349    CODING->dst_object.
7350 */
7351
7352 static void
7353 decode_coding (struct coding_system *coding)
7354 {
7355   Lisp_Object attrs;
7356   Lisp_Object undo_list;
7357   Lisp_Object translation_table;
7358   struct ccl_spec cclspec;
7359   int carryover;
7360   int i;
7361
7362   USE_SAFE_ALLOCA;
7363
7364   if (BUFFERP (coding->src_object)
7365       && coding->src_pos > 0
7366       && coding->src_pos < GPT
7367       && coding->src_pos + coding->src_chars > GPT)
7368     move_gap_both (coding->src_pos, coding->src_pos_byte);
7369
7370   undo_list = Qt;
7371   if (BUFFERP (coding->dst_object))
7372     {
7373       set_buffer_internal (XBUFFER (coding->dst_object));
7374       if (GPT != PT)
7375         move_gap_both (PT, PT_BYTE);
7376
7377       /* We must disable undo_list in order to record the whole insert
7378          transaction via record_insert at the end.  But doing so also
7379          disables the recording of the first change to the undo_list.
7380          Therefore we check for first change here and record it via
7381          record_first_change if needed.  */
7382       if (MODIFF <= SAVE_MODIFF)
7383         record_first_change ();
7384
7385       undo_list = BVAR (current_buffer, undo_list);
7386       bset_undo_list (current_buffer, Qt);
7387     }
7388
7389   coding->consumed = coding->consumed_char = 0;
7390   coding->produced = coding->produced_char = 0;
7391   coding->chars_at_source = 0;
7392   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7393   coding->errors = 0;
7394
7395   ALLOC_CONVERSION_WORK_AREA (coding);
7396
7397   attrs = CODING_ID_ATTRS (coding->id);
7398   translation_table = get_translation_table (attrs, 0, NULL);
7399
7400   carryover = 0;
7401   if (coding->decoder == decode_coding_ccl)
7402     {
7403       coding->spec.ccl = &cclspec;
7404       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7405     }
7406   do
7407     {
7408       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7409
7410       coding_set_source (coding);
7411       coding->annotated = 0;
7412       coding->charbuf_used = carryover;
7413       (*(coding->decoder)) (coding);
7414       coding_set_destination (coding);
7415       carryover = produce_chars (coding, translation_table, 0);
7416       if (coding->annotated)
7417         produce_annotation (coding, pos);
7418       for (i = 0; i < carryover; i++)
7419         coding->charbuf[i]
7420           = coding->charbuf[coding->charbuf_used - carryover + i];
7421     }
7422   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7423          || (coding->consumed < coding->src_bytes
7424              && (coding->result == CODING_RESULT_SUCCESS
7425                  || coding->result == CODING_RESULT_INVALID_SRC)));
7426
7427   if (carryover > 0)
7428     {
7429       coding_set_destination (coding);
7430       coding->charbuf_used = carryover;
7431       produce_chars (coding, translation_table, 1);
7432     }
7433
7434   coding->carryover_bytes = 0;
7435   if (coding->consumed < coding->src_bytes)
7436     {
7437       ptrdiff_t nbytes = coding->src_bytes - coding->consumed;
7438       const unsigned char *src;
7439
7440       coding_set_source (coding);
7441       coding_set_destination (coding);
7442       src = coding->source + coding->consumed;
7443
7444       if (coding->mode & CODING_MODE_LAST_BLOCK)
7445         {
7446           /* Flush out unprocessed data as binary chars.  We are sure
7447              that the number of data is less than the size of
7448              coding->charbuf.  */
7449           coding->charbuf_used = 0;
7450           coding->chars_at_source = 0;
7451
7452           while (nbytes-- > 0)
7453             {
7454               int c = *src++;
7455
7456               if (c & 0x80)
7457                 c = BYTE8_TO_CHAR (c);
7458               coding->charbuf[coding->charbuf_used++] = c;
7459             }
7460           produce_chars (coding, Qnil, 1);
7461         }
7462       else
7463         {
7464           /* Record unprocessed bytes in coding->carryover.  We are
7465              sure that the number of data is less than the size of
7466              coding->carryover.  */
7467           unsigned char *p = coding->carryover;
7468
7469           if (nbytes > sizeof coding->carryover)
7470             nbytes = sizeof coding->carryover;
7471           coding->carryover_bytes = nbytes;
7472           while (nbytes-- > 0)
7473             *p++ = *src++;
7474         }
7475       coding->consumed = coding->src_bytes;
7476     }
7477
7478   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7479       && !inhibit_eol_conversion)
7480     decode_eol (coding);
7481   if (BUFFERP (coding->dst_object))
7482     {
7483       bset_undo_list (current_buffer, undo_list);
7484       record_insert (coding->dst_pos, coding->produced_char);
7485     }
7486
7487   SAFE_FREE ();
7488 }
7489
7490
7491 /* Extract an annotation datum from a composition starting at POS and
7492    ending before LIMIT of CODING->src_object (buffer or string), store
7493    the data in BUF, set *STOP to a starting position of the next
7494    composition (if any) or to LIMIT, and return the address of the
7495    next element of BUF.
7496
7497    If such an annotation is not found, set *STOP to a starting
7498    position of a composition after POS (if any) or to LIMIT, and
7499    return BUF.  */
7500
7501 static int *
7502 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7503                                struct coding_system *coding, int *buf,
7504                                ptrdiff_t *stop)
7505 {
7506   ptrdiff_t start, end;
7507   Lisp_Object prop;
7508
7509   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7510       || end > limit)
7511     *stop = limit;
7512   else if (start > pos)
7513     *stop = start;
7514   else
7515     {
7516       if (start == pos)
7517         {
7518           /* We found a composition.  Store the corresponding
7519              annotation data in BUF.  */
7520           int *head = buf;
7521           enum composition_method method = composition_method (prop);
7522           int nchars = COMPOSITION_LENGTH (prop);
7523
7524           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7525           if (method != COMPOSITION_RELATIVE)
7526             {
7527               Lisp_Object components;
7528               ptrdiff_t i, len, i_byte;
7529
7530               components = COMPOSITION_COMPONENTS (prop);
7531               if (VECTORP (components))
7532                 {
7533                   len = ASIZE (components);
7534                   for (i = 0; i < len; i++)
7535                     *buf++ = XINT (AREF (components, i));
7536                 }
7537               else if (STRINGP (components))
7538                 {
7539                   len = SCHARS (components);
7540                   i = i_byte = 0;
7541                   while (i < len)
7542                     {
7543                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7544                       buf++;
7545                     }
7546                 }
7547               else if (INTEGERP (components))
7548                 {
7549                   len = 1;
7550                   *buf++ = XINT (components);
7551                 }
7552               else if (CONSP (components))
7553                 {
7554                   for (len = 0; CONSP (components);
7555                        len++, components = XCDR (components))
7556                     *buf++ = XINT (XCAR (components));
7557                 }
7558               else
7559                 emacs_abort ();
7560               *head -= len;
7561             }
7562         }
7563
7564       if (find_composition (end, limit, &start, &end, &prop,
7565                             coding->src_object)
7566           && end <= limit)
7567         *stop = start;
7568       else
7569         *stop = limit;
7570     }
7571   return buf;
7572 }
7573
7574
7575 /* Extract an annotation datum from a text property `charset' at POS of
7576    CODING->src_object (buffer of string), store the data in BUF, set
7577    *STOP to the position where the value of `charset' property changes
7578    (limiting by LIMIT), and return the address of the next element of
7579    BUF.
7580
7581    If the property value is nil, set *STOP to the position where the
7582    property value is non-nil (limiting by LIMIT), and return BUF.  */
7583
7584 static int *
7585 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7586                            struct coding_system *coding, int *buf,
7587                            ptrdiff_t *stop)
7588 {
7589   Lisp_Object val, next;
7590   int id;
7591
7592   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7593   if (! NILP (val) && CHARSETP (val))
7594     id = XINT (CHARSET_SYMBOL_ID (val));
7595   else
7596     id = -1;
7597   ADD_CHARSET_DATA (buf, 0, id);
7598   next = Fnext_single_property_change (make_number (pos), Qcharset,
7599                                        coding->src_object,
7600                                        make_number (limit));
7601   *stop = XINT (next);
7602   return buf;
7603 }
7604
7605
7606 static void
7607 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7608                int max_lookup)
7609 {
7610   int *buf = coding->charbuf;
7611   int *buf_end = coding->charbuf + coding->charbuf_size;
7612   const unsigned char *src = coding->source + coding->consumed;
7613   const unsigned char *src_end = coding->source + coding->src_bytes;
7614   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7615   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7616   bool multibytep = coding->src_multibyte;
7617   Lisp_Object eol_type;
7618   int c;
7619   ptrdiff_t stop, stop_composition, stop_charset;
7620   int *lookup_buf = NULL;
7621
7622   if (! NILP (translation_table))
7623     lookup_buf = alloca (sizeof (int) * max_lookup);
7624
7625   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7626   if (VECTORP (eol_type))
7627     eol_type = Qunix;
7628
7629   /* Note: composition handling is not yet implemented.  */
7630   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7631
7632   if (NILP (coding->src_object))
7633     stop = stop_composition = stop_charset = end_pos;
7634   else
7635     {
7636       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7637         stop = stop_composition = pos;
7638       else
7639         stop = stop_composition = end_pos;
7640       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7641         stop = stop_charset = pos;
7642       else
7643         stop_charset = end_pos;
7644     }
7645
7646   /* Compensate for CRLF and conversion.  */
7647   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7648   while (buf < buf_end)
7649     {
7650       Lisp_Object trans;
7651
7652       if (pos == stop)
7653         {
7654           if (pos == end_pos)
7655             break;
7656           if (pos == stop_composition)
7657             buf = handle_composition_annotation (pos, end_pos, coding,
7658                                                  buf, &stop_composition);
7659           if (pos == stop_charset)
7660             buf = handle_charset_annotation (pos, end_pos, coding,
7661                                              buf, &stop_charset);
7662           stop = (stop_composition < stop_charset
7663                   ? stop_composition : stop_charset);
7664         }
7665
7666       if (! multibytep)
7667         {
7668           int bytes;
7669
7670           if (coding->encoder == encode_coding_raw_text
7671               || coding->encoder == encode_coding_ccl)
7672             c = *src++, pos++;
7673           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7674             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7675           else
7676             c = BYTE8_TO_CHAR (*src), src++, pos++;
7677         }
7678       else
7679         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7680       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7681         c = '\n';
7682       if (! EQ (eol_type, Qunix))
7683         {
7684           if (c == '\n')
7685             {
7686               if (EQ (eol_type, Qdos))
7687                 *buf++ = '\r';
7688               else
7689                 c = '\r';
7690             }
7691         }
7692
7693       trans = Qnil;
7694       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7695       if (NILP (trans))
7696         *buf++ = c;
7697       else
7698         {
7699           ptrdiff_t from_nchars = 1, to_nchars = 1;
7700           int *lookup_buf_end;
7701           const unsigned char *p = src;
7702           int i;
7703
7704           lookup_buf[0] = c;
7705           for (i = 1; i < max_lookup && p < src_end; i++)
7706             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7707           lookup_buf_end = lookup_buf + i;
7708           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7709           if (INTEGERP (trans))
7710             c = XINT (trans);
7711           else if (CONSP (trans))
7712             {
7713               from_nchars = ASIZE (XCAR (trans));
7714               trans = XCDR (trans);
7715               if (INTEGERP (trans))
7716                 c = XINT (trans);
7717               else
7718                 {
7719                   to_nchars = ASIZE (trans);
7720                   if (buf_end - buf < to_nchars)
7721                     break;
7722                   c = XINT (AREF (trans, 0));
7723                 }
7724             }
7725           else
7726             break;
7727           *buf++ = c;
7728           for (i = 1; i < to_nchars; i++)
7729             *buf++ = XINT (AREF (trans, i));
7730           for (i = 1; i < from_nchars; i++, pos++)
7731             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7732         }
7733     }
7734
7735   coding->consumed = src - coding->source;
7736   coding->consumed_char = pos - coding->src_pos;
7737   coding->charbuf_used = buf - coding->charbuf;
7738   coding->chars_at_source = 0;
7739 }
7740
7741
7742 /* Encode the text at CODING->src_object into CODING->dst_object.
7743    CODING->src_object is a buffer or a string.
7744    CODING->dst_object is a buffer or nil.
7745
7746    If CODING->src_object is a buffer, it must be the current buffer.
7747    In this case, if CODING->src_pos is positive, it is a position of
7748    the source text in the buffer, otherwise. the source text is in the
7749    gap area of the buffer, and coding->src_pos specifies the offset of
7750    the text from GPT (which must be the same as PT).  If this is the
7751    same buffer as CODING->dst_object, CODING->src_pos must be
7752    negative and CODING should not have `pre-write-conversion'.
7753
7754    If CODING->src_object is a string, CODING should not have
7755    `pre-write-conversion'.
7756
7757    If CODING->dst_object is a buffer, the encoded data is inserted at
7758    the current point of that buffer.
7759
7760    If CODING->dst_object is nil, the encoded data is placed at the
7761    memory area specified by CODING->destination.  */
7762
7763 static void
7764 encode_coding (struct coding_system *coding)
7765 {
7766   Lisp_Object attrs;
7767   Lisp_Object translation_table;
7768   int max_lookup;
7769   struct ccl_spec cclspec;
7770
7771   USE_SAFE_ALLOCA;
7772
7773   attrs = CODING_ID_ATTRS (coding->id);
7774   if (coding->encoder == encode_coding_raw_text)
7775     translation_table = Qnil, max_lookup = 0;
7776   else
7777     translation_table = get_translation_table (attrs, 1, &max_lookup);
7778
7779   if (BUFFERP (coding->dst_object))
7780     {
7781       set_buffer_internal (XBUFFER (coding->dst_object));
7782       coding->dst_multibyte
7783         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7784     }
7785
7786   coding->consumed = coding->consumed_char = 0;
7787   coding->produced = coding->produced_char = 0;
7788   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7789   coding->errors = 0;
7790
7791   ALLOC_CONVERSION_WORK_AREA (coding);
7792
7793   if (coding->encoder == encode_coding_ccl)
7794     {
7795       coding->spec.ccl = &cclspec;
7796       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7797     }
7798   do {
7799     coding_set_source (coding);
7800     consume_chars (coding, translation_table, max_lookup);
7801     coding_set_destination (coding);
7802     (*(coding->encoder)) (coding);
7803   } while (coding->consumed_char < coding->src_chars);
7804
7805   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7806     insert_from_gap (coding->produced_char, coding->produced, 0);
7807
7808   SAFE_FREE ();
7809 }
7810
7811
7812 /* Name (or base name) of work buffer for code conversion.  */
7813 static Lisp_Object Vcode_conversion_workbuf_name;
7814
7815 /* A working buffer used by the top level conversion.  Once it is
7816    created, it is never destroyed.  It has the name
7817    Vcode_conversion_workbuf_name.  The other working buffers are
7818    destroyed after the use is finished, and their names are modified
7819    versions of Vcode_conversion_workbuf_name.  */
7820 static Lisp_Object Vcode_conversion_reused_workbuf;
7821
7822 /* True iff Vcode_conversion_reused_workbuf is already in use.  */
7823 static bool reused_workbuf_in_use;
7824
7825
7826 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7827    multibyteness of returning buffer.  */
7828
7829 static Lisp_Object
7830 make_conversion_work_buffer (bool multibyte)
7831 {
7832   Lisp_Object name, workbuf;
7833   struct buffer *current;
7834
7835   if (reused_workbuf_in_use)
7836     {
7837       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7838       workbuf = Fget_buffer_create (name);
7839     }
7840   else
7841     {
7842       reused_workbuf_in_use = 1;
7843       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7844         Vcode_conversion_reused_workbuf
7845           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7846       workbuf = Vcode_conversion_reused_workbuf;
7847     }
7848   current = current_buffer;
7849   set_buffer_internal (XBUFFER (workbuf));
7850   /* We can't allow modification hooks to run in the work buffer.  For
7851      instance, directory_files_internal assumes that file decoding
7852      doesn't compile new regexps.  */
7853   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7854   Ferase_buffer ();
7855   bset_undo_list (current_buffer, Qt);
7856   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7857   set_buffer_internal (current);
7858   return workbuf;
7859 }
7860
7861
7862 static void
7863 code_conversion_restore (Lisp_Object arg)
7864 {
7865   Lisp_Object current, workbuf;
7866   struct gcpro gcpro1;
7867
7868   GCPRO1 (arg);
7869   current = XCAR (arg);
7870   workbuf = XCDR (arg);
7871   if (! NILP (workbuf))
7872     {
7873       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7874         reused_workbuf_in_use = 0;
7875       else
7876         Fkill_buffer (workbuf);
7877     }
7878   set_buffer_internal (XBUFFER (current));
7879   UNGCPRO;
7880 }
7881
7882 Lisp_Object
7883 code_conversion_save (bool with_work_buf, bool multibyte)
7884 {
7885   Lisp_Object workbuf = Qnil;
7886
7887   if (with_work_buf)
7888     workbuf = make_conversion_work_buffer (multibyte);
7889   record_unwind_protect (code_conversion_restore,
7890                          Fcons (Fcurrent_buffer (), workbuf));
7891   return workbuf;
7892 }
7893
7894 void
7895 decode_coding_gap (struct coding_system *coding,
7896                    ptrdiff_t chars, ptrdiff_t bytes)
7897 {
7898   ptrdiff_t count = SPECPDL_INDEX ();
7899   Lisp_Object attrs;
7900
7901   coding->src_object = Fcurrent_buffer ();
7902   coding->src_chars = chars;
7903   coding->src_bytes = bytes;
7904   coding->src_pos = -chars;
7905   coding->src_pos_byte = -bytes;
7906   coding->src_multibyte = chars < bytes;
7907   coding->dst_object = coding->src_object;
7908   coding->dst_pos = PT;
7909   coding->dst_pos_byte = PT_BYTE;
7910   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7911
7912   coding->head_ascii = -1;
7913   coding->detected_utf8_bytes = coding->detected_utf8_chars = -1;
7914   coding->eol_seen = EOL_SEEN_NONE;
7915   if (CODING_REQUIRE_DETECTION (coding))
7916     detect_coding (coding);
7917   attrs = CODING_ID_ATTRS (coding->id);
7918   if (! disable_ascii_optimization
7919       && ! coding->src_multibyte
7920       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7921       && NILP (CODING_ATTR_POST_READ (attrs))
7922       && NILP (get_translation_table (attrs, 0, NULL)))
7923     {
7924       chars = coding->head_ascii;
7925       if (chars < 0)
7926         chars = check_ascii (coding);
7927       if (chars != bytes)
7928         {
7929           /* There exists a non-ASCII byte.  */
7930           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8)
7931               && coding->detected_utf8_bytes == coding->src_bytes)
7932             {
7933               if (coding->detected_utf8_chars >= 0)
7934                 chars = coding->detected_utf8_chars;
7935               else
7936                 chars = check_utf_8 (coding);
7937               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7938                   && coding->head_ascii == 0
7939                   && coding->source[0] == UTF_8_BOM_1
7940                   && coding->source[1] == UTF_8_BOM_2
7941                   && coding->source[2] == UTF_8_BOM_3)
7942                 {
7943                   chars--;
7944                   bytes -= 3;
7945                   coding->src_bytes -= 3;
7946                 }
7947             }
7948           else
7949             chars = -1;
7950         }
7951       if (chars >= 0)
7952         {
7953           Lisp_Object eol_type;
7954
7955           eol_type = CODING_ID_EOL_TYPE (coding->id);
7956           if (VECTORP (eol_type))
7957             {
7958               if (coding->eol_seen != EOL_SEEN_NONE)
7959                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7960             }
7961           if (EQ (eol_type, Qmac))
7962             {
7963               unsigned char *src_end = GAP_END_ADDR;
7964               unsigned char *src = src_end - coding->src_bytes;
7965
7966               while (src < src_end)
7967                 {
7968                   if (*src++ == '\r')
7969                     src[-1] = '\n';
7970                 }
7971             }
7972           else if (EQ (eol_type, Qdos))
7973             {
7974               unsigned char *src = GAP_END_ADDR;
7975               unsigned char *src_beg = src - coding->src_bytes;
7976               unsigned char *dst = src;
7977               ptrdiff_t diff;
7978
7979               while (src_beg < src)
7980                 {
7981                   *--dst = *--src;
7982                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7983                     src--;
7984                 }
7985               diff = dst - src;
7986               bytes -= diff;
7987               chars -= diff;
7988             }
7989           coding->produced = bytes;
7990           coding->produced_char = chars;
7991           insert_from_gap (chars, bytes, 1);
7992           return;
7993         }
7994     }
7995   code_conversion_save (0, 0);
7996
7997   coding->mode |= CODING_MODE_LAST_BLOCK;
7998   current_buffer->text->inhibit_shrinking = 1;
7999   decode_coding (coding);
8000   current_buffer->text->inhibit_shrinking = 0;
8001
8002   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8003     {
8004       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8005       Lisp_Object val;
8006
8007       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8008       val = call1 (CODING_ATTR_POST_READ (attrs),
8009                    make_number (coding->produced_char));
8010       CHECK_NATNUM (val);
8011       coding->produced_char += Z - prev_Z;
8012       coding->produced += Z_BYTE - prev_Z_BYTE;
8013     }
8014
8015   unbind_to (count, Qnil);
8016 }
8017
8018
8019 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
8020    SRC_OBJECT into DST_OBJECT by coding context CODING.
8021
8022    SRC_OBJECT is a buffer, a string, or Qnil.
8023
8024    If it is a buffer, the text is at point of the buffer.  FROM and TO
8025    are positions in the buffer.
8026
8027    If it is a string, the text is at the beginning of the string.
8028    FROM and TO are indices to the string.
8029
8030    If it is nil, the text is at coding->source.  FROM and TO are
8031    indices to coding->source.
8032
8033    DST_OBJECT is a buffer, Qt, or Qnil.
8034
8035    If it is a buffer, the decoded text is inserted at point of the
8036    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8037    is deleted.
8038
8039    If it is Qt, a string is made from the decoded text, and
8040    set in CODING->dst_object.
8041
8042    If it is Qnil, the decoded text is stored at CODING->destination.
8043    The caller must allocate CODING->dst_bytes bytes at
8044    CODING->destination by xmalloc.  If the decoded text is longer than
8045    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8046  */
8047
8048 void
8049 decode_coding_object (struct coding_system *coding,
8050                       Lisp_Object src_object,
8051                       ptrdiff_t from, ptrdiff_t from_byte,
8052                       ptrdiff_t to, ptrdiff_t to_byte,
8053                       Lisp_Object dst_object)
8054 {
8055   ptrdiff_t count = SPECPDL_INDEX ();
8056   unsigned char *destination IF_LINT (= NULL);
8057   ptrdiff_t dst_bytes IF_LINT (= 0);
8058   ptrdiff_t chars = to - from;
8059   ptrdiff_t bytes = to_byte - from_byte;
8060   Lisp_Object attrs;
8061   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8062   bool need_marker_adjustment = 0;
8063   Lisp_Object old_deactivate_mark;
8064
8065   old_deactivate_mark = Vdeactivate_mark;
8066
8067   if (NILP (dst_object))
8068     {
8069       destination = coding->destination;
8070       dst_bytes = coding->dst_bytes;
8071     }
8072
8073   coding->src_object = src_object;
8074   coding->src_chars = chars;
8075   coding->src_bytes = bytes;
8076   coding->src_multibyte = chars < bytes;
8077
8078   if (STRINGP (src_object))
8079     {
8080       coding->src_pos = from;
8081       coding->src_pos_byte = from_byte;
8082     }
8083   else if (BUFFERP (src_object))
8084     {
8085       set_buffer_internal (XBUFFER (src_object));
8086       if (from != GPT)
8087         move_gap_both (from, from_byte);
8088       if (EQ (src_object, dst_object))
8089         {
8090           struct Lisp_Marker *tail;
8091
8092           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8093             {
8094               tail->need_adjustment
8095                 = tail->charpos == (tail->insertion_type ? from : to);
8096               need_marker_adjustment |= tail->need_adjustment;
8097             }
8098           saved_pt = PT, saved_pt_byte = PT_BYTE;
8099           TEMP_SET_PT_BOTH (from, from_byte);
8100           current_buffer->text->inhibit_shrinking = 1;
8101           del_range_both (from, from_byte, to, to_byte, 1);
8102           coding->src_pos = -chars;
8103           coding->src_pos_byte = -bytes;
8104         }
8105       else
8106         {
8107           coding->src_pos = from;
8108           coding->src_pos_byte = from_byte;
8109         }
8110     }
8111
8112   if (CODING_REQUIRE_DETECTION (coding))
8113     detect_coding (coding);
8114   attrs = CODING_ID_ATTRS (coding->id);
8115
8116   if (EQ (dst_object, Qt)
8117       || (! NILP (CODING_ATTR_POST_READ (attrs))
8118           && NILP (dst_object)))
8119     {
8120       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8121       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8122       coding->dst_pos = BEG;
8123       coding->dst_pos_byte = BEG_BYTE;
8124     }
8125   else if (BUFFERP (dst_object))
8126     {
8127       code_conversion_save (0, 0);
8128       coding->dst_object = dst_object;
8129       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8130       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8131       coding->dst_multibyte
8132         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8133     }
8134   else
8135     {
8136       code_conversion_save (0, 0);
8137       coding->dst_object = Qnil;
8138       /* Most callers presume this will return a multibyte result, and they
8139          won't use `binary' or `raw-text' anyway, so let's not worry about
8140          CODING_FOR_UNIBYTE.  */
8141       coding->dst_multibyte = 1;
8142     }
8143
8144   decode_coding (coding);
8145
8146   if (BUFFERP (coding->dst_object))
8147     set_buffer_internal (XBUFFER (coding->dst_object));
8148
8149   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8150     {
8151       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8152       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8153       Lisp_Object val;
8154
8155       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8156       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8157               old_deactivate_mark);
8158       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8159                         make_number (coding->produced_char));
8160       UNGCPRO;
8161       CHECK_NATNUM (val);
8162       coding->produced_char += Z - prev_Z;
8163       coding->produced += Z_BYTE - prev_Z_BYTE;
8164     }
8165
8166   if (EQ (dst_object, Qt))
8167     {
8168       coding->dst_object = Fbuffer_string ();
8169     }
8170   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8171     {
8172       set_buffer_internal (XBUFFER (coding->dst_object));
8173       if (dst_bytes < coding->produced)
8174         {
8175           eassert (coding->produced > 0);
8176           destination = xrealloc (destination, coding->produced);
8177           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8178             move_gap_both (BEGV, BEGV_BYTE);
8179           memcpy (destination, BEGV_ADDR, coding->produced);
8180           coding->destination = destination;
8181         }
8182     }
8183
8184   if (saved_pt >= 0)
8185     {
8186       /* This is the case of:
8187          (BUFFERP (src_object) && EQ (src_object, dst_object))
8188          As we have moved PT while replacing the original buffer
8189          contents, we must recover it now.  */
8190       set_buffer_internal (XBUFFER (src_object));
8191       current_buffer->text->inhibit_shrinking = 0;
8192       if (saved_pt < from)
8193         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8194       else if (saved_pt < from + chars)
8195         TEMP_SET_PT_BOTH (from, from_byte);
8196       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8197         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8198                           saved_pt_byte + (coding->produced - bytes));
8199       else
8200         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8201                           saved_pt_byte + (coding->produced - bytes));
8202
8203       if (need_marker_adjustment)
8204         {
8205           struct Lisp_Marker *tail;
8206
8207           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8208             if (tail->need_adjustment)
8209               {
8210                 tail->need_adjustment = 0;
8211                 if (tail->insertion_type)
8212                   {
8213                     tail->bytepos = from_byte;
8214                     tail->charpos = from;
8215                   }
8216                 else
8217                   {
8218                     tail->bytepos = from_byte + coding->produced;
8219                     tail->charpos
8220                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8221                          ? tail->bytepos : from + coding->produced_char);
8222                   }
8223               }
8224         }
8225     }
8226
8227   Vdeactivate_mark = old_deactivate_mark;
8228   unbind_to (count, coding->dst_object);
8229 }
8230
8231
8232 void
8233 encode_coding_object (struct coding_system *coding,
8234                       Lisp_Object src_object,
8235                       ptrdiff_t from, ptrdiff_t from_byte,
8236                       ptrdiff_t to, ptrdiff_t to_byte,
8237                       Lisp_Object dst_object)
8238 {
8239   ptrdiff_t count = SPECPDL_INDEX ();
8240   ptrdiff_t chars = to - from;
8241   ptrdiff_t bytes = to_byte - from_byte;
8242   Lisp_Object attrs;
8243   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8244   bool need_marker_adjustment = 0;
8245   bool kill_src_buffer = 0;
8246   Lisp_Object old_deactivate_mark;
8247
8248   old_deactivate_mark = Vdeactivate_mark;
8249
8250   coding->src_object = src_object;
8251   coding->src_chars = chars;
8252   coding->src_bytes = bytes;
8253   coding->src_multibyte = chars < bytes;
8254
8255   attrs = CODING_ID_ATTRS (coding->id);
8256
8257   if (EQ (src_object, dst_object))
8258     {
8259       struct Lisp_Marker *tail;
8260
8261       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8262         {
8263           tail->need_adjustment
8264             = tail->charpos == (tail->insertion_type ? from : to);
8265           need_marker_adjustment |= tail->need_adjustment;
8266         }
8267     }
8268
8269   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8270     {
8271       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8272       set_buffer_internal (XBUFFER (coding->src_object));
8273       if (STRINGP (src_object))
8274         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8275       else if (BUFFERP (src_object))
8276         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8277       else
8278         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8279
8280       if (EQ (src_object, dst_object))
8281         {
8282           set_buffer_internal (XBUFFER (src_object));
8283           saved_pt = PT, saved_pt_byte = PT_BYTE;
8284           del_range_both (from, from_byte, to, to_byte, 1);
8285           set_buffer_internal (XBUFFER (coding->src_object));
8286         }
8287
8288       {
8289         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8290
8291         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8292                 old_deactivate_mark);
8293         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8294                     make_number (BEG), make_number (Z));
8295         UNGCPRO;
8296       }
8297       if (XBUFFER (coding->src_object) != current_buffer)
8298         kill_src_buffer = 1;
8299       coding->src_object = Fcurrent_buffer ();
8300       if (BEG != GPT)
8301         move_gap_both (BEG, BEG_BYTE);
8302       coding->src_chars = Z - BEG;
8303       coding->src_bytes = Z_BYTE - BEG_BYTE;
8304       coding->src_pos = BEG;
8305       coding->src_pos_byte = BEG_BYTE;
8306       coding->src_multibyte = Z < Z_BYTE;
8307     }
8308   else if (STRINGP (src_object))
8309     {
8310       code_conversion_save (0, 0);
8311       coding->src_pos = from;
8312       coding->src_pos_byte = from_byte;
8313     }
8314   else if (BUFFERP (src_object))
8315     {
8316       code_conversion_save (0, 0);
8317       set_buffer_internal (XBUFFER (src_object));
8318       if (EQ (src_object, dst_object))
8319         {
8320           saved_pt = PT, saved_pt_byte = PT_BYTE;
8321           coding->src_object = del_range_1 (from, to, 1, 1);
8322           coding->src_pos = 0;
8323           coding->src_pos_byte = 0;
8324         }
8325       else
8326         {
8327           if (from < GPT && to >= GPT)
8328             move_gap_both (from, from_byte);
8329           coding->src_pos = from;
8330           coding->src_pos_byte = from_byte;
8331         }
8332     }
8333   else
8334     code_conversion_save (0, 0);
8335
8336   if (BUFFERP (dst_object))
8337     {
8338       coding->dst_object = dst_object;
8339       if (EQ (src_object, dst_object))
8340         {
8341           coding->dst_pos = from;
8342           coding->dst_pos_byte = from_byte;
8343         }
8344       else
8345         {
8346           struct buffer *current = current_buffer;
8347
8348           set_buffer_temp (XBUFFER (dst_object));
8349           coding->dst_pos = PT;
8350           coding->dst_pos_byte = PT_BYTE;
8351           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8352           set_buffer_temp (current);
8353         }
8354       coding->dst_multibyte
8355         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8356     }
8357   else if (EQ (dst_object, Qt))
8358     {
8359       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8360       coding->dst_object = Qnil;
8361       coding->destination = xmalloc (dst_bytes);
8362       coding->dst_bytes = dst_bytes;
8363       coding->dst_multibyte = 0;
8364     }
8365   else
8366     {
8367       coding->dst_object = Qnil;
8368       coding->dst_multibyte = 0;
8369     }
8370
8371   encode_coding (coding);
8372
8373   if (EQ (dst_object, Qt))
8374     {
8375       if (BUFFERP (coding->dst_object))
8376         coding->dst_object = Fbuffer_string ();
8377       else if (coding->raw_destination)
8378         /* This is used to avoid creating huge Lisp string.
8379            NOTE: caller who sets `raw_destination' is also
8380            responsible for freeing `destination' buffer.  */
8381         coding->dst_object = Qnil;
8382       else
8383         {
8384           coding->dst_object
8385             = make_unibyte_string ((char *) coding->destination,
8386                                    coding->produced);
8387           xfree (coding->destination);
8388         }
8389     }
8390
8391   if (saved_pt >= 0)
8392     {
8393       /* This is the case of:
8394          (BUFFERP (src_object) && EQ (src_object, dst_object))
8395          As we have moved PT while replacing the original buffer
8396          contents, we must recover it now.  */
8397       set_buffer_internal (XBUFFER (src_object));
8398       if (saved_pt < from)
8399         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8400       else if (saved_pt < from + chars)
8401         TEMP_SET_PT_BOTH (from, from_byte);
8402       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8403         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8404                           saved_pt_byte + (coding->produced - bytes));
8405       else
8406         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8407                           saved_pt_byte + (coding->produced - bytes));
8408
8409       if (need_marker_adjustment)
8410         {
8411           struct Lisp_Marker *tail;
8412
8413           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8414             if (tail->need_adjustment)
8415               {
8416                 tail->need_adjustment = 0;
8417                 if (tail->insertion_type)
8418                   {
8419                     tail->bytepos = from_byte;
8420                     tail->charpos = from;
8421                   }
8422                 else
8423                   {
8424                     tail->bytepos = from_byte + coding->produced;
8425                     tail->charpos
8426                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8427                          ? tail->bytepos : from + coding->produced_char);
8428                   }
8429               }
8430         }
8431     }
8432
8433   if (kill_src_buffer)
8434     Fkill_buffer (coding->src_object);
8435
8436   Vdeactivate_mark = old_deactivate_mark;
8437   unbind_to (count, Qnil);
8438 }
8439
8440
8441 Lisp_Object
8442 preferred_coding_system (void)
8443 {
8444   int id = coding_categories[coding_priorities[0]].id;
8445
8446   return CODING_ID_NAME (id);
8447 }
8448
8449 #if defined (WINDOWSNT) || defined (CYGWIN)
8450
8451 Lisp_Object
8452 from_unicode (Lisp_Object str)
8453 {
8454   CHECK_STRING (str);
8455   if (!STRING_MULTIBYTE (str) &&
8456       SBYTES (str) & 1)
8457     {
8458       str = Fsubstring (str, make_number (0), make_number (-1));
8459     }
8460
8461   return code_convert_string_norecord (str, Qutf_16le, 0);
8462 }
8463
8464 Lisp_Object
8465 from_unicode_buffer (const wchar_t* wstr)
8466 {
8467     return from_unicode (
8468         make_unibyte_string (
8469             (char*) wstr,
8470             /* we get one of the two final 0 bytes for free. */
8471             1 + sizeof (wchar_t) * wcslen (wstr)));
8472 }
8473
8474 wchar_t *
8475 to_unicode (Lisp_Object str, Lisp_Object *buf)
8476 {
8477   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8478   /* We need to make another copy (in addition to the one made by
8479      code_convert_string_norecord) to ensure that the final string is
8480      _doubly_ zero terminated --- that is, that the string is
8481      terminated by two zero bytes and one utf-16le null character.
8482      Because strings are already terminated with a single zero byte,
8483      we just add one additional zero. */
8484   str = make_uninit_string (SBYTES (*buf) + 1);
8485   memcpy (SDATA (str), SDATA (*buf), SBYTES (*buf));
8486   SDATA (str) [SBYTES (*buf)] = '\0';
8487   *buf = str;
8488   return WCSDATA (*buf);
8489 }
8490
8491 #endif /* WINDOWSNT || CYGWIN */
8492
8493 \f
8494 #ifdef emacs
8495 /*** 8. Emacs Lisp library functions ***/
8496
8497 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8498        doc: /* Return t if OBJECT is nil or a coding-system.
8499 See the documentation of `define-coding-system' for information
8500 about coding-system objects.  */)
8501   (Lisp_Object object)
8502 {
8503   if (NILP (object)
8504       || CODING_SYSTEM_ID (object) >= 0)
8505     return Qt;
8506   if (! SYMBOLP (object)
8507       || NILP (Fget (object, Qcoding_system_define_form)))
8508     return Qnil;
8509   return Qt;
8510 }
8511
8512 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8513        Sread_non_nil_coding_system, 1, 1, 0,
8514        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8515   (Lisp_Object prompt)
8516 {
8517   Lisp_Object val;
8518   do
8519     {
8520       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8521                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8522     }
8523   while (SCHARS (val) == 0);
8524   return (Fintern (val, Qnil));
8525 }
8526
8527 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8528        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8529 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8530 Ignores case when completing coding systems (all Emacs coding systems
8531 are lower-case).  */)
8532   (Lisp_Object prompt, Lisp_Object default_coding_system)
8533 {
8534   Lisp_Object val;
8535   ptrdiff_t count = SPECPDL_INDEX ();
8536
8537   if (SYMBOLP (default_coding_system))
8538     default_coding_system = SYMBOL_NAME (default_coding_system);
8539   specbind (Qcompletion_ignore_case, Qt);
8540   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8541                           Qt, Qnil, Qcoding_system_history,
8542                           default_coding_system, Qnil);
8543   unbind_to (count, Qnil);
8544   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8545 }
8546
8547 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8548        1, 1, 0,
8549        doc: /* Check validity of CODING-SYSTEM.
8550 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8551 It is valid if it is nil or a symbol defined as a coding system by the
8552 function `define-coding-system'.  */)
8553   (Lisp_Object coding_system)
8554 {
8555   Lisp_Object define_form;
8556
8557   define_form = Fget (coding_system, Qcoding_system_define_form);
8558   if (! NILP (define_form))
8559     {
8560       Fput (coding_system, Qcoding_system_define_form, Qnil);
8561       safe_eval (define_form);
8562     }
8563   if (!NILP (Fcoding_system_p (coding_system)))
8564     return coding_system;
8565   xsignal1 (Qcoding_system_error, coding_system);
8566 }
8567
8568 \f
8569 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8570    HIGHEST, return the coding system of the highest
8571    priority among the detected coding systems.  Otherwise return a
8572    list of detected coding systems sorted by their priorities.  If
8573    MULTIBYTEP, it is assumed that the bytes are in correct
8574    multibyte form but contains only ASCII and eight-bit chars.
8575    Otherwise, the bytes are raw bytes.
8576
8577    CODING-SYSTEM controls the detection as below:
8578
8579    If it is nil, detect both text-format and eol-format.  If the
8580    text-format part of CODING-SYSTEM is already specified
8581    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8582    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8583    detect only text-format.  */
8584
8585 Lisp_Object
8586 detect_coding_system (const unsigned char *src,
8587                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8588                       bool highest, bool multibytep,
8589                       Lisp_Object coding_system)
8590 {
8591   const unsigned char *src_end = src + src_bytes;
8592   Lisp_Object attrs, eol_type;
8593   Lisp_Object val = Qnil;
8594   struct coding_system coding;
8595   ptrdiff_t id;
8596   struct coding_detection_info detect_info;
8597   enum coding_category base_category;
8598   bool null_byte_found = 0, eight_bit_found = 0;
8599
8600   if (NILP (coding_system))
8601     coding_system = Qundecided;
8602   setup_coding_system (coding_system, &coding);
8603   attrs = CODING_ID_ATTRS (coding.id);
8604   eol_type = CODING_ID_EOL_TYPE (coding.id);
8605   coding_system = CODING_ATTR_BASE_NAME (attrs);
8606
8607   coding.source = src;
8608   coding.src_chars = src_chars;
8609   coding.src_bytes = src_bytes;
8610   coding.src_multibyte = multibytep;
8611   coding.consumed = 0;
8612   coding.mode |= CODING_MODE_LAST_BLOCK;
8613   coding.head_ascii = 0;
8614
8615   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8616
8617   /* At first, detect text-format if necessary.  */
8618   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8619   if (base_category == coding_category_undecided)
8620     {
8621       enum coding_category category IF_LINT (= 0);
8622       struct coding_system *this IF_LINT (= NULL);
8623       int c, i;
8624       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8625                                        inhibit_null_byte_detection);
8626       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8627                                        inhibit_iso_escape_detection);
8628       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8629
8630       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8631       for (; src < src_end; src++)
8632         {
8633           c = *src;
8634           if (c & 0x80)
8635             {
8636               eight_bit_found = 1;
8637               if (null_byte_found)
8638                 break;
8639             }
8640           else if (c < 0x20)
8641             {
8642               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8643                   && ! inhibit_ied
8644                   && ! detect_info.checked)
8645                 {
8646                   if (detect_coding_iso_2022 (&coding, &detect_info))
8647                     {
8648                       /* We have scanned the whole data.  */
8649                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8650                         {
8651                           /* We didn't find an 8-bit code.  We may
8652                              have found a null-byte, but it's very
8653                              rare that a binary file confirm to
8654                              ISO-2022.  */
8655                           src = src_end;
8656                           coding.head_ascii = src - coding.source;
8657                         }
8658                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8659                       break;
8660                     }
8661                 }
8662               else if (! c && !inhibit_nbd)
8663                 {
8664                   null_byte_found = 1;
8665                   if (eight_bit_found)
8666                     break;
8667                 }
8668               if (! eight_bit_found)
8669                 coding.head_ascii++;
8670             }
8671           else if (! eight_bit_found)
8672             coding.head_ascii++;
8673         }
8674
8675       if (null_byte_found || eight_bit_found
8676           || coding.head_ascii < coding.src_bytes
8677           || detect_info.found)
8678         {
8679           if (coding.head_ascii == coding.src_bytes)
8680             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8681             for (i = 0; i < coding_category_raw_text; i++)
8682               {
8683                 category = coding_priorities[i];
8684                 this = coding_categories + category;
8685                 if (detect_info.found & (1 << category))
8686                   break;
8687               }
8688           else
8689             {
8690               if (null_byte_found)
8691                 {
8692                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8693                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8694                 }
8695               else if (prefer_utf_8
8696                        && detect_coding_utf_8 (&coding, &detect_info))
8697                 {
8698                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8699                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8700                 }
8701               for (i = 0; i < coding_category_raw_text; i++)
8702                 {
8703                   category = coding_priorities[i];
8704                   this = coding_categories + category;
8705
8706                   if (this->id < 0)
8707                     {
8708                       /* No coding system of this category is defined.  */
8709                       detect_info.rejected |= (1 << category);
8710                     }
8711                   else if (category >= coding_category_raw_text)
8712                     continue;
8713                   else if (detect_info.checked & (1 << category))
8714                     {
8715                       if (highest
8716                           && (detect_info.found & (1 << category)))
8717                         break;
8718                     }
8719                   else if ((*(this->detector)) (&coding, &detect_info)
8720                            && highest
8721                            && (detect_info.found & (1 << category)))
8722                     {
8723                       if (category == coding_category_utf_16_auto)
8724                         {
8725                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8726                             category = coding_category_utf_16_le;
8727                           else
8728                             category = coding_category_utf_16_be;
8729                         }
8730                       break;
8731                     }
8732                 }
8733             }
8734         }
8735
8736       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8737           || null_byte_found)
8738         {
8739           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8740           id = CODING_SYSTEM_ID (Qno_conversion);
8741           val = list1 (make_number (id));
8742         }
8743       else if (! detect_info.rejected && ! detect_info.found)
8744         {
8745           detect_info.found = CATEGORY_MASK_ANY;
8746           id = coding_categories[coding_category_undecided].id;
8747           val = list1 (make_number (id));
8748         }
8749       else if (highest)
8750         {
8751           if (detect_info.found)
8752             {
8753               detect_info.found = 1 << category;
8754               val = list1 (make_number (this->id));
8755             }
8756           else
8757             for (i = 0; i < coding_category_raw_text; i++)
8758               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8759                 {
8760                   detect_info.found = 1 << coding_priorities[i];
8761                   id = coding_categories[coding_priorities[i]].id;
8762                   val = list1 (make_number (id));
8763                   break;
8764                 }
8765         }
8766       else
8767         {
8768           int mask = detect_info.rejected | detect_info.found;
8769           int found = 0;
8770
8771           for (i = coding_category_raw_text - 1; i >= 0; i--)
8772             {
8773               category = coding_priorities[i];
8774               if (! (mask & (1 << category)))
8775                 {
8776                   found |= 1 << category;
8777                   id = coding_categories[category].id;
8778                   if (id >= 0)
8779                     val = list1 (make_number (id));
8780                 }
8781             }
8782           for (i = coding_category_raw_text - 1; i >= 0; i--)
8783             {
8784               category = coding_priorities[i];
8785               if (detect_info.found & (1 << category))
8786                 {
8787                   id = coding_categories[category].id;
8788                   val = Fcons (make_number (id), val);
8789                 }
8790             }
8791           detect_info.found |= found;
8792         }
8793     }
8794   else if (base_category == coding_category_utf_8_auto)
8795     {
8796       if (detect_coding_utf_8 (&coding, &detect_info))
8797         {
8798           struct coding_system *this;
8799
8800           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8801             this = coding_categories + coding_category_utf_8_sig;
8802           else
8803             this = coding_categories + coding_category_utf_8_nosig;
8804           val = list1 (make_number (this->id));
8805         }
8806     }
8807   else if (base_category == coding_category_utf_16_auto)
8808     {
8809       if (detect_coding_utf_16 (&coding, &detect_info))
8810         {
8811           struct coding_system *this;
8812
8813           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8814             this = coding_categories + coding_category_utf_16_le;
8815           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8816             this = coding_categories + coding_category_utf_16_be;
8817           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8818             this = coding_categories + coding_category_utf_16_be_nosig;
8819           else
8820             this = coding_categories + coding_category_utf_16_le_nosig;
8821           val = list1 (make_number (this->id));
8822         }
8823     }
8824   else
8825     {
8826       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8827       val = list1 (make_number (coding.id));
8828     }
8829
8830   /* Then, detect eol-format if necessary.  */
8831   {
8832     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8833     Lisp_Object tail;
8834
8835     if (VECTORP (eol_type))
8836       {
8837         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8838           {
8839             if (null_byte_found)
8840               normal_eol = EOL_SEEN_LF;
8841             else
8842               normal_eol = detect_eol (coding.source, src_bytes,
8843                                        coding_category_raw_text);
8844           }
8845         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8846                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8847           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8848                                       coding_category_utf_16_be);
8849         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8850                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8851           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8852                                       coding_category_utf_16_le);
8853       }
8854     else
8855       {
8856         if (EQ (eol_type, Qunix))
8857           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8858         else if (EQ (eol_type, Qdos))
8859           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8860         else
8861           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8862       }
8863
8864     for (tail = val; CONSP (tail); tail = XCDR (tail))
8865       {
8866         enum coding_category category;
8867         int this_eol;
8868
8869         id = XINT (XCAR (tail));
8870         attrs = CODING_ID_ATTRS (id);
8871         category = XINT (CODING_ATTR_CATEGORY (attrs));
8872         eol_type = CODING_ID_EOL_TYPE (id);
8873         if (VECTORP (eol_type))
8874           {
8875             if (category == coding_category_utf_16_be
8876                 || category == coding_category_utf_16_be_nosig)
8877               this_eol = utf_16_be_eol;
8878             else if (category == coding_category_utf_16_le
8879                      || category == coding_category_utf_16_le_nosig)
8880               this_eol = utf_16_le_eol;
8881             else
8882               this_eol = normal_eol;
8883
8884             if (this_eol == EOL_SEEN_LF)
8885               XSETCAR (tail, AREF (eol_type, 0));
8886             else if (this_eol == EOL_SEEN_CRLF)
8887               XSETCAR (tail, AREF (eol_type, 1));
8888             else if (this_eol == EOL_SEEN_CR)
8889               XSETCAR (tail, AREF (eol_type, 2));
8890             else
8891               XSETCAR (tail, CODING_ID_NAME (id));
8892           }
8893         else
8894           XSETCAR (tail, CODING_ID_NAME (id));
8895       }
8896   }
8897
8898   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8899 }
8900
8901
8902 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8903        2, 3, 0,
8904        doc: /* Detect coding system of the text in the region between START and END.
8905 Return a list of possible coding systems ordered by priority.
8906 The coding systems to try and their priorities follows what
8907 the function `coding-system-priority-list' (which see) returns.
8908
8909 If only ASCII characters are found (except for such ISO-2022 control
8910 characters as ESC), it returns a list of single element `undecided'
8911 or its subsidiary coding system according to a detected end-of-line
8912 format.
8913
8914 If optional argument HIGHEST is non-nil, return the coding system of
8915 highest priority.  */)
8916   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8917 {
8918   ptrdiff_t from, to;
8919   ptrdiff_t from_byte, to_byte;
8920
8921   validate_region (&start, &end);
8922   from = XINT (start), to = XINT (end);
8923   from_byte = CHAR_TO_BYTE (from);
8924   to_byte = CHAR_TO_BYTE (to);
8925
8926   if (from < GPT && to >= GPT)
8927     move_gap_both (to, to_byte);
8928
8929   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8930                                to - from, to_byte - from_byte,
8931                                !NILP (highest),
8932                                !NILP (BVAR (current_buffer
8933                                       , enable_multibyte_characters)),
8934                                Qnil);
8935 }
8936
8937 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8938        1, 2, 0,
8939        doc: /* Detect coding system of the text in STRING.
8940 Return a list of possible coding systems ordered by priority.
8941 The coding systems to try and their priorities follows what
8942 the function `coding-system-priority-list' (which see) returns.
8943
8944 If only ASCII characters are found (except for such ISO-2022 control
8945 characters as ESC), it returns a list of single element `undecided'
8946 or its subsidiary coding system according to a detected end-of-line
8947 format.
8948
8949 If optional argument HIGHEST is non-nil, return the coding system of
8950 highest priority.  */)
8951   (Lisp_Object string, Lisp_Object highest)
8952 {
8953   CHECK_STRING (string);
8954
8955   return detect_coding_system (SDATA (string),
8956                                SCHARS (string), SBYTES (string),
8957                                !NILP (highest), STRING_MULTIBYTE (string),
8958                                Qnil);
8959 }
8960
8961
8962 static bool
8963 char_encodable_p (int c, Lisp_Object attrs)
8964 {
8965   Lisp_Object tail;
8966   struct charset *charset;
8967   Lisp_Object translation_table;
8968
8969   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8970   if (! NILP (translation_table))
8971     c = translate_char (translation_table, c);
8972   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8973        CONSP (tail); tail = XCDR (tail))
8974     {
8975       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8976       if (CHAR_CHARSET_P (c, charset))
8977         break;
8978     }
8979   return (! NILP (tail));
8980 }
8981
8982
8983 /* Return a list of coding systems that safely encode the text between
8984    START and END.  If EXCLUDE is non-nil, it is a list of coding
8985    systems not to check.  The returned list doesn't contain any such
8986    coding systems.  In any case, if the text contains only ASCII or is
8987    unibyte, return t.  */
8988
8989 DEFUN ("find-coding-systems-region-internal",
8990        Ffind_coding_systems_region_internal,
8991        Sfind_coding_systems_region_internal, 2, 3, 0,
8992        doc: /* Internal use only.  */)
8993   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8994 {
8995   Lisp_Object coding_attrs_list, safe_codings;
8996   ptrdiff_t start_byte, end_byte;
8997   const unsigned char *p, *pbeg, *pend;
8998   int c;
8999   Lisp_Object tail, elt, work_table;
9000
9001   if (STRINGP (start))
9002     {
9003       if (!STRING_MULTIBYTE (start)
9004           || SCHARS (start) == SBYTES (start))
9005         return Qt;
9006       start_byte = 0;
9007       end_byte = SBYTES (start);
9008     }
9009   else
9010     {
9011       CHECK_NUMBER_COERCE_MARKER (start);
9012       CHECK_NUMBER_COERCE_MARKER (end);
9013       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9014         args_out_of_range (start, end);
9015       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9016         return Qt;
9017       start_byte = CHAR_TO_BYTE (XINT (start));
9018       end_byte = CHAR_TO_BYTE (XINT (end));
9019       if (XINT (end) - XINT (start) == end_byte - start_byte)
9020         return Qt;
9021
9022       if (XINT (start) < GPT && XINT (end) > GPT)
9023         {
9024           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9025             move_gap_both (XINT (start), start_byte);
9026           else
9027             move_gap_both (XINT (end), end_byte);
9028         }
9029     }
9030
9031   coding_attrs_list = Qnil;
9032   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9033     if (NILP (exclude)
9034         || NILP (Fmemq (XCAR (tail), exclude)))
9035       {
9036         Lisp_Object attrs;
9037
9038         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9039         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9040           {
9041             ASET (attrs, coding_attr_trans_tbl,
9042                   get_translation_table (attrs, 1, NULL));
9043             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9044           }
9045       }
9046
9047   if (STRINGP (start))
9048     p = pbeg = SDATA (start);
9049   else
9050     p = pbeg = BYTE_POS_ADDR (start_byte);
9051   pend = p + (end_byte - start_byte);
9052
9053   while (p < pend && ASCII_BYTE_P (*p)) p++;
9054   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9055
9056   work_table = Fmake_char_table (Qnil, Qnil);
9057   while (p < pend)
9058     {
9059       if (ASCII_BYTE_P (*p))
9060         p++;
9061       else
9062         {
9063           c = STRING_CHAR_ADVANCE (p);
9064           if (!NILP (char_table_ref (work_table, c)))
9065             /* This character was already checked.  Ignore it.  */
9066             continue;
9067
9068           charset_map_loaded = 0;
9069           for (tail = coding_attrs_list; CONSP (tail);)
9070             {
9071               elt = XCAR (tail);
9072               if (NILP (elt))
9073                 tail = XCDR (tail);
9074               else if (char_encodable_p (c, elt))
9075                 tail = XCDR (tail);
9076               else if (CONSP (XCDR (tail)))
9077                 {
9078                   XSETCAR (tail, XCAR (XCDR (tail)));
9079                   XSETCDR (tail, XCDR (XCDR (tail)));
9080                 }
9081               else
9082                 {
9083                   XSETCAR (tail, Qnil);
9084                   tail = XCDR (tail);
9085                 }
9086             }
9087           if (charset_map_loaded)
9088             {
9089               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9090
9091               if (STRINGP (start))
9092                 pbeg = SDATA (start);
9093               else
9094                 pbeg = BYTE_POS_ADDR (start_byte);
9095               p = pbeg + p_offset;
9096               pend = pbeg + pend_offset;
9097             }
9098           char_table_set (work_table, c, Qt);
9099         }
9100     }
9101
9102   safe_codings = list2 (Qraw_text, Qno_conversion);
9103   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9104     if (! NILP (XCAR (tail)))
9105       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9106
9107   return safe_codings;
9108 }
9109
9110
9111 DEFUN ("unencodable-char-position", Funencodable_char_position,
9112        Sunencodable_char_position, 3, 5, 0,
9113        doc: /*
9114 Return position of first un-encodable character in a region.
9115 START and END specify the region and CODING-SYSTEM specifies the
9116 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9117
9118 If optional 4th argument COUNT is non-nil, it specifies at most how
9119 many un-encodable characters to search.  In this case, the value is a
9120 list of positions.
9121
9122 If optional 5th argument STRING is non-nil, it is a string to search
9123 for un-encodable characters.  In that case, START and END are indexes
9124 to the string.  */)
9125   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9126 {
9127   EMACS_INT n;
9128   struct coding_system coding;
9129   Lisp_Object attrs, charset_list, translation_table;
9130   Lisp_Object positions;
9131   ptrdiff_t from, to;
9132   const unsigned char *p, *stop, *pend;
9133   bool ascii_compatible;
9134
9135   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9136   attrs = CODING_ID_ATTRS (coding.id);
9137   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9138     return Qnil;
9139   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9140   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9141   translation_table = get_translation_table (attrs, 1, NULL);
9142
9143   if (NILP (string))
9144     {
9145       validate_region (&start, &end);
9146       from = XINT (start);
9147       to = XINT (end);
9148       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9149           || (ascii_compatible
9150               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9151         return Qnil;
9152       p = CHAR_POS_ADDR (from);
9153       pend = CHAR_POS_ADDR (to);
9154       if (from < GPT && to >= GPT)
9155         stop = GPT_ADDR;
9156       else
9157         stop = pend;
9158     }
9159   else
9160     {
9161       CHECK_STRING (string);
9162       CHECK_NATNUM (start);
9163       CHECK_NATNUM (end);
9164       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9165         args_out_of_range_3 (string, start, end);
9166       from = XINT (start);
9167       to = XINT (end);
9168       if (! STRING_MULTIBYTE (string))
9169         return Qnil;
9170       p = SDATA (string) + string_char_to_byte (string, from);
9171       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9172       if (ascii_compatible && (to - from) == (pend - p))
9173         return Qnil;
9174     }
9175
9176   if (NILP (count))
9177     n = 1;
9178   else
9179     {
9180       CHECK_NATNUM (count);
9181       n = XINT (count);
9182     }
9183
9184   positions = Qnil;
9185   charset_map_loaded = 0;
9186   while (1)
9187     {
9188       int c;
9189
9190       if (ascii_compatible)
9191         while (p < stop && ASCII_BYTE_P (*p))
9192           p++, from++;
9193       if (p >= stop)
9194         {
9195           if (p >= pend)
9196             break;
9197           stop = pend;
9198           p = GAP_END_ADDR;
9199         }
9200
9201       c = STRING_CHAR_ADVANCE (p);
9202       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9203           && ! char_charset (translate_char (translation_table, c),
9204                              charset_list, NULL))
9205         {
9206           positions = Fcons (make_number (from), positions);
9207           n--;
9208           if (n == 0)
9209             break;
9210         }
9211
9212       from++;
9213       if (charset_map_loaded && NILP (string))
9214         {
9215           p = CHAR_POS_ADDR (from);
9216           pend = CHAR_POS_ADDR (to);
9217           if (from < GPT && to >= GPT)
9218             stop = GPT_ADDR;
9219           else
9220             stop = pend;
9221           charset_map_loaded = 0;
9222         }
9223     }
9224
9225   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9226 }
9227
9228
9229 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9230        Scheck_coding_systems_region, 3, 3, 0,
9231        doc: /* Check if the region is encodable by coding systems.
9232
9233 START and END are buffer positions specifying the region.
9234 CODING-SYSTEM-LIST is a list of coding systems to check.
9235
9236 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9237 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9238 whole region, POS0, POS1, ... are buffer positions where non-encodable
9239 characters are found.
9240
9241 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9242 value is nil.
9243
9244 START may be a string.  In that case, check if the string is
9245 encodable, and the value contains indices to the string instead of
9246 buffer positions.  END is ignored.
9247
9248 If the current buffer (or START if it is a string) is unibyte, the value
9249 is nil.  */)
9250   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9251 {
9252   Lisp_Object list;
9253   ptrdiff_t start_byte, end_byte;
9254   ptrdiff_t pos;
9255   const unsigned char *p, *pbeg, *pend;
9256   int c;
9257   Lisp_Object tail, elt, attrs;
9258
9259   if (STRINGP (start))
9260     {
9261       if (!STRING_MULTIBYTE (start)
9262           || SCHARS (start) == SBYTES (start))
9263         return Qnil;
9264       start_byte = 0;
9265       end_byte = SBYTES (start);
9266       pos = 0;
9267     }
9268   else
9269     {
9270       CHECK_NUMBER_COERCE_MARKER (start);
9271       CHECK_NUMBER_COERCE_MARKER (end);
9272       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9273         args_out_of_range (start, end);
9274       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9275         return Qnil;
9276       start_byte = CHAR_TO_BYTE (XINT (start));
9277       end_byte = CHAR_TO_BYTE (XINT (end));
9278       if (XINT (end) - XINT (start) == end_byte - start_byte)
9279         return Qnil;
9280
9281       if (XINT (start) < GPT && XINT (end) > GPT)
9282         {
9283           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9284             move_gap_both (XINT (start), start_byte);
9285           else
9286             move_gap_both (XINT (end), end_byte);
9287         }
9288       pos = XINT (start);
9289     }
9290
9291   list = Qnil;
9292   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9293     {
9294       elt = XCAR (tail);
9295       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9296       ASET (attrs, coding_attr_trans_tbl,
9297             get_translation_table (attrs, 1, NULL));
9298       list = Fcons (list2 (elt, attrs), list);
9299     }
9300
9301   if (STRINGP (start))
9302     p = pbeg = SDATA (start);
9303   else
9304     p = pbeg = BYTE_POS_ADDR (start_byte);
9305   pend = p + (end_byte - start_byte);
9306
9307   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9308   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9309
9310   while (p < pend)
9311     {
9312       if (ASCII_BYTE_P (*p))
9313         p++;
9314       else
9315         {
9316           c = STRING_CHAR_ADVANCE (p);
9317
9318           charset_map_loaded = 0;
9319           for (tail = list; CONSP (tail); tail = XCDR (tail))
9320             {
9321               elt = XCDR (XCAR (tail));
9322               if (! char_encodable_p (c, XCAR (elt)))
9323                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9324             }
9325           if (charset_map_loaded)
9326             {
9327               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9328
9329               if (STRINGP (start))
9330                 pbeg = SDATA (start);
9331               else
9332                 pbeg = BYTE_POS_ADDR (start_byte);
9333               p = pbeg + p_offset;
9334               pend = pbeg + pend_offset;
9335             }
9336         }
9337       pos++;
9338     }
9339
9340   tail = list;
9341   list = Qnil;
9342   for (; CONSP (tail); tail = XCDR (tail))
9343     {
9344       elt = XCAR (tail);
9345       if (CONSP (XCDR (XCDR (elt))))
9346         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9347                       list);
9348     }
9349
9350   return list;
9351 }
9352
9353
9354 static Lisp_Object
9355 code_convert_region (Lisp_Object start, Lisp_Object end,
9356                      Lisp_Object coding_system, Lisp_Object dst_object,
9357                      bool encodep, bool norecord)
9358 {
9359   struct coding_system coding;
9360   ptrdiff_t from, from_byte, to, to_byte;
9361   Lisp_Object src_object;
9362
9363   if (NILP (coding_system))
9364     coding_system = Qno_conversion;
9365   else
9366     CHECK_CODING_SYSTEM (coding_system);
9367   src_object = Fcurrent_buffer ();
9368   if (NILP (dst_object))
9369     dst_object = src_object;
9370   else if (! EQ (dst_object, Qt))
9371     CHECK_BUFFER (dst_object);
9372
9373   validate_region (&start, &end);
9374   from = XFASTINT (start);
9375   from_byte = CHAR_TO_BYTE (from);
9376   to = XFASTINT (end);
9377   to_byte = CHAR_TO_BYTE (to);
9378
9379   setup_coding_system (coding_system, &coding);
9380   coding.mode |= CODING_MODE_LAST_BLOCK;
9381
9382   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9383     {
9384       struct buffer *buf = XBUFFER (dst_object);
9385       ptrdiff_t buf_pt = BUF_PT (buf);
9386
9387       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9388     }
9389
9390   if (encodep)
9391     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9392                           dst_object);
9393   else
9394     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9395                           dst_object);
9396   if (! norecord)
9397     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9398
9399   return (BUFFERP (dst_object)
9400           ? make_number (coding.produced_char)
9401           : coding.dst_object);
9402 }
9403
9404
9405 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9406        3, 4, "r\nzCoding system: ",
9407        doc: /* Decode the current region from the specified coding system.
9408 When called from a program, takes four arguments:
9409         START, END, CODING-SYSTEM, and DESTINATION.
9410 START and END are buffer positions.
9411
9412 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9413 If nil, the region between START and END is replaced by the decoded text.
9414 If buffer, the decoded text is inserted in that buffer after point (point
9415 does not move).
9416 In those cases, the length of the decoded text is returned.
9417 If DESTINATION is t, the decoded text is returned.
9418
9419 This function sets `last-coding-system-used' to the precise coding system
9420 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9421 not fully specified.)  */)
9422   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9423 {
9424   return code_convert_region (start, end, coding_system, destination, 0, 0);
9425 }
9426
9427 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9428        3, 4, "r\nzCoding system: ",
9429        doc: /* Encode the current region by specified coding system.
9430 When called from a program, takes four arguments:
9431         START, END, CODING-SYSTEM and DESTINATION.
9432 START and END are buffer positions.
9433
9434 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9435 If nil, the region between START and END is replace by the encoded text.
9436 If buffer, the encoded text is inserted in that buffer after point (point
9437 does not move).
9438 In those cases, the length of the encoded text is returned.
9439 If DESTINATION is t, the encoded text is returned.
9440
9441 This function sets `last-coding-system-used' to the precise coding system
9442 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9443 not fully specified.)  */)
9444   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9445 {
9446   return code_convert_region (start, end, coding_system, destination, 1, 0);
9447 }
9448
9449 Lisp_Object
9450 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9451                      Lisp_Object dst_object, bool encodep, bool nocopy,
9452                      bool norecord)
9453 {
9454   struct coding_system coding;
9455   ptrdiff_t chars, bytes;
9456
9457   CHECK_STRING (string);
9458   if (NILP (coding_system))
9459     {
9460       if (! norecord)
9461         Vlast_coding_system_used = Qno_conversion;
9462       if (NILP (dst_object))
9463         return (nocopy ? Fcopy_sequence (string) : string);
9464     }
9465
9466   if (NILP (coding_system))
9467     coding_system = Qno_conversion;
9468   else
9469     CHECK_CODING_SYSTEM (coding_system);
9470   if (NILP (dst_object))
9471     dst_object = Qt;
9472   else if (! EQ (dst_object, Qt))
9473     CHECK_BUFFER (dst_object);
9474
9475   setup_coding_system (coding_system, &coding);
9476   coding.mode |= CODING_MODE_LAST_BLOCK;
9477   chars = SCHARS (string);
9478   bytes = SBYTES (string);
9479
9480   if (BUFFERP (dst_object))
9481     {
9482       struct buffer *buf = XBUFFER (dst_object);
9483       ptrdiff_t buf_pt = BUF_PT (buf);
9484
9485       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9486     }
9487
9488   if (encodep)
9489     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9490   else
9491     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9492   if (! norecord)
9493     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9494
9495   return (BUFFERP (dst_object)
9496           ? make_number (coding.produced_char)
9497           : coding.dst_object);
9498 }
9499
9500
9501 /* Encode or decode STRING according to CODING_SYSTEM.
9502    Do not set Vlast_coding_system_used.
9503
9504    This function is called only from macros DECODE_FILE and
9505    ENCODE_FILE, thus we ignore character composition.  */
9506
9507 Lisp_Object
9508 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9509                               bool encodep)
9510 {
9511   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9512 }
9513
9514 /* Encode or decode a file name, to or from a unibyte string suitable
9515    for passing to C library functions.  */
9516 Lisp_Object
9517 decode_file_name (Lisp_Object fname)
9518 {
9519 #ifdef WINDOWSNT
9520   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9521      converts the file names either to UTF-16LE or to the system ANSI
9522      codepage internally, depending on the underlying OS; see w32.c.  */
9523   if (! NILP (Fcoding_system_p (Qutf_8)))
9524     return code_convert_string_norecord (fname, Qutf_8, 0);
9525   return fname;
9526 #else  /* !WINDOWSNT */
9527   if (! NILP (Vfile_name_coding_system))
9528     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9529   else if (! NILP (Vdefault_file_name_coding_system))
9530     return code_convert_string_norecord (fname,
9531                                          Vdefault_file_name_coding_system, 0);
9532   else
9533     return fname;
9534 #endif
9535 }
9536
9537 Lisp_Object
9538 encode_file_name (Lisp_Object fname)
9539 {
9540   /* This is especially important during bootstrap and dumping, when
9541      file-name encoding is not yet known, and therefore any non-ASCII
9542      file names are unibyte strings, and could only be thrashed if we
9543      try to encode them.  */
9544   if (!STRING_MULTIBYTE (fname))
9545     return fname;
9546 #ifdef WINDOWSNT
9547   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9548      converts the file names either to UTF-16LE or to the system ANSI
9549      codepage internally, depending on the underlying OS; see w32.c.  */
9550   if (! NILP (Fcoding_system_p (Qutf_8)))
9551     return code_convert_string_norecord (fname, Qutf_8, 1);
9552   return fname;
9553 #else  /* !WINDOWSNT */
9554   if (! NILP (Vfile_name_coding_system))
9555     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9556   else if (! NILP (Vdefault_file_name_coding_system))
9557     return code_convert_string_norecord (fname,
9558                                          Vdefault_file_name_coding_system, 1);
9559   else
9560     return fname;
9561 #endif
9562 }
9563
9564 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9565        2, 4, 0,
9566        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9567
9568 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9569 if the decoding operation is trivial.
9570
9571 Optional fourth arg BUFFER non-nil means that the decoded text is
9572 inserted in that buffer after point (point does not move).  In this
9573 case, the return value is the length of the decoded text.
9574
9575 This function sets `last-coding-system-used' to the precise coding system
9576 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9577 not fully specified.)  */)
9578   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9579 {
9580   return code_convert_string (string, coding_system, buffer,
9581                               0, ! NILP (nocopy), 0);
9582 }
9583
9584 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9585        2, 4, 0,
9586        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9587
9588 Optional third arg NOCOPY non-nil means it is OK to return STRING
9589 itself if the encoding operation is trivial.
9590
9591 Optional fourth arg BUFFER non-nil means that the encoded text is
9592 inserted in that buffer after point (point does not move).  In this
9593 case, the return value is the length of the encoded text.
9594
9595 This function sets `last-coding-system-used' to the precise coding system
9596 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9597 not fully specified.)  */)
9598   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9599 {
9600   return code_convert_string (string, coding_system, buffer,
9601                               1, ! NILP (nocopy), 0);
9602 }
9603
9604 \f
9605 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9606        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9607 Return the corresponding character.  */)
9608   (Lisp_Object code)
9609 {
9610   Lisp_Object spec, attrs, val;
9611   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9612   EMACS_INT ch;
9613   int c;
9614
9615   CHECK_NATNUM (code);
9616   ch = XFASTINT (code);
9617   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9618   attrs = AREF (spec, 0);
9619
9620   if (ASCII_BYTE_P (ch)
9621       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9622     return code;
9623
9624   val = CODING_ATTR_CHARSET_LIST (attrs);
9625   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9626   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9627   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9628
9629   if (ch <= 0x7F)
9630     {
9631       c = ch;
9632       charset = charset_roman;
9633     }
9634   else if (ch >= 0xA0 && ch < 0xDF)
9635     {
9636       c = ch - 0x80;
9637       charset = charset_kana;
9638     }
9639   else
9640     {
9641       EMACS_INT c1 = ch >> 8;
9642       int c2 = ch & 0xFF;
9643
9644       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9645           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9646         error ("Invalid code: %"pI"d", ch);
9647       c = ch;
9648       SJIS_TO_JIS (c);
9649       charset = charset_kanji;
9650     }
9651   c = DECODE_CHAR (charset, c);
9652   if (c < 0)
9653     error ("Invalid code: %"pI"d", ch);
9654   return make_number (c);
9655 }
9656
9657
9658 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9659        doc: /* Encode a Japanese character CH to shift_jis encoding.
9660 Return the corresponding code in SJIS.  */)
9661   (Lisp_Object ch)
9662 {
9663   Lisp_Object spec, attrs, charset_list;
9664   int c;
9665   struct charset *charset;
9666   unsigned code;
9667
9668   CHECK_CHARACTER (ch);
9669   c = XFASTINT (ch);
9670   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9671   attrs = AREF (spec, 0);
9672
9673   if (ASCII_CHAR_P (c)
9674       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9675     return ch;
9676
9677   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9678   charset = char_charset (c, charset_list, &code);
9679   if (code == CHARSET_INVALID_CODE (charset))
9680     error ("Can't encode by shift_jis encoding: %c", c);
9681   JIS_TO_SJIS (code);
9682
9683   return make_number (code);
9684 }
9685
9686 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9687        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9688 Return the corresponding character.  */)
9689   (Lisp_Object code)
9690 {
9691   Lisp_Object spec, attrs, val;
9692   struct charset *charset_roman, *charset_big5, *charset;
9693   EMACS_INT ch;
9694   int c;
9695
9696   CHECK_NATNUM (code);
9697   ch = XFASTINT (code);
9698   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9699   attrs = AREF (spec, 0);
9700
9701   if (ASCII_BYTE_P (ch)
9702       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9703     return code;
9704
9705   val = CODING_ATTR_CHARSET_LIST (attrs);
9706   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9707   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9708
9709   if (ch <= 0x7F)
9710     {
9711       c = ch;
9712       charset = charset_roman;
9713     }
9714   else
9715     {
9716       EMACS_INT b1 = ch >> 8;
9717       int b2 = ch & 0x7F;
9718       if (b1 < 0xA1 || b1 > 0xFE
9719           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9720         error ("Invalid code: %"pI"d", ch);
9721       c = ch;
9722       charset = charset_big5;
9723     }
9724   c = DECODE_CHAR (charset, c);
9725   if (c < 0)
9726     error ("Invalid code: %"pI"d", ch);
9727   return make_number (c);
9728 }
9729
9730 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9731        doc: /* Encode the Big5 character CH to BIG5 coding system.
9732 Return the corresponding character code in Big5.  */)
9733   (Lisp_Object ch)
9734 {
9735   Lisp_Object spec, attrs, charset_list;
9736   struct charset *charset;
9737   int c;
9738   unsigned code;
9739
9740   CHECK_CHARACTER (ch);
9741   c = XFASTINT (ch);
9742   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9743   attrs = AREF (spec, 0);
9744   if (ASCII_CHAR_P (c)
9745       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9746     return ch;
9747
9748   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9749   charset = char_charset (c, charset_list, &code);
9750   if (code == CHARSET_INVALID_CODE (charset))
9751     error ("Can't encode by Big5 encoding: %c", c);
9752
9753   return make_number (code);
9754 }
9755
9756 \f
9757 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9758        Sset_terminal_coding_system_internal, 1, 2, 0,
9759        doc: /* Internal use only.  */)
9760   (Lisp_Object coding_system, Lisp_Object terminal)
9761 {
9762   struct terminal *term = get_terminal (terminal, 1);
9763   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9764   CHECK_SYMBOL (coding_system);
9765   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9766   /* We had better not send unsafe characters to terminal.  */
9767   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9768   /* Character composition should be disabled.  */
9769   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9770   terminal_coding->src_multibyte = 1;
9771   terminal_coding->dst_multibyte = 0;
9772   tset_charset_list
9773     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9774             ? coding_charset_list (terminal_coding)
9775             : list1 (make_number (charset_ascii))));
9776   return Qnil;
9777 }
9778
9779 DEFUN ("set-safe-terminal-coding-system-internal",
9780        Fset_safe_terminal_coding_system_internal,
9781        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9782        doc: /* Internal use only.  */)
9783   (Lisp_Object coding_system)
9784 {
9785   CHECK_SYMBOL (coding_system);
9786   setup_coding_system (Fcheck_coding_system (coding_system),
9787                        &safe_terminal_coding);
9788   /* Character composition should be disabled.  */
9789   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9790   safe_terminal_coding.src_multibyte = 1;
9791   safe_terminal_coding.dst_multibyte = 0;
9792   return Qnil;
9793 }
9794
9795 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9796        Sterminal_coding_system, 0, 1, 0,
9797        doc: /* Return coding system specified for terminal output on the given terminal.
9798 TERMINAL may be a terminal object, a frame, or nil for the selected
9799 frame's terminal device.  */)
9800   (Lisp_Object terminal)
9801 {
9802   struct coding_system *terminal_coding
9803     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9804   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9805
9806   /* For backward compatibility, return nil if it is `undecided'.  */
9807   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9808 }
9809
9810 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9811        Sset_keyboard_coding_system_internal, 1, 2, 0,
9812        doc: /* Internal use only.  */)
9813   (Lisp_Object coding_system, Lisp_Object terminal)
9814 {
9815   struct terminal *t = get_terminal (terminal, 1);
9816   CHECK_SYMBOL (coding_system);
9817   if (NILP (coding_system))
9818     coding_system = Qno_conversion;
9819   else
9820     Fcheck_coding_system (coding_system);
9821   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9822   /* Character composition should be disabled.  */
9823   TERMINAL_KEYBOARD_CODING (t)->common_flags
9824     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9825   return Qnil;
9826 }
9827
9828 DEFUN ("keyboard-coding-system",
9829        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9830        doc: /* Return coding system specified for decoding keyboard input.  */)
9831   (Lisp_Object terminal)
9832 {
9833   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9834                          (get_terminal (terminal, 1))->id);
9835 }
9836
9837 \f
9838 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9839        Sfind_operation_coding_system,  1, MANY, 0,
9840        doc: /* Choose a coding system for an operation based on the target name.
9841 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9842 DECODING-SYSTEM is the coding system to use for decoding
9843 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9844 for encoding (in case OPERATION does encoding).
9845
9846 The first argument OPERATION specifies an I/O primitive:
9847   For file I/O, `insert-file-contents' or `write-region'.
9848   For process I/O, `call-process', `call-process-region', or `start-process'.
9849   For network I/O, `open-network-stream'.
9850
9851 The remaining arguments should be the same arguments that were passed
9852 to the primitive.  Depending on which primitive, one of those arguments
9853 is selected as the TARGET.  For example, if OPERATION does file I/O,
9854 whichever argument specifies the file name is TARGET.
9855
9856 TARGET has a meaning which depends on OPERATION:
9857   For file I/O, TARGET is a file name (except for the special case below).
9858   For process I/O, TARGET is a process name.
9859   For network I/O, TARGET is a service name or a port number.
9860
9861 This function looks up what is specified for TARGET in
9862 `file-coding-system-alist', `process-coding-system-alist',
9863 or `network-coding-system-alist' depending on OPERATION.
9864 They may specify a coding system, a cons of coding systems,
9865 or a function symbol to call.
9866 In the last case, we call the function with one argument,
9867 which is a list of all the arguments given to this function.
9868 If the function can't decide a coding system, it can return
9869 `undecided' so that the normal code-detection is performed.
9870
9871 If OPERATION is `insert-file-contents', the argument corresponding to
9872 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9873 file name to look up, and BUFFER is a buffer that contains the file's
9874 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9875 function to call for FILENAME, that function should examine the
9876 contents of BUFFER instead of reading the file.
9877
9878 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9879   (ptrdiff_t nargs, Lisp_Object *args)
9880 {
9881   Lisp_Object operation, target_idx, target, val;
9882   register Lisp_Object chain;
9883
9884   if (nargs < 2)
9885     error ("Too few arguments");
9886   operation = args[0];
9887   if (!SYMBOLP (operation)
9888       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9889     error ("Invalid first argument");
9890   if (nargs <= 1 + XFASTINT (target_idx))
9891     error ("Too few arguments for operation `%s'",
9892            SDATA (SYMBOL_NAME (operation)));
9893   target = args[XFASTINT (target_idx) + 1];
9894   if (!(STRINGP (target)
9895         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9896             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9897         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9898     error ("Invalid argument %"pI"d of operation `%s'",
9899            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9900   if (CONSP (target))
9901     target = XCAR (target);
9902
9903   chain = ((EQ (operation, Qinsert_file_contents)
9904             || EQ (operation, Qwrite_region))
9905            ? Vfile_coding_system_alist
9906            : (EQ (operation, Qopen_network_stream)
9907               ? Vnetwork_coding_system_alist
9908               : Vprocess_coding_system_alist));
9909   if (NILP (chain))
9910     return Qnil;
9911
9912   for (; CONSP (chain); chain = XCDR (chain))
9913     {
9914       Lisp_Object elt;
9915
9916       elt = XCAR (chain);
9917       if (CONSP (elt)
9918           && ((STRINGP (target)
9919                && STRINGP (XCAR (elt))
9920                && fast_string_match (XCAR (elt), target) >= 0)
9921               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9922         {
9923           val = XCDR (elt);
9924           /* Here, if VAL is both a valid coding system and a valid
9925              function symbol, we return VAL as a coding system.  */
9926           if (CONSP (val))
9927             return val;
9928           if (! SYMBOLP (val))
9929             return Qnil;
9930           if (! NILP (Fcoding_system_p (val)))
9931             return Fcons (val, val);
9932           if (! NILP (Ffboundp (val)))
9933             {
9934               /* We use call1 rather than safe_call1
9935                  so as to get bug reports about functions called here
9936                  which don't handle the current interface.  */
9937               val = call1 (val, Flist (nargs, args));
9938               if (CONSP (val))
9939                 return val;
9940               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9941                 return Fcons (val, val);
9942             }
9943           return Qnil;
9944         }
9945     }
9946   return Qnil;
9947 }
9948
9949 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9950        Sset_coding_system_priority, 0, MANY, 0,
9951        doc: /* Assign higher priority to the coding systems given as arguments.
9952 If multiple coding systems belong to the same category,
9953 all but the first one are ignored.
9954
9955 usage: (set-coding-system-priority &rest coding-systems)  */)
9956   (ptrdiff_t nargs, Lisp_Object *args)
9957 {
9958   ptrdiff_t i, j;
9959   bool changed[coding_category_max];
9960   enum coding_category priorities[coding_category_max];
9961
9962   memset (changed, 0, sizeof changed);
9963
9964   for (i = j = 0; i < nargs; i++)
9965     {
9966       enum coding_category category;
9967       Lisp_Object spec, attrs;
9968
9969       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9970       attrs = AREF (spec, 0);
9971       category = XINT (CODING_ATTR_CATEGORY (attrs));
9972       if (changed[category])
9973         /* Ignore this coding system because a coding system of the
9974            same category already had a higher priority.  */
9975         continue;
9976       changed[category] = 1;
9977       priorities[j++] = category;
9978       if (coding_categories[category].id >= 0
9979           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9980         setup_coding_system (args[i], &coding_categories[category]);
9981       Fset (AREF (Vcoding_category_table, category), args[i]);
9982     }
9983
9984   /* Now we have decided top J priorities.  Reflect the order of the
9985      original priorities to the remaining priorities.  */
9986
9987   for (i = j, j = 0; i < coding_category_max; i++, j++)
9988     {
9989       while (j < coding_category_max
9990              && changed[coding_priorities[j]])
9991         j++;
9992       if (j == coding_category_max)
9993         emacs_abort ();
9994       priorities[i] = coding_priorities[j];
9995     }
9996
9997   memcpy (coding_priorities, priorities, sizeof priorities);
9998
9999   /* Update `coding-category-list'.  */
10000   Vcoding_category_list = Qnil;
10001   for (i = coding_category_max; i-- > 0; )
10002     Vcoding_category_list
10003       = Fcons (AREF (Vcoding_category_table, priorities[i]),
10004                Vcoding_category_list);
10005
10006   return Qnil;
10007 }
10008
10009 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
10010        Scoding_system_priority_list, 0, 1, 0,
10011        doc: /* Return a list of coding systems ordered by their priorities.
10012 The list contains a subset of coding systems; i.e. coding systems
10013 assigned to each coding category (see `coding-category-list').
10014
10015 HIGHESTP non-nil means just return the highest priority one.  */)
10016   (Lisp_Object highestp)
10017 {
10018   int i;
10019   Lisp_Object val;
10020
10021   for (i = 0, val = Qnil; i < coding_category_max; i++)
10022     {
10023       enum coding_category category = coding_priorities[i];
10024       int id = coding_categories[category].id;
10025       Lisp_Object attrs;
10026
10027       if (id < 0)
10028         continue;
10029       attrs = CODING_ID_ATTRS (id);
10030       if (! NILP (highestp))
10031         return CODING_ATTR_BASE_NAME (attrs);
10032       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10033     }
10034   return Fnreverse (val);
10035 }
10036
10037 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10038
10039 static Lisp_Object
10040 make_subsidiaries (Lisp_Object base)
10041 {
10042   Lisp_Object subsidiaries;
10043   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10044   char *buf = alloca (base_name_len + 6);
10045   int i;
10046
10047   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10048   subsidiaries = make_uninit_vector (3);
10049   for (i = 0; i < 3; i++)
10050     {
10051       strcpy (buf + base_name_len, suffixes[i]);
10052       ASET (subsidiaries, i, intern (buf));
10053     }
10054   return subsidiaries;
10055 }
10056
10057
10058 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10059        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10060        doc: /* For internal use only.
10061 usage: (define-coding-system-internal ...)  */)
10062   (ptrdiff_t nargs, Lisp_Object *args)
10063 {
10064   Lisp_Object name;
10065   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10066   Lisp_Object attrs;            /* Vector of attributes.  */
10067   Lisp_Object eol_type;
10068   Lisp_Object aliases;
10069   Lisp_Object coding_type, charset_list, safe_charsets;
10070   enum coding_category category;
10071   Lisp_Object tail, val;
10072   int max_charset_id = 0;
10073   int i;
10074
10075   if (nargs < coding_arg_max)
10076     goto short_args;
10077
10078   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10079
10080   name = args[coding_arg_name];
10081   CHECK_SYMBOL (name);
10082   ASET (attrs, coding_attr_base_name, name);
10083
10084   val = args[coding_arg_mnemonic];
10085   if (! STRINGP (val))
10086     CHECK_CHARACTER (val);
10087   ASET (attrs, coding_attr_mnemonic, val);
10088
10089   coding_type = args[coding_arg_coding_type];
10090   CHECK_SYMBOL (coding_type);
10091   ASET (attrs, coding_attr_type, coding_type);
10092
10093   charset_list = args[coding_arg_charset_list];
10094   if (SYMBOLP (charset_list))
10095     {
10096       if (EQ (charset_list, Qiso_2022))
10097         {
10098           if (! EQ (coding_type, Qiso_2022))
10099             error ("Invalid charset-list");
10100           charset_list = Viso_2022_charset_list;
10101         }
10102       else if (EQ (charset_list, Qemacs_mule))
10103         {
10104           if (! EQ (coding_type, Qemacs_mule))
10105             error ("Invalid charset-list");
10106           charset_list = Vemacs_mule_charset_list;
10107         }
10108       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10109         {
10110           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10111             error ("Invalid charset-list");
10112           if (max_charset_id < XFASTINT (XCAR (tail)))
10113             max_charset_id = XFASTINT (XCAR (tail));
10114         }
10115     }
10116   else
10117     {
10118       charset_list = Fcopy_sequence (charset_list);
10119       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10120         {
10121           struct charset *charset;
10122
10123           val = XCAR (tail);
10124           CHECK_CHARSET_GET_CHARSET (val, charset);
10125           if (EQ (coding_type, Qiso_2022)
10126               ? CHARSET_ISO_FINAL (charset) < 0
10127               : EQ (coding_type, Qemacs_mule)
10128               ? CHARSET_EMACS_MULE_ID (charset) < 0
10129               : 0)
10130             error ("Can't handle charset `%s'",
10131                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10132
10133           XSETCAR (tail, make_number (charset->id));
10134           if (max_charset_id < charset->id)
10135             max_charset_id = charset->id;
10136         }
10137     }
10138   ASET (attrs, coding_attr_charset_list, charset_list);
10139
10140   safe_charsets = make_uninit_string (max_charset_id + 1);
10141   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10142   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10143     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10144   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10145
10146   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10147
10148   val = args[coding_arg_decode_translation_table];
10149   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10150     CHECK_SYMBOL (val);
10151   ASET (attrs, coding_attr_decode_tbl, val);
10152
10153   val = args[coding_arg_encode_translation_table];
10154   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10155     CHECK_SYMBOL (val);
10156   ASET (attrs, coding_attr_encode_tbl, val);
10157
10158   val = args[coding_arg_post_read_conversion];
10159   CHECK_SYMBOL (val);
10160   ASET (attrs, coding_attr_post_read, val);
10161
10162   val = args[coding_arg_pre_write_conversion];
10163   CHECK_SYMBOL (val);
10164   ASET (attrs, coding_attr_pre_write, val);
10165
10166   val = args[coding_arg_default_char];
10167   if (NILP (val))
10168     ASET (attrs, coding_attr_default_char, make_number (' '));
10169   else
10170     {
10171       CHECK_CHARACTER (val);
10172       ASET (attrs, coding_attr_default_char, val);
10173     }
10174
10175   val = args[coding_arg_for_unibyte];
10176   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10177
10178   val = args[coding_arg_plist];
10179   CHECK_LIST (val);
10180   ASET (attrs, coding_attr_plist, val);
10181
10182   if (EQ (coding_type, Qcharset))
10183     {
10184       /* Generate a lisp vector of 256 elements.  Each element is nil,
10185          integer, or a list of charset IDs.
10186
10187          If Nth element is nil, the byte code N is invalid in this
10188          coding system.
10189
10190          If Nth element is a number NUM, N is the first byte of a
10191          charset whose ID is NUM.
10192
10193          If Nth element is a list of charset IDs, N is the first byte
10194          of one of them.  The list is sorted by dimensions of the
10195          charsets.  A charset of smaller dimension comes first. */
10196       val = Fmake_vector (make_number (256), Qnil);
10197
10198       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10199         {
10200           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10201           int dim = CHARSET_DIMENSION (charset);
10202           int idx = (dim - 1) * 4;
10203
10204           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10205             ASET (attrs, coding_attr_ascii_compat, Qt);
10206
10207           for (i = charset->code_space[idx];
10208                i <= charset->code_space[idx + 1]; i++)
10209             {
10210               Lisp_Object tmp, tmp2;
10211               int dim2;
10212
10213               tmp = AREF (val, i);
10214               if (NILP (tmp))
10215                 tmp = XCAR (tail);
10216               else if (NUMBERP (tmp))
10217                 {
10218                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10219                   if (dim < dim2)
10220                     tmp = list2 (XCAR (tail), tmp);
10221                   else
10222                     tmp = list2 (tmp, XCAR (tail));
10223                 }
10224               else
10225                 {
10226                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10227                     {
10228                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10229                       if (dim < dim2)
10230                         break;
10231                     }
10232                   if (NILP (tmp2))
10233                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10234                   else
10235                     {
10236                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10237                       XSETCAR (tmp2, XCAR (tail));
10238                     }
10239                 }
10240               ASET (val, i, tmp);
10241             }
10242         }
10243       ASET (attrs, coding_attr_charset_valids, val);
10244       category = coding_category_charset;
10245     }
10246   else if (EQ (coding_type, Qccl))
10247     {
10248       Lisp_Object valids;
10249
10250       if (nargs < coding_arg_ccl_max)
10251         goto short_args;
10252
10253       val = args[coding_arg_ccl_decoder];
10254       CHECK_CCL_PROGRAM (val);
10255       if (VECTORP (val))
10256         val = Fcopy_sequence (val);
10257       ASET (attrs, coding_attr_ccl_decoder, val);
10258
10259       val = args[coding_arg_ccl_encoder];
10260       CHECK_CCL_PROGRAM (val);
10261       if (VECTORP (val))
10262         val = Fcopy_sequence (val);
10263       ASET (attrs, coding_attr_ccl_encoder, val);
10264
10265       val = args[coding_arg_ccl_valids];
10266       valids = Fmake_string (make_number (256), make_number (0));
10267       for (tail = val; CONSP (tail); tail = XCDR (tail))
10268         {
10269           int from, to;
10270
10271           val = XCAR (tail);
10272           if (INTEGERP (val))
10273             {
10274               if (! (0 <= XINT (val) && XINT (val) <= 255))
10275                 args_out_of_range_3 (val, make_number (0), make_number (255));
10276               from = to = XINT (val);
10277             }
10278           else
10279             {
10280               CHECK_CONS (val);
10281               CHECK_NATNUM_CAR (val);
10282               CHECK_NUMBER_CDR (val);
10283               if (XINT (XCAR (val)) > 255)
10284                 args_out_of_range_3 (XCAR (val),
10285                                      make_number (0), make_number (255));
10286               from = XINT (XCAR (val));
10287               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10288                 args_out_of_range_3 (XCDR (val),
10289                                      XCAR (val), make_number (255));
10290               to = XINT (XCDR (val));
10291             }
10292           for (i = from; i <= to; i++)
10293             SSET (valids, i, 1);
10294         }
10295       ASET (attrs, coding_attr_ccl_valids, valids);
10296
10297       category = coding_category_ccl;
10298     }
10299   else if (EQ (coding_type, Qutf_16))
10300     {
10301       Lisp_Object bom, endian;
10302
10303       ASET (attrs, coding_attr_ascii_compat, Qnil);
10304
10305       if (nargs < coding_arg_utf16_max)
10306         goto short_args;
10307
10308       bom = args[coding_arg_utf16_bom];
10309       if (! NILP (bom) && ! EQ (bom, Qt))
10310         {
10311           CHECK_CONS (bom);
10312           val = XCAR (bom);
10313           CHECK_CODING_SYSTEM (val);
10314           val = XCDR (bom);
10315           CHECK_CODING_SYSTEM (val);
10316         }
10317       ASET (attrs, coding_attr_utf_bom, bom);
10318
10319       endian = args[coding_arg_utf16_endian];
10320       CHECK_SYMBOL (endian);
10321       if (NILP (endian))
10322         endian = Qbig;
10323       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10324         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10325       ASET (attrs, coding_attr_utf_16_endian, endian);
10326
10327       category = (CONSP (bom)
10328                   ? coding_category_utf_16_auto
10329                   : NILP (bom)
10330                   ? (EQ (endian, Qbig)
10331                      ? coding_category_utf_16_be_nosig
10332                      : coding_category_utf_16_le_nosig)
10333                   : (EQ (endian, Qbig)
10334                      ? coding_category_utf_16_be
10335                      : coding_category_utf_16_le));
10336     }
10337   else if (EQ (coding_type, Qiso_2022))
10338     {
10339       Lisp_Object initial, reg_usage, request, flags;
10340
10341       if (nargs < coding_arg_iso2022_max)
10342         goto short_args;
10343
10344       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10345       CHECK_VECTOR (initial);
10346       for (i = 0; i < 4; i++)
10347         {
10348           val = AREF (initial, i);
10349           if (! NILP (val))
10350             {
10351               struct charset *charset;
10352
10353               CHECK_CHARSET_GET_CHARSET (val, charset);
10354               ASET (initial, i, make_number (CHARSET_ID (charset)));
10355               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10356                 ASET (attrs, coding_attr_ascii_compat, Qt);
10357             }
10358           else
10359             ASET (initial, i, make_number (-1));
10360         }
10361
10362       reg_usage = args[coding_arg_iso2022_reg_usage];
10363       CHECK_CONS (reg_usage);
10364       CHECK_NUMBER_CAR (reg_usage);
10365       CHECK_NUMBER_CDR (reg_usage);
10366
10367       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10368       for (tail = request; CONSP (tail); tail = XCDR (tail))
10369         {
10370           int id;
10371           Lisp_Object tmp1;
10372
10373           val = XCAR (tail);
10374           CHECK_CONS (val);
10375           tmp1 = XCAR (val);
10376           CHECK_CHARSET_GET_ID (tmp1, id);
10377           CHECK_NATNUM_CDR (val);
10378           if (XINT (XCDR (val)) >= 4)
10379             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10380           XSETCAR (val, make_number (id));
10381         }
10382
10383       flags = args[coding_arg_iso2022_flags];
10384       CHECK_NATNUM (flags);
10385       i = XINT (flags) & INT_MAX;
10386       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10387         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10388       flags = make_number (i);
10389
10390       ASET (attrs, coding_attr_iso_initial, initial);
10391       ASET (attrs, coding_attr_iso_usage, reg_usage);
10392       ASET (attrs, coding_attr_iso_request, request);
10393       ASET (attrs, coding_attr_iso_flags, flags);
10394       setup_iso_safe_charsets (attrs);
10395
10396       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10397         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10398                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10399                     ? coding_category_iso_7_else
10400                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10401                     ? coding_category_iso_7
10402                     : coding_category_iso_7_tight);
10403       else
10404         {
10405           int id = XINT (AREF (initial, 1));
10406
10407           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10408                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10409                        || id < 0)
10410                       ? coding_category_iso_8_else
10411                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10412                       ? coding_category_iso_8_1
10413                       : coding_category_iso_8_2);
10414         }
10415       if (category != coding_category_iso_8_1
10416           && category != coding_category_iso_8_2)
10417         ASET (attrs, coding_attr_ascii_compat, Qnil);
10418     }
10419   else if (EQ (coding_type, Qemacs_mule))
10420     {
10421       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10422         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10423       ASET (attrs, coding_attr_ascii_compat, Qt);
10424       category = coding_category_emacs_mule;
10425     }
10426   else if (EQ (coding_type, Qshift_jis))
10427     {
10428
10429       struct charset *charset;
10430
10431       if (XINT (Flength (charset_list)) != 3
10432           && XINT (Flength (charset_list)) != 4)
10433         error ("There should be three or four charsets");
10434
10435       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10436       if (CHARSET_DIMENSION (charset) != 1)
10437         error ("Dimension of charset %s is not one",
10438                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10439       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10440         ASET (attrs, coding_attr_ascii_compat, Qt);
10441
10442       charset_list = XCDR (charset_list);
10443       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10444       if (CHARSET_DIMENSION (charset) != 1)
10445         error ("Dimension of charset %s is not one",
10446                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10447
10448       charset_list = XCDR (charset_list);
10449       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10450       if (CHARSET_DIMENSION (charset) != 2)
10451         error ("Dimension of charset %s is not two",
10452                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10453
10454       charset_list = XCDR (charset_list);
10455       if (! NILP (charset_list))
10456         {
10457           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10458           if (CHARSET_DIMENSION (charset) != 2)
10459             error ("Dimension of charset %s is not two",
10460                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10461         }
10462
10463       category = coding_category_sjis;
10464       Vsjis_coding_system = name;
10465     }
10466   else if (EQ (coding_type, Qbig5))
10467     {
10468       struct charset *charset;
10469
10470       if (XINT (Flength (charset_list)) != 2)
10471         error ("There should be just two charsets");
10472
10473       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10474       if (CHARSET_DIMENSION (charset) != 1)
10475         error ("Dimension of charset %s is not one",
10476                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10477       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10478         ASET (attrs, coding_attr_ascii_compat, Qt);
10479
10480       charset_list = XCDR (charset_list);
10481       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10482       if (CHARSET_DIMENSION (charset) != 2)
10483         error ("Dimension of charset %s is not two",
10484                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10485
10486       category = coding_category_big5;
10487       Vbig5_coding_system = name;
10488     }
10489   else if (EQ (coding_type, Qraw_text))
10490     {
10491       category = coding_category_raw_text;
10492       ASET (attrs, coding_attr_ascii_compat, Qt);
10493     }
10494   else if (EQ (coding_type, Qutf_8))
10495     {
10496       Lisp_Object bom;
10497
10498       if (nargs < coding_arg_utf8_max)
10499         goto short_args;
10500
10501       bom = args[coding_arg_utf8_bom];
10502       if (! NILP (bom) && ! EQ (bom, Qt))
10503         {
10504           CHECK_CONS (bom);
10505           val = XCAR (bom);
10506           CHECK_CODING_SYSTEM (val);
10507           val = XCDR (bom);
10508           CHECK_CODING_SYSTEM (val);
10509         }
10510       ASET (attrs, coding_attr_utf_bom, bom);
10511       if (NILP (bom))
10512         ASET (attrs, coding_attr_ascii_compat, Qt);
10513
10514       category = (CONSP (bom) ? coding_category_utf_8_auto
10515                   : NILP (bom) ? coding_category_utf_8_nosig
10516                   : coding_category_utf_8_sig);
10517     }
10518   else if (EQ (coding_type, Qundecided))
10519     {
10520       if (nargs < coding_arg_undecided_max)
10521         goto short_args;
10522       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10523             args[coding_arg_undecided_inhibit_null_byte_detection]);
10524       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10525             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10526       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10527             args[coding_arg_undecided_prefer_utf_8]);
10528       category = coding_category_undecided;
10529     }
10530   else
10531     error ("Invalid coding system type: %s",
10532            SDATA (SYMBOL_NAME (coding_type)));
10533
10534   ASET (attrs, coding_attr_category, make_number (category));
10535   ASET (attrs, coding_attr_plist,
10536         Fcons (QCcategory,
10537                Fcons (AREF (Vcoding_category_table, category),
10538                       CODING_ATTR_PLIST (attrs))));
10539   ASET (attrs, coding_attr_plist,
10540         Fcons (QCascii_compatible_p,
10541                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10542                       CODING_ATTR_PLIST (attrs))));
10543
10544   eol_type = args[coding_arg_eol_type];
10545   if (! NILP (eol_type)
10546       && ! EQ (eol_type, Qunix)
10547       && ! EQ (eol_type, Qdos)
10548       && ! EQ (eol_type, Qmac))
10549     error ("Invalid eol-type");
10550
10551   aliases = list1 (name);
10552
10553   if (NILP (eol_type))
10554     {
10555       eol_type = make_subsidiaries (name);
10556       for (i = 0; i < 3; i++)
10557         {
10558           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10559
10560           this_name = AREF (eol_type, i);
10561           this_aliases = list1 (this_name);
10562           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10563           this_spec = make_uninit_vector (3);
10564           ASET (this_spec, 0, attrs);
10565           ASET (this_spec, 1, this_aliases);
10566           ASET (this_spec, 2, this_eol_type);
10567           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10568           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10569           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10570           if (NILP (val))
10571             Vcoding_system_alist
10572               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10573                        Vcoding_system_alist);
10574         }
10575     }
10576
10577   spec_vec = make_uninit_vector (3);
10578   ASET (spec_vec, 0, attrs);
10579   ASET (spec_vec, 1, aliases);
10580   ASET (spec_vec, 2, eol_type);
10581
10582   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10583   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10584   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10585   if (NILP (val))
10586     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10587                                   Vcoding_system_alist);
10588
10589   {
10590     int id = coding_categories[category].id;
10591
10592     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10593       setup_coding_system (name, &coding_categories[category]);
10594   }
10595
10596   return Qnil;
10597
10598  short_args:
10599   return Fsignal (Qwrong_number_of_arguments,
10600                   Fcons (intern ("define-coding-system-internal"),
10601                          make_number (nargs)));
10602 }
10603
10604
10605 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10606        3, 3, 0,
10607        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10608   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10609 {
10610   Lisp_Object spec, attrs;
10611
10612   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10613   attrs = AREF (spec, 0);
10614   if (EQ (prop, QCmnemonic))
10615     {
10616       if (! STRINGP (val))
10617         CHECK_CHARACTER (val);
10618       ASET (attrs, coding_attr_mnemonic, val);
10619     }
10620   else if (EQ (prop, QCdefault_char))
10621     {
10622       if (NILP (val))
10623         val = make_number (' ');
10624       else
10625         CHECK_CHARACTER (val);
10626       ASET (attrs, coding_attr_default_char, val);
10627     }
10628   else if (EQ (prop, QCdecode_translation_table))
10629     {
10630       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10631         CHECK_SYMBOL (val);
10632       ASET (attrs, coding_attr_decode_tbl, val);
10633     }
10634   else if (EQ (prop, QCencode_translation_table))
10635     {
10636       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10637         CHECK_SYMBOL (val);
10638       ASET (attrs, coding_attr_encode_tbl, val);
10639     }
10640   else if (EQ (prop, QCpost_read_conversion))
10641     {
10642       CHECK_SYMBOL (val);
10643       ASET (attrs, coding_attr_post_read, val);
10644     }
10645   else if (EQ (prop, QCpre_write_conversion))
10646     {
10647       CHECK_SYMBOL (val);
10648       ASET (attrs, coding_attr_pre_write, val);
10649     }
10650   else if (EQ (prop, QCascii_compatible_p))
10651     {
10652       ASET (attrs, coding_attr_ascii_compat, val);
10653     }
10654
10655   ASET (attrs, coding_attr_plist,
10656         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10657   return val;
10658 }
10659
10660
10661 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10662        Sdefine_coding_system_alias, 2, 2, 0,
10663        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10664   (Lisp_Object alias, Lisp_Object coding_system)
10665 {
10666   Lisp_Object spec, aliases, eol_type, val;
10667
10668   CHECK_SYMBOL (alias);
10669   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10670   aliases = AREF (spec, 1);
10671   /* ALIASES should be a list of length more than zero, and the first
10672      element is a base coding system.  Append ALIAS at the tail of the
10673      list.  */
10674   while (!NILP (XCDR (aliases)))
10675     aliases = XCDR (aliases);
10676   XSETCDR (aliases, list1 (alias));
10677
10678   eol_type = AREF (spec, 2);
10679   if (VECTORP (eol_type))
10680     {
10681       Lisp_Object subsidiaries;
10682       int i;
10683
10684       subsidiaries = make_subsidiaries (alias);
10685       for (i = 0; i < 3; i++)
10686         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10687                                      AREF (eol_type, i));
10688     }
10689
10690   Fputhash (alias, spec, Vcoding_system_hash_table);
10691   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10692   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10693   if (NILP (val))
10694     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10695                                   Vcoding_system_alist);
10696
10697   return Qnil;
10698 }
10699
10700 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10701        1, 1, 0,
10702        doc: /* Return the base of CODING-SYSTEM.
10703 Any alias or subsidiary coding system is not a base coding system.  */)
10704   (Lisp_Object coding_system)
10705 {
10706   Lisp_Object spec, attrs;
10707
10708   if (NILP (coding_system))
10709     return (Qno_conversion);
10710   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10711   attrs = AREF (spec, 0);
10712   return CODING_ATTR_BASE_NAME (attrs);
10713 }
10714
10715 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10716        1, 1, 0,
10717        doc: "Return the property list of CODING-SYSTEM.")
10718   (Lisp_Object coding_system)
10719 {
10720   Lisp_Object spec, attrs;
10721
10722   if (NILP (coding_system))
10723     coding_system = Qno_conversion;
10724   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10725   attrs = AREF (spec, 0);
10726   return CODING_ATTR_PLIST (attrs);
10727 }
10728
10729
10730 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10731        1, 1, 0,
10732        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10733   (Lisp_Object coding_system)
10734 {
10735   Lisp_Object spec;
10736
10737   if (NILP (coding_system))
10738     coding_system = Qno_conversion;
10739   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10740   return AREF (spec, 1);
10741 }
10742
10743 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10744        Scoding_system_eol_type, 1, 1, 0,
10745        doc: /* Return eol-type of CODING-SYSTEM.
10746 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10747
10748 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10749 and CR respectively.
10750
10751 A vector value indicates that a format of end-of-line should be
10752 detected automatically.  Nth element of the vector is the subsidiary
10753 coding system whose eol-type is N.  */)
10754   (Lisp_Object coding_system)
10755 {
10756   Lisp_Object spec, eol_type;
10757   int n;
10758
10759   if (NILP (coding_system))
10760     coding_system = Qno_conversion;
10761   if (! CODING_SYSTEM_P (coding_system))
10762     return Qnil;
10763   spec = CODING_SYSTEM_SPEC (coding_system);
10764   eol_type = AREF (spec, 2);
10765   if (VECTORP (eol_type))
10766     return Fcopy_sequence (eol_type);
10767   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10768   return make_number (n);
10769 }
10770
10771 #endif /* emacs */
10772
10773 \f
10774 /*** 9. Post-amble ***/
10775
10776 void
10777 init_coding_once (void)
10778 {
10779   int i;
10780
10781   for (i = 0; i < coding_category_max; i++)
10782     {
10783       coding_categories[i].id = -1;
10784       coding_priorities[i] = i;
10785     }
10786
10787   /* ISO2022 specific initialize routine.  */
10788   for (i = 0; i < 0x20; i++)
10789     iso_code_class[i] = ISO_control_0;
10790   for (i = 0x21; i < 0x7F; i++)
10791     iso_code_class[i] = ISO_graphic_plane_0;
10792   for (i = 0x80; i < 0xA0; i++)
10793     iso_code_class[i] = ISO_control_1;
10794   for (i = 0xA1; i < 0xFF; i++)
10795     iso_code_class[i] = ISO_graphic_plane_1;
10796   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10797   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10798   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10799   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10800   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10801   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10802   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10803   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10804   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10805
10806   for (i = 0; i < 256; i++)
10807     {
10808       emacs_mule_bytes[i] = 1;
10809     }
10810   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10811   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10812   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10813   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10814 }
10815
10816 #ifdef emacs
10817
10818 void
10819 syms_of_coding (void)
10820 {
10821   staticpro (&Vcoding_system_hash_table);
10822   {
10823     Lisp_Object args[2];
10824     args[0] = QCtest;
10825     args[1] = Qeq;
10826     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10827   }
10828
10829   staticpro (&Vsjis_coding_system);
10830   Vsjis_coding_system = Qnil;
10831
10832   staticpro (&Vbig5_coding_system);
10833   Vbig5_coding_system = Qnil;
10834
10835   staticpro (&Vcode_conversion_reused_workbuf);
10836   Vcode_conversion_reused_workbuf = Qnil;
10837
10838   staticpro (&Vcode_conversion_workbuf_name);
10839   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10840
10841   reused_workbuf_in_use = 0;
10842
10843   DEFSYM (Qcharset, "charset");
10844   DEFSYM (Qtarget_idx, "target-idx");
10845   DEFSYM (Qcoding_system_history, "coding-system-history");
10846   Fset (Qcoding_system_history, Qnil);
10847
10848   /* Target FILENAME is the first argument.  */
10849   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10850   /* Target FILENAME is the third argument.  */
10851   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10852
10853   DEFSYM (Qcall_process, "call-process");
10854   /* Target PROGRAM is the first argument.  */
10855   Fput (Qcall_process, Qtarget_idx, make_number (0));
10856
10857   DEFSYM (Qcall_process_region, "call-process-region");
10858   /* Target PROGRAM is the third argument.  */
10859   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10860
10861   DEFSYM (Qstart_process, "start-process");
10862   /* Target PROGRAM is the third argument.  */
10863   Fput (Qstart_process, Qtarget_idx, make_number (2));
10864
10865   DEFSYM (Qopen_network_stream, "open-network-stream");
10866   /* Target SERVICE is the fourth argument.  */
10867   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10868
10869   DEFSYM (Qcoding_system, "coding-system");
10870   DEFSYM (Qcoding_aliases, "coding-aliases");
10871
10872   DEFSYM (Qeol_type, "eol-type");
10873   DEFSYM (Qunix, "unix");
10874   DEFSYM (Qdos, "dos");
10875   DEFSYM (Qmac, "mac");
10876
10877   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10878   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10879   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10880   DEFSYM (Qdefault_char, "default-char");
10881   DEFSYM (Qundecided, "undecided");
10882   DEFSYM (Qno_conversion, "no-conversion");
10883   DEFSYM (Qraw_text, "raw-text");
10884
10885   DEFSYM (Qiso_2022, "iso-2022");
10886
10887   DEFSYM (Qutf_8, "utf-8");
10888   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10889
10890 #if defined (WINDOWSNT) || defined (CYGWIN)
10891   /* No, not utf-16-le: that one has a BOM.  */
10892   DEFSYM (Qutf_16le, "utf-16le");
10893 #endif
10894
10895   DEFSYM (Qutf_16, "utf-16");
10896   DEFSYM (Qbig, "big");
10897   DEFSYM (Qlittle, "little");
10898
10899   DEFSYM (Qshift_jis, "shift-jis");
10900   DEFSYM (Qbig5, "big5");
10901
10902   DEFSYM (Qcoding_system_p, "coding-system-p");
10903
10904   DEFSYM (Qcoding_system_error, "coding-system-error");
10905   Fput (Qcoding_system_error, Qerror_conditions,
10906         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10907   Fput (Qcoding_system_error, Qerror_message,
10908         build_pure_c_string ("Invalid coding system"));
10909
10910   DEFSYM (Qtranslation_table, "translation-table");
10911   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10912   DEFSYM (Qtranslation_table_id, "translation-table-id");
10913   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10914   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10915
10916   DEFSYM (Qvalid_codes, "valid-codes");
10917
10918   DEFSYM (Qemacs_mule, "emacs-mule");
10919
10920   DEFSYM (QCcategory, ":category");
10921   DEFSYM (QCmnemonic, ":mnemonic");
10922   DEFSYM (QCdefault_char, ":default-char");
10923   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10924   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10925   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10926   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10927   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10928
10929   Vcoding_category_table
10930     = Fmake_vector (make_number (coding_category_max), Qnil);
10931   staticpro (&Vcoding_category_table);
10932   /* Followings are target of code detection.  */
10933   ASET (Vcoding_category_table, coding_category_iso_7,
10934         intern_c_string ("coding-category-iso-7"));
10935   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10936         intern_c_string ("coding-category-iso-7-tight"));
10937   ASET (Vcoding_category_table, coding_category_iso_8_1,
10938         intern_c_string ("coding-category-iso-8-1"));
10939   ASET (Vcoding_category_table, coding_category_iso_8_2,
10940         intern_c_string ("coding-category-iso-8-2"));
10941   ASET (Vcoding_category_table, coding_category_iso_7_else,
10942         intern_c_string ("coding-category-iso-7-else"));
10943   ASET (Vcoding_category_table, coding_category_iso_8_else,
10944         intern_c_string ("coding-category-iso-8-else"));
10945   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10946         intern_c_string ("coding-category-utf-8-auto"));
10947   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10948         intern_c_string ("coding-category-utf-8"));
10949   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10950         intern_c_string ("coding-category-utf-8-sig"));
10951   ASET (Vcoding_category_table, coding_category_utf_16_be,
10952         intern_c_string ("coding-category-utf-16-be"));
10953   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10954         intern_c_string ("coding-category-utf-16-auto"));
10955   ASET (Vcoding_category_table, coding_category_utf_16_le,
10956         intern_c_string ("coding-category-utf-16-le"));
10957   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10958         intern_c_string ("coding-category-utf-16-be-nosig"));
10959   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10960         intern_c_string ("coding-category-utf-16-le-nosig"));
10961   ASET (Vcoding_category_table, coding_category_charset,
10962         intern_c_string ("coding-category-charset"));
10963   ASET (Vcoding_category_table, coding_category_sjis,
10964         intern_c_string ("coding-category-sjis"));
10965   ASET (Vcoding_category_table, coding_category_big5,
10966         intern_c_string ("coding-category-big5"));
10967   ASET (Vcoding_category_table, coding_category_ccl,
10968         intern_c_string ("coding-category-ccl"));
10969   ASET (Vcoding_category_table, coding_category_emacs_mule,
10970         intern_c_string ("coding-category-emacs-mule"));
10971   /* Followings are NOT target of code detection.  */
10972   ASET (Vcoding_category_table, coding_category_raw_text,
10973         intern_c_string ("coding-category-raw-text"));
10974   ASET (Vcoding_category_table, coding_category_undecided,
10975         intern_c_string ("coding-category-undecided"));
10976
10977   DEFSYM (Qinsufficient_source, "insufficient-source");
10978   DEFSYM (Qinvalid_source, "invalid-source");
10979   DEFSYM (Qinterrupted, "interrupted");
10980   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10981
10982   defsubr (&Scoding_system_p);
10983   defsubr (&Sread_coding_system);
10984   defsubr (&Sread_non_nil_coding_system);
10985   defsubr (&Scheck_coding_system);
10986   defsubr (&Sdetect_coding_region);
10987   defsubr (&Sdetect_coding_string);
10988   defsubr (&Sfind_coding_systems_region_internal);
10989   defsubr (&Sunencodable_char_position);
10990   defsubr (&Scheck_coding_systems_region);
10991   defsubr (&Sdecode_coding_region);
10992   defsubr (&Sencode_coding_region);
10993   defsubr (&Sdecode_coding_string);
10994   defsubr (&Sencode_coding_string);
10995   defsubr (&Sdecode_sjis_char);
10996   defsubr (&Sencode_sjis_char);
10997   defsubr (&Sdecode_big5_char);
10998   defsubr (&Sencode_big5_char);
10999   defsubr (&Sset_terminal_coding_system_internal);
11000   defsubr (&Sset_safe_terminal_coding_system_internal);
11001   defsubr (&Sterminal_coding_system);
11002   defsubr (&Sset_keyboard_coding_system_internal);
11003   defsubr (&Skeyboard_coding_system);
11004   defsubr (&Sfind_operation_coding_system);
11005   defsubr (&Sset_coding_system_priority);
11006   defsubr (&Sdefine_coding_system_internal);
11007   defsubr (&Sdefine_coding_system_alias);
11008   defsubr (&Scoding_system_put);
11009   defsubr (&Scoding_system_base);
11010   defsubr (&Scoding_system_plist);
11011   defsubr (&Scoding_system_aliases);
11012   defsubr (&Scoding_system_eol_type);
11013   defsubr (&Scoding_system_priority_list);
11014
11015   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
11016                doc: /* List of coding systems.
11017
11018 Do not alter the value of this variable manually.  This variable should be
11019 updated by the functions `define-coding-system' and
11020 `define-coding-system-alias'.  */);
11021   Vcoding_system_list = Qnil;
11022
11023   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11024                doc: /* Alist of coding system names.
11025 Each element is one element list of coding system name.
11026 This variable is given to `completing-read' as COLLECTION argument.
11027
11028 Do not alter the value of this variable manually.  This variable should be
11029 updated by the functions `make-coding-system' and
11030 `define-coding-system-alias'.  */);
11031   Vcoding_system_alist = Qnil;
11032
11033   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11034                doc: /* List of coding-categories (symbols) ordered by priority.
11035
11036 On detecting a coding system, Emacs tries code detection algorithms
11037 associated with each coding-category one by one in this order.  When
11038 one algorithm agrees with a byte sequence of source text, the coding
11039 system bound to the corresponding coding-category is selected.
11040
11041 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11042   {
11043     int i;
11044
11045     Vcoding_category_list = Qnil;
11046     for (i = coding_category_max - 1; i >= 0; i--)
11047       Vcoding_category_list
11048         = Fcons (AREF (Vcoding_category_table, i),
11049                  Vcoding_category_list);
11050   }
11051
11052   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11053                doc: /* Specify the coding system for read operations.
11054 It is useful to bind this variable with `let', but do not set it globally.
11055 If the value is a coding system, it is used for decoding on read operation.
11056 If not, an appropriate element is used from one of the coding system alists.
11057 There are three such tables: `file-coding-system-alist',
11058 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11059   Vcoding_system_for_read = Qnil;
11060
11061   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11062                doc: /* Specify the coding system for write operations.
11063 Programs bind this variable with `let', but you should not set it globally.
11064 If the value is a coding system, it is used for encoding of output,
11065 when writing it to a file and when sending it to a file or subprocess.
11066
11067 If this does not specify a coding system, an appropriate element
11068 is used from one of the coding system alists.
11069 There are three such tables: `file-coding-system-alist',
11070 `process-coding-system-alist', and `network-coding-system-alist'.
11071 For output to files, if the above procedure does not specify a coding system,
11072 the value of `buffer-file-coding-system' is used.  */);
11073   Vcoding_system_for_write = Qnil;
11074
11075   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11076                doc: /*
11077 Coding system used in the latest file or process I/O.  */);
11078   Vlast_coding_system_used = Qnil;
11079
11080   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11081                doc: /*
11082 Error status of the last code conversion.
11083
11084 When an error was detected in the last code conversion, this variable
11085 is set to one of the following symbols.
11086   `insufficient-source'
11087   `inconsistent-eol'
11088   `invalid-source'
11089   `interrupted'
11090   `insufficient-memory'
11091 When no error was detected, the value doesn't change.  So, to check
11092 the error status of a code conversion by this variable, you must
11093 explicitly set this variable to nil before performing code
11094 conversion.  */);
11095   Vlast_code_conversion_error = Qnil;
11096
11097   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11098                doc: /*
11099 *Non-nil means always inhibit code conversion of end-of-line format.
11100 See info node `Coding Systems' and info node `Text and Binary' concerning
11101 such conversion.  */);
11102   inhibit_eol_conversion = 0;
11103
11104   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11105                doc: /*
11106 Non-nil means process buffer inherits coding system of process output.
11107 Bind it to t if the process output is to be treated as if it were a file
11108 read from some filesystem.  */);
11109   inherit_process_coding_system = 0;
11110
11111   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11112                doc: /*
11113 Alist to decide a coding system to use for a file I/O operation.
11114 The format is ((PATTERN . VAL) ...),
11115 where PATTERN is a regular expression matching a file name,
11116 VAL is a coding system, a cons of coding systems, or a function symbol.
11117 If VAL is a coding system, it is used for both decoding and encoding
11118 the file contents.
11119 If VAL is a cons of coding systems, the car part is used for decoding,
11120 and the cdr part is used for encoding.
11121 If VAL is a function symbol, the function must return a coding system
11122 or a cons of coding systems which are used as above.  The function is
11123 called with an argument that is a list of the arguments with which
11124 `find-operation-coding-system' was called.  If the function can't decide
11125 a coding system, it can return `undecided' so that the normal
11126 code-detection is performed.
11127
11128 See also the function `find-operation-coding-system'
11129 and the variable `auto-coding-alist'.  */);
11130   Vfile_coding_system_alist = Qnil;
11131
11132   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11133                doc: /*
11134 Alist to decide a coding system to use for a process I/O operation.
11135 The format is ((PATTERN . VAL) ...),
11136 where PATTERN is a regular expression matching a program name,
11137 VAL is a coding system, a cons of coding systems, or a function symbol.
11138 If VAL is a coding system, it is used for both decoding what received
11139 from the program and encoding what sent to the program.
11140 If VAL is a cons of coding systems, the car part is used for decoding,
11141 and the cdr part is used for encoding.
11142 If VAL is a function symbol, the function must return a coding system
11143 or a cons of coding systems which are used as above.
11144
11145 See also the function `find-operation-coding-system'.  */);
11146   Vprocess_coding_system_alist = Qnil;
11147
11148   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11149                doc: /*
11150 Alist to decide a coding system to use for a network I/O operation.
11151 The format is ((PATTERN . VAL) ...),
11152 where PATTERN is a regular expression matching a network service name
11153 or is a port number to connect to,
11154 VAL is a coding system, a cons of coding systems, or a function symbol.
11155 If VAL is a coding system, it is used for both decoding what received
11156 from the network stream and encoding what sent to the network stream.
11157 If VAL is a cons of coding systems, the car part is used for decoding,
11158 and the cdr part is used for encoding.
11159 If VAL is a function symbol, the function must return a coding system
11160 or a cons of coding systems which are used as above.
11161
11162 See also the function `find-operation-coding-system'.  */);
11163   Vnetwork_coding_system_alist = Qnil;
11164
11165   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11166                doc: /* Coding system to use with system messages.
11167 Also used for decoding keyboard input on X Window system.  */);
11168   Vlocale_coding_system = Qnil;
11169
11170   /* The eol mnemonics are reset in startup.el system-dependently.  */
11171   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11172                doc: /*
11173 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11174   eol_mnemonic_unix = build_pure_c_string (":");
11175
11176   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11177                doc: /*
11178 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11179   eol_mnemonic_dos = build_pure_c_string ("\\");
11180
11181   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11182                doc: /*
11183 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11184   eol_mnemonic_mac = build_pure_c_string ("/");
11185
11186   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11187                doc: /*
11188 *String displayed in mode line when end-of-line format is not yet determined.  */);
11189   eol_mnemonic_undecided = build_pure_c_string (":");
11190
11191   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11192                doc: /*
11193 *Non-nil enables character translation while encoding and decoding.  */);
11194   Venable_character_translation = Qt;
11195
11196   DEFVAR_LISP ("standard-translation-table-for-decode",
11197                Vstandard_translation_table_for_decode,
11198                doc: /* Table for translating characters while decoding.  */);
11199   Vstandard_translation_table_for_decode = Qnil;
11200
11201   DEFVAR_LISP ("standard-translation-table-for-encode",
11202                Vstandard_translation_table_for_encode,
11203                doc: /* Table for translating characters while encoding.  */);
11204   Vstandard_translation_table_for_encode = Qnil;
11205
11206   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11207                doc: /* Alist of charsets vs revision numbers.
11208 While encoding, if a charset (car part of an element) is found,
11209 designate it with the escape sequence identifying revision (cdr part
11210 of the element).  */);
11211   Vcharset_revision_table = Qnil;
11212
11213   DEFVAR_LISP ("default-process-coding-system",
11214                Vdefault_process_coding_system,
11215                doc: /* Cons of coding systems used for process I/O by default.
11216 The car part is used for decoding a process output,
11217 the cdr part is used for encoding a text to be sent to a process.  */);
11218   Vdefault_process_coding_system = Qnil;
11219
11220   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11221                doc: /*
11222 Table of extra Latin codes in the range 128..159 (inclusive).
11223 This is a vector of length 256.
11224 If Nth element is non-nil, the existence of code N in a file
11225 \(or output of subprocess) doesn't prevent it to be detected as
11226 a coding system of ISO 2022 variant which has a flag
11227 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11228 or reading output of a subprocess.
11229 Only 128th through 159th elements have a meaning.  */);
11230   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11231
11232   DEFVAR_LISP ("select-safe-coding-system-function",
11233                Vselect_safe_coding_system_function,
11234                doc: /*
11235 Function to call to select safe coding system for encoding a text.
11236
11237 If set, this function is called to force a user to select a proper
11238 coding system which can encode the text in the case that a default
11239 coding system used in each operation can't encode the text.  The
11240 function should take care that the buffer is not modified while
11241 the coding system is being selected.
11242
11243 The default value is `select-safe-coding-system' (which see).  */);
11244   Vselect_safe_coding_system_function = Qnil;
11245
11246   DEFVAR_BOOL ("coding-system-require-warning",
11247                coding_system_require_warning,
11248                doc: /* Internal use only.
11249 If non-nil, on writing a file, `select-safe-coding-system-function' is
11250 called even if `coding-system-for-write' is non-nil.  The command
11251 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11252   coding_system_require_warning = 0;
11253
11254
11255   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11256                inhibit_iso_escape_detection,
11257                doc: /*
11258 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11259
11260 When Emacs reads text, it tries to detect how the text is encoded.
11261 This code detection is sensitive to escape sequences.  If Emacs sees
11262 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11263 of the ISO2022 encodings, and decodes text by the corresponding coding
11264 system (e.g. `iso-2022-7bit').
11265
11266 However, there may be a case that you want to read escape sequences in
11267 a file as is.  In such a case, you can set this variable to non-nil.
11268 Then the code detection will ignore any escape sequences, and no text is
11269 detected as encoded in some ISO-2022 encoding.  The result is that all
11270 escape sequences become visible in a buffer.
11271
11272 The default value is nil, and it is strongly recommended not to change
11273 it.  That is because many Emacs Lisp source files that contain
11274 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11275 in Emacs's distribution, and they won't be decoded correctly on
11276 reading if you suppress escape sequence detection.
11277
11278 The other way to read escape sequences in a file without decoding is
11279 to explicitly specify some coding system that doesn't use ISO-2022
11280 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11281   inhibit_iso_escape_detection = 0;
11282
11283   DEFVAR_BOOL ("inhibit-null-byte-detection",
11284                inhibit_null_byte_detection,
11285                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11286 By default, Emacs treats it as binary data, and does not attempt to
11287 decode it.  The effect is as if you specified `no-conversion' for
11288 reading that text.
11289
11290 Set this to non-nil when a regular text happens to include null bytes.
11291 Examples are Index nodes of Info files and null-byte delimited output
11292 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11293 decode text as usual.  */);
11294   inhibit_null_byte_detection = 0;
11295
11296   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11297                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11298 Internal use only.  Remove after the experimental optimizer becomes stable.  */);
11299   disable_ascii_optimization = 0;
11300
11301   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11302                doc: /* Char table for translating self-inserting characters.
11303 This is applied to the result of input methods, not their input.
11304 See also `keyboard-translate-table'.
11305
11306 Use of this variable for character code unification was rendered
11307 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11308 internal character representation.  */);
11309     Vtranslation_table_for_input = Qnil;
11310
11311   {
11312     Lisp_Object args[coding_arg_undecided_max];
11313     Lisp_Object plist[16];
11314     int i;
11315
11316     for (i = 0; i < coding_arg_undecided_max; i++)
11317       args[i] = Qnil;
11318
11319     plist[0] = intern_c_string (":name");
11320     plist[1] = args[coding_arg_name] = Qno_conversion;
11321     plist[2] = intern_c_string (":mnemonic");
11322     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11323     plist[4] = intern_c_string (":coding-type");
11324     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11325     plist[6] = intern_c_string (":ascii-compatible-p");
11326     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11327     plist[8] = intern_c_string (":default-char");
11328     plist[9] = args[coding_arg_default_char] = make_number (0);
11329     plist[10] = intern_c_string (":for-unibyte");
11330     plist[11] = args[coding_arg_for_unibyte] = Qt;
11331     plist[12] = intern_c_string (":docstring");
11332     plist[13] = build_pure_c_string ("Do no conversion.\n\
11333 \n\
11334 When you visit a file with this coding, the file is read into a\n\
11335 unibyte buffer as is, thus each byte of a file is treated as a\n\
11336 character.");
11337     plist[14] = intern_c_string (":eol-type");
11338     plist[15] = args[coding_arg_eol_type] = Qunix;
11339     args[coding_arg_plist] = Flist (16, plist);
11340     Fdefine_coding_system_internal (coding_arg_max, args);
11341
11342     plist[1] = args[coding_arg_name] = Qundecided;
11343     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11344     plist[5] = args[coding_arg_coding_type] = Qundecided;
11345     /* This is already set.
11346        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11347     plist[8] = intern_c_string (":charset-list");
11348     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11349     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11350     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11351     plist[15] = args[coding_arg_eol_type] = Qnil;
11352     args[coding_arg_plist] = Flist (16, plist);
11353     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11354     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11355     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11356   }
11357
11358   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11359
11360   {
11361     int i;
11362
11363     for (i = 0; i < coding_category_max; i++)
11364       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11365   }
11366 #if defined (DOS_NT)
11367   system_eol_type = Qdos;
11368 #else
11369   system_eol_type = Qunix;
11370 #endif
11371   staticpro (&system_eol_type);
11372 }
11373
11374 char *
11375 emacs_strerror (int error_number)
11376 {
11377   char *str;
11378
11379   synchronize_system_messages_locale ();
11380   str = strerror (error_number);
11381
11382   if (! NILP (Vlocale_coding_system))
11383     {
11384       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11385                                                       Vlocale_coding_system,
11386                                                       0);
11387       str = SSDATA (dec);
11388     }
11389
11390   return str;
11391 }
11392
11393 #endif /* emacs */