src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  On
  59   the C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking up
  62   Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return true if the byte sequence conforms to XXX.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static bool
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   bool multibytep = coding->src_multibyte;
 162   ptrdiff_t consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (__C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;      /* Positive evidence for XXX.  */
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   bool multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf >= charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce an encoded text until it
 256   reaches the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   bool multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   ptrdiff_t produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288
 289 #ifdef HAVE_WCHAR_H
 290 #include <wchar.h>
 291 #endif /* HAVE_WCHAR_H */
 292
 293 #include "lisp.h"
 294 #include "character.h"
 295 #include "buffer.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table;
 305
 306 static Lisp_Object Qcoding_system, Qeol_type;
 307 static Lisp_Object Qcoding_aliases;
 308 Lisp_Object Qunix, Qdos;
 309 static Lisp_Object Qmac;
 310 Lisp_Object Qbuffer_file_coding_system;
 311 static Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 static Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qutf_8;
 315 static Lisp_Object Qiso_2022;
 316 static Lisp_Object Qutf_16, Qshift_jis, Qbig5;
 317 static Lisp_Object Qbig, Qlittle;
 318 static Lisp_Object Qcoding_system_history;
 319 static Lisp_Object Qvalid_codes;
 320 static Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 321 static Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 322 static Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 323 static Lisp_Object QCascii_compatible_p;
 324
 325 Lisp_Object Qcall_process, Qcall_process_region;
 326 Lisp_Object Qstart_process, Qopen_network_stream;
 327 static Lisp_Object Qtarget_idx;
 328
 329 static Lisp_Object Qinsufficient_source, Qinvalid_source, Qinterrupted;
 330
 331 /* If a symbol has this property, evaluate the value to define the
 332    symbol as a coding system.  */
 333 static Lisp_Object Qcoding_system_define_form;
 334
 335 /* Format of end-of-line decided by system.  This is Qunix on
 336    Unix and Mac, Qdos on DOS/Windows.
 337    This has an effect only for external encoding (i.e. for output to
 338    file and process), not for in-buffer or Lisp string encoding.  */
 339 static Lisp_Object system_eol_type;
 340
 341 #ifdef emacs
 342
 343 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 344
 345 /* Coding system emacs-mule and raw-text are for converting only
 346    end-of-line format.  */
 347 Lisp_Object Qemacs_mule, Qraw_text;
 348 Lisp_Object Qutf_8_emacs;
 349
 350 #if defined (WINDOWSNT) || defined (CYGWIN)
 351 static Lisp_Object Qutf_16le;
 352 #endif
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding system to be used to encode text for terminal display when
 357    terminal coding system is nil.  */
 358 struct coding_system safe_terminal_coding;
 359
 360 #endif /* emacs */
 361
 362 Lisp_Object Qtranslation_table;
 363 Lisp_Object Qtranslation_table_id;
 364 static Lisp_Object Qtranslation_table_for_decode;
 365 static Lisp_Object Qtranslation_table_for_encode;
 366
 367 /* Two special coding systems.  */
 368 static Lisp_Object Vsjis_coding_system;
 369 static Lisp_Object Vbig5_coding_system;
 370
 371 /* ISO2022 section */
 372
 373 #define CODING_ISO_INITIAL(coding, reg)                 \
 374   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 375                      coding_attr_iso_initial),          \
 376                reg)))
 377
 378
 379 #define CODING_ISO_REQUEST(coding, charset_id)          \
 380   (((charset_id) <= (coding)->max_charset_id            \
 381     ? ((coding)->safe_charsets[charset_id] != 255       \
 382        ? (coding)->safe_charsets[charset_id]            \
 383        : -1)                                            \
 384     : -1))
 385
 386
 387 #define CODING_ISO_FLAGS(coding)        \
 388   ((coding)->spec.iso_2022.flags)
 389 #define CODING_ISO_DESIGNATION(coding, reg)     \
 390   ((coding)->spec.iso_2022.current_designation[reg])
 391 #define CODING_ISO_INVOCATION(coding, plane)    \
 392   ((coding)->spec.iso_2022.current_invocation[plane])
 393 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 394   ((coding)->spec.iso_2022.single_shifting)
 395 #define CODING_ISO_BOL(coding)  \
 396   ((coding)->spec.iso_2022.bol)
 397 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 398   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 399 #define CODING_ISO_CMP_STATUS(coding)   \
 400   (&(coding)->spec.iso_2022.cmp_status)
 401 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 402   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 403 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 404   ((coding)->spec.iso_2022.embedded_utf_8)
 405
 406 /* Control characters of ISO2022.  */
 407                         /* code */      /* function */
 408 #define ISO_CODE_SO     0x0E            /* shift-out */
 409 #define ISO_CODE_SI     0x0F            /* shift-in */
 410 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 411 #define ISO_CODE_ESC    0x1B            /* escape */
 412 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 413 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 414 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 415
 416 /* All code (1-byte) of ISO2022 is classified into one of the
 417    following classes.  */
 418 enum iso_code_class_type
 419   {
 420     ISO_control_0,              /* Control codes in the range
 421                                    0x00..0x1F and 0x7F, except for the
 422                                    following 4 codes.  */
 423     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 424     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 425     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 426     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 427     ISO_control_1,              /* Control codes in the range
 428                                    0x80..0x9F, except for the
 429                                    following 3 codes.  */
 430     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 431     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 432     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 433     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 434     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 435     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 436     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 437   };
 438
 439 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 440     `iso-flags' attribute of an iso2022 coding system.  */
 441
 442 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 443    instead of the correct short-form sequence (e.g. ESC $ A).  */
 444 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 445
 446 /* If set, reset graphic planes and registers at end-of-line to the
 447    initial state.  */
 448 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 449
 450 /* If set, reset graphic planes and registers before any control
 451    characters to the initial state.  */
 452 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 453
 454 /* If set, encode by 7-bit environment.  */
 455 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 456
 457 /* If set, use locking-shift function.  */
 458 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 459
 460 /* If set, use single-shift function.  Overwrite
 461    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 462 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 463
 464 /* If set, use designation escape sequence.  */
 465 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 466
 467 /* If set, produce revision number sequence.  */
 468 #define CODING_ISO_FLAG_REVISION        0x0080
 469
 470 /* If set, produce ISO6429's direction specifying sequence.  */
 471 #define CODING_ISO_FLAG_DIRECTION       0x0100
 472
 473 /* If set, assume designation states are reset at beginning of line on
 474    output.  */
 475 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 476
 477 /* If set, designation sequence should be placed at beginning of line
 478    on output.  */
 479 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 480
 481 /* If set, do not encode unsafe characters on output.  */
 482 #define CODING_ISO_FLAG_SAFE            0x0800
 483
 484 /* If set, extra latin codes (128..159) are accepted as a valid code
 485    on input.  */
 486 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 487
 488 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 489
 490 /* #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000 */
 491
 492 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 493
 494 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 495
 496 #define CODING_ISO_FLAG_LEVEL_4         0x20000
 497
 498 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 499
 500 /* A character to be produced on output if encoding of the original
 501    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 502 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 503
 504 /* UTF-8 section */
 505 #define CODING_UTF_8_BOM(coding)        \
 506   ((coding)->spec.utf_8_bom)
 507
 508 /* UTF-16 section */
 509 #define CODING_UTF_16_BOM(coding)       \
 510   ((coding)->spec.utf_16.bom)
 511
 512 #define CODING_UTF_16_ENDIAN(coding)    \
 513   ((coding)->spec.utf_16.endian)
 514
 515 #define CODING_UTF_16_SURROGATE(coding) \
 516   ((coding)->spec.utf_16.surrogate)
 517
 518
 519 /* CCL section */
 520 #define CODING_CCL_DECODER(coding)      \
 521   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 522 #define CODING_CCL_ENCODER(coding)      \
 523   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 524 #define CODING_CCL_VALIDS(coding)                                          \
 525   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 526
 527 /* Index for each coding category in `coding_categories' */
 528
 529 enum coding_category
 530   {
 531     coding_category_iso_7,
 532     coding_category_iso_7_tight,
 533     coding_category_iso_8_1,
 534     coding_category_iso_8_2,
 535     coding_category_iso_7_else,
 536     coding_category_iso_8_else,
 537     coding_category_utf_8_auto,
 538     coding_category_utf_8_nosig,
 539     coding_category_utf_8_sig,
 540     coding_category_utf_16_auto,
 541     coding_category_utf_16_be,
 542     coding_category_utf_16_le,
 543     coding_category_utf_16_be_nosig,
 544     coding_category_utf_16_le_nosig,
 545     coding_category_charset,
 546     coding_category_sjis,
 547     coding_category_big5,
 548     coding_category_ccl,
 549     coding_category_emacs_mule,
 550     /* All above are targets of code detection.  */
 551     coding_category_raw_text,
 552     coding_category_undecided,
 553     coding_category_max
 554   };
 555
 556 /* Definitions of flag bits used in detect_coding_XXXX.  */
 557 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 558 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 559 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 560 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 561 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 562 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 563 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 564 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 565 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 566 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 567 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 568 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 569 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 570 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 571 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 572 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 573 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 574 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 575 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 576 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 577
 578 /* This value is returned if detect_coding_mask () finds nothing other
 579    than ASCII characters.  */
 580 #define CATEGORY_MASK_ANY               \
 581   (CATEGORY_MASK_ISO_7                  \
 582    | CATEGORY_MASK_ISO_7_TIGHT          \
 583    | CATEGORY_MASK_ISO_8_1              \
 584    | CATEGORY_MASK_ISO_8_2              \
 585    | CATEGORY_MASK_ISO_7_ELSE           \
 586    | CATEGORY_MASK_ISO_8_ELSE           \
 587    | CATEGORY_MASK_UTF_8_AUTO           \
 588    | CATEGORY_MASK_UTF_8_NOSIG          \
 589    | CATEGORY_MASK_UTF_8_SIG            \
 590    | CATEGORY_MASK_UTF_16_AUTO          \
 591    | CATEGORY_MASK_UTF_16_BE            \
 592    | CATEGORY_MASK_UTF_16_LE            \
 593    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 594    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 595    | CATEGORY_MASK_CHARSET              \
 596    | CATEGORY_MASK_SJIS                 \
 597    | CATEGORY_MASK_BIG5                 \
 598    | CATEGORY_MASK_CCL                  \
 599    | CATEGORY_MASK_EMACS_MULE)
 600
 601
 602 #define CATEGORY_MASK_ISO_7BIT \
 603   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 604
 605 #define CATEGORY_MASK_ISO_8BIT \
 606   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 607
 608 #define CATEGORY_MASK_ISO_ELSE \
 609   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 610
 611 #define CATEGORY_MASK_ISO_ESCAPE        \
 612   (CATEGORY_MASK_ISO_7                  \
 613    | CATEGORY_MASK_ISO_7_TIGHT          \
 614    | CATEGORY_MASK_ISO_7_ELSE           \
 615    | CATEGORY_MASK_ISO_8_ELSE)
 616
 617 #define CATEGORY_MASK_ISO       \
 618   (  CATEGORY_MASK_ISO_7BIT     \
 619      | CATEGORY_MASK_ISO_8BIT   \
 620      | CATEGORY_MASK_ISO_ELSE)
 621
 622 #define CATEGORY_MASK_UTF_16            \
 623   (CATEGORY_MASK_UTF_16_AUTO            \
 624    | CATEGORY_MASK_UTF_16_BE            \
 625    | CATEGORY_MASK_UTF_16_LE            \
 626    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 627    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 628
 629 #define CATEGORY_MASK_UTF_8     \
 630   (CATEGORY_MASK_UTF_8_AUTO     \
 631    | CATEGORY_MASK_UTF_8_NOSIG  \
 632    | CATEGORY_MASK_UTF_8_SIG)
 633
 634 /* Table of coding categories (Lisp symbols).  This variable is for
 635    internal use only.  */
 636 static Lisp_Object Vcoding_category_table;
 637
 638 /* Table of coding-categories ordered by priority.  */
 639 static enum coding_category coding_priorities[coding_category_max];
 640
 641 /* Nth element is a coding context for the coding system bound to the
 642    Nth coding category.  */
 643 static struct coding_system coding_categories[coding_category_max];
 644
 645 /*** Commonly used macros and functions ***/
 646
 647 #ifndef min
 648 #define min(a, b) ((a) < (b) ? (a) : (b))
 649 #endif
 650 #ifndef max
 651 #define max(a, b) ((a) > (b) ? (a) : (b))
 652 #endif
 653
 654 /* Map FLAG to -1 if it is nil, to 1 if it is t, and to 0 otherwise.  */
 655
 656 static int
 657 encode_inhibit_flag (Lisp_Object flag)
 658 {
 659   return ! NILP (flag) ? EQ (flag, Qt) : -1;
 660 }
 661
 662 /* Decide whether a flag counts as set.  ENCODED_FLAG is 1 for yes, -1
 663    for no, and 0 to defer to the user variable VAR.  */
 664
 665 static bool
 666 inhibit_flag (int encoded_flag, bool var)
 667 {
 668   return encoded_flag + var > 0;
 669 }
 670
 671 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 672   do {                                                  \
 673     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 674     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 675   } while (0)
 676
 677 static void
 678 CHECK_NATNUM_CAR (Lisp_Object x)
 679 {
 680   Lisp_Object head = XCAR (x);  /* Car of the cons X.  */
 681   CHECK_NATNUM (head);
 682   XSETCAR (x, head);            /* Store the checked value back.  */
 683 }
 684
 685 static void
 686 CHECK_NATNUM_CDR (Lisp_Object x)
 687 {
 688   Lisp_Object tail = XCDR (x);  /* Cdr of the cons X.  */
 689   CHECK_NATNUM (tail);
 690   XSETCDR (x, tail);            /* Store the checked value back.  */
 691 }
 692
 693
 694 /* Get one byte from the source text at SRC, which ends at SRC_END, set
 695    C to it, and increment CONSUMED_CHARS.  If SRC is already at
 696    SRC_END, jump to 'no_more_source', first recording
 697    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If MULTIBYTEP
 698    and a multibyte character is found at SRC, set C to the negative
 699    character code and record CODING_RESULT_INVALID_SRC.  The caller must
 700    set up: coding, src, src_base, src_end, multibytep, consumed_chars.  */
 701
 702 #define ONE_MORE_BYTE(c)                                \
 703   do {                                                  \
 704     if (src == src_end)                                 \
 705       {                                                 \
 706         if (src_base < src)                             \
 707           record_conversion_result                      \
 708             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 709         goto no_more_source;                            \
 710       }                                                 \
 711     c = *src++;                                         \
 712     if (multibytep && (c & 0x80))                       \
 713       {                                                 \
 714         if ((c & 0xFE) == 0xC0)                         \
 715           c = ((c & 1) << 6) | *src++;                  \
 716         else                                            \
 717           {                                             \
 718             src--;                                      \
 719             c = - string_char (src, &src, NULL);        \
 720             record_conversion_result                    \
 721               (coding, CODING_RESULT_INVALID_SRC);      \
 722           }                                             \
 723       }                                                 \
 724     consumed_chars++;                                   \
 725   } while (0)
 726
 727 /* Safely get two bytes from the source text pointed by SRC which ends
 728    at SRC_END, and set C1 and C2 to those bytes while skipping the
 729    leading multibyte characters.  If there are not enough bytes in the
 730    source, it jumps to 'no_more_source'.  If MULTIBYTEP and
 731    a multibyte character is found for C2, set C2 to -1.  The caller
 732    should declare and set these variables appropriately in
 733    advance:
 734         src, src_end, multibytep
 735    It is intended that this macro is used in detect_coding_utf_16.  */
 736
 737 #define TWO_MORE_BYTES(c1, c2)                          \
 738   do {                                                  \
 739     do {                                                \
 740       if (src == src_end)                               \
 741         goto no_more_source;                            \
 742       c1 = *src++;                                      \
 743       if (multibytep && (c1 & 0x80))                    \
 744         {                                               \
 745           if ((c1 & 0xFE) == 0xC0)                      \
 746             c1 = ((c1 & 1) << 6) | *src++;              \
 747           else                                          \
 748             {                                           \
 749               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 750               c1 = -1;                                  \
 751             }                                           \
 752         }                                               \
 753     } while (c1 < 0);                                   \
 754     if (src == src_end)                                 \
 755       goto no_more_source;                              \
 756     c2 = *src++;                                        \
 757     if (multibytep && (c2 & 0x80))                      \
 758       {                                                 \
 759         if ((c2 & 0xFE) == 0xC0)                        \
 760           c2 = ((c2 & 1) << 6) | *src++;                \
 761         else                                            \
 762           c2 = -1;                                      \
 763       }                                                 \
 764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed to by DST, advance DST to the
 768    next free position, and increment PRODUCED_CHARS.  The caller must
 769    ensure that C is 0..127, and declare and set the variables `dst'
 770    and `produced_chars' appropriately in advance.
 771 */
 772
 773
 774 #define EMIT_ONE_ASCII_BYTE(c)  \
 775   do {                          \
 776     produced_chars++;           \
 777     *dst++ = (c);               \
 778   } while (0)
 779
 780
 781 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 782
 783 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 784   do {                                  \
 785     produced_chars += 2;                \
 786     *dst++ = (c1), *dst++ = (c2);       \
 787   } while (0)
 788
 789
 790 /* Store a byte C in the place pointed by DST and increment DST to the
 791    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP,
 792    store in an appropriate multibyte form.  The caller should
 793    declare and set the variables `dst' and `multibytep' appropriately
 794    in advance.  */
 795
 796 #define EMIT_ONE_BYTE(c)                \
 797   do {                                  \
 798     produced_chars++;                   \
 799     if (multibytep)                     \
 800       {                                 \
 801         unsigned ch = (c);              \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       *dst++ = (c);                     \
 808   } while (0)
 809
 810
 811 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 812
 813 #define EMIT_TWO_BYTES(c1, c2)          \
 814   do {                                  \
 815     produced_chars += 2;                \
 816     if (multibytep)                     \
 817       {                                 \
 818         unsigned ch;                    \
 819                                         \
 820         ch = (c1);                      \
 821         if (ch >= 0x80)                 \
 822           ch = BYTE8_TO_CHAR (ch);      \
 823         CHAR_STRING_ADVANCE (ch, dst);  \
 824         ch = (c2);                      \
 825         if (ch >= 0x80)                 \
 826           ch = BYTE8_TO_CHAR (ch);      \
 827         CHAR_STRING_ADVANCE (ch, dst);  \
 828       }                                 \
 829     else                                \
 830       {                                 \
 831         *dst++ = (c1);                  \
 832         *dst++ = (c2);                  \
 833       }                                 \
 834   } while (0)
 835
 836
 837 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 838   do {                                  \
 839     EMIT_ONE_BYTE (c1);                 \
 840     EMIT_TWO_BYTES (c2, c3);            \
 841   } while (0)
 842
 843
 844 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 845   do {                                          \
 846     EMIT_TWO_BYTES (c1, c2);                    \
 847     EMIT_TWO_BYTES (c3, c4);                    \
 848   } while (0)
 849
 850
 851 static void
 852 record_conversion_result (struct coding_system *coding,
 853                           enum coding_result_code result)
 854 {
 855   coding->result = result;
 856   switch (result)
 857     {
 858     case CODING_RESULT_INSUFFICIENT_SRC:
 859       Vlast_code_conversion_error = Qinsufficient_source;
 860       break;
 861     case CODING_RESULT_INVALID_SRC:
 862       Vlast_code_conversion_error = Qinvalid_source;
 863       break;
 864     case CODING_RESULT_INTERRUPT:
 865       Vlast_code_conversion_error = Qinterrupted;
 866       break;
 867     case CODING_RESULT_INSUFFICIENT_DST:
 868       /* Don't record this error in Vlast_code_conversion_error
 869          because it happens just temporarily and is resolved when the
 870          whole conversion is finished.  */
 871       break;
 872     case CODING_RESULT_SUCCESS:
 873       break;
 874     default:
 875       Vlast_code_conversion_error = intern ("Unknown error");
 876     }
 877 }
 878
 879 /* These wrapper macros are used to preserve validity of pointers into
 880    buffer text across calls to decode_char, encode_char, etc, which
 881    could cause relocation of buffers if it loads a charset map,
 882    because loading a charset map allocates large structures.  */
 883
 884 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 885   do {                                                                       \
 886     ptrdiff_t offset;                                                        \
 887                                                                              \
 888     charset_map_loaded = 0;                                                  \
 889     c = DECODE_CHAR (charset, code);                                         \
 890     if (charset_map_loaded                                                   \
 891         && (offset = coding_change_source (coding)))                         \
 892       {                                                                      \
 893         src += offset;                                                       \
 894         src_base += offset;                                                  \
 895         src_end += offset;                                                   \
 896       }                                                                      \
 897   } while (0)
 898
 899 #define CODING_ENCODE_CHAR(coding, dst, dst_end, charset, c, code)      \
 900   do {                                                                  \
 901     ptrdiff_t offset;                                                   \
 902                                                                         \
 903     charset_map_loaded = 0;                                             \
 904     code = ENCODE_CHAR (charset, c);                                    \
 905     if (charset_map_loaded                                              \
 906         && (offset = coding_change_destination (coding)))               \
 907       {                                                                 \
 908         dst += offset;                                                  \
 909         dst_end += offset;                                              \
 910       }                                                                 \
 911   } while (0)
 912
 913 #define CODING_CHAR_CHARSET(coding, dst, dst_end, c, charset_list, code_return, charset) \
 914   do {                                                                  \
 915     ptrdiff_t offset;                                                   \
 916                                                                         \
 917     charset_map_loaded = 0;                                             \
 918     charset = char_charset (c, charset_list, code_return);              \
 919     if (charset_map_loaded                                              \
 920         && (offset = coding_change_destination (coding)))               \
 921       {                                                                 \
 922         dst += offset;                                                  \
 923         dst_end += offset;                                              \
 924       }                                                                 \
 925   } while (0)
 926
 927 #define CODING_CHAR_CHARSET_P(coding, dst, dst_end, c, charset, result) \
 928   do {                                                                  \
 929     ptrdiff_t offset;                                                   \
 930                                                                         \
 931     charset_map_loaded = 0;                                             \
 932     result = CHAR_CHARSET_P (c, charset);                               \
 933     if (charset_map_loaded                                              \
 934         && (offset = coding_change_destination (coding)))               \
 935       {                                                                 \
 936         dst += offset;                                                  \
 937         dst_end += offset;                                              \
 938       }                                                                 \
 939   } while (0)
 940
 941
 942 /* If there are at least BYTES length of room at dst, allocate memory
 943    for coding->destination and update dst and dst_end.  We don't have
 944    to take care of coding->source which will be relocated.  It is
 945    handled by calling coding_set_source in encode_coding.  */
 946
 947 #define ASSURE_DESTINATION(bytes)                               \
 948   do {                                                          \
 949     if (dst + (bytes) >= dst_end)                               \
 950       {                                                         \
 951         ptrdiff_t more_bytes = charbuf_end - charbuf + (bytes); \
 952                                                                 \
 953         dst = alloc_destination (coding, more_bytes, dst);      \
 954         dst_end = coding->destination + coding->dst_bytes;      \
 955       }                                                         \
 956   } while (0)
 957
 958
/* Store the multibyte form of the character C at P, and advance P to
   the end of that multibyte form.  This is now a plain alias for
   CHAR_STRING_ADVANCE: it used to differ by never calling
   MAYBE_UNIFY_CHAR, but nowadays CHAR_STRING_ADVANCE does not call
   MAYBE_UNIFY_CHAR either.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)  CHAR_STRING_ADVANCE(c, p)
 965
/* Return the character code of the character whose multibyte form is
   at P, and advance P to the end of that multibyte form.  This is now
   a plain alias for STRING_CHAR_ADVANCE: it used to differ by never
   calling MAYBE_UNIFY_CHAR, but nowadays STRING_CHAR_ADVANCE does not
   call MAYBE_UNIFY_CHAR either.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p) STRING_CHAR_ADVANCE(p)
 972
 973 /* Set coding->source from coding->src_object.  */
 974
 975 static void
 976 coding_set_source (struct coding_system *coding)
 977 {
 978   if (BUFFERP (coding->src_object))
 979     {
 980       struct buffer *buf = XBUFFER (coding->src_object);
 981
 982       if (coding->src_pos < 0)
 983         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 984       else
 985         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 986     }
 987   else if (STRINGP (coding->src_object))
 988     {
 989       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 990     }
 991   else
 992     {
 993       /* Otherwise, the source is C string and is never relocated
 994          automatically.  Thus we don't have to update anything.  */
 995     }
 996 }
 997
 998
 999 /* Set coding->source from coding->src_object, and return how many
1000    bytes coding->source was changed.  */
1001
1002 static ptrdiff_t
1003 coding_change_source (struct coding_system *coding)
1004 {
1005   const unsigned char *orig = coding->source;
1006   coding_set_source (coding);
1007   return coding->source - orig;
1008 }
1009
1010
1011 /* Set coding->destination from coding->dst_object.  */
1012
1013 static void
1014 coding_set_destination (struct coding_system *coding)
1015 {
1016   if (BUFFERP (coding->dst_object))
1017     {
1018       if (BUFFERP (coding->src_object) && coding->src_pos < 0)
1019         {
1020           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1021           coding->dst_bytes = (GAP_END_ADDR
1022                                - (coding->src_bytes - coding->consumed)
1023                                - coding->destination);
1024         }
1025       else
1026         {
1027           /* We are sure that coding->dst_pos_byte is before the gap
1028              of the buffer. */
1029           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1030                                  + coding->dst_pos_byte - BEG_BYTE);
1031           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1032                                - coding->destination);
1033         }
1034     }
1035   else
1036     {
1037       /* Otherwise, the destination is C string and is never relocated
1038          automatically.  Thus we don't have to update anything.  */
1039     }
1040 }
1041
1042
1043 /* Set coding->destination from coding->dst_object, and return how
1044    many bytes coding->destination was changed.  */
1045
1046 static ptrdiff_t
1047 coding_change_destination (struct coding_system *coding)
1048 {
1049   const unsigned char *orig = coding->destination;
1050   coding_set_destination (coding);
1051   return coding->destination - orig;
1052 }
1053
1054
1055 static void
1056 coding_alloc_by_realloc (struct coding_system *coding, ptrdiff_t bytes)
1057 {
1058   if (STRING_BYTES_BOUND - coding->dst_bytes < bytes)
1059     string_overflow ();
1060   coding->destination = xrealloc (coding->destination,
1061                                   coding->dst_bytes + bytes);
1062   coding->dst_bytes += bytes;
1063 }
1064
1065 static void
1066 coding_alloc_by_making_gap (struct coding_system *coding,
1067                             ptrdiff_t gap_head_used, ptrdiff_t bytes)
1068 {
1069   if (EQ (coding->src_object, coding->dst_object))
1070     {
1071       /* The gap may contain the produced data at the head and not-yet
1072          consumed data at the tail.  To preserve those data, we at
1073          first make the gap size to zero, then increase the gap
1074          size.  */
1075       ptrdiff_t add = GAP_SIZE;
1076
1077       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1078       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1079       make_gap (bytes);
1080       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1081       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1082     }
1083   else
1084     make_gap_1 (XBUFFER (coding->dst_object), bytes);
1085 }
1086
1087
1088 static unsigned char *
1089 alloc_destination (struct coding_system *coding, ptrdiff_t nbytes,
1090                    unsigned char *dst)
1091 {
1092   ptrdiff_t offset = dst - coding->destination;
1093
1094   if (BUFFERP (coding->dst_object))
1095     {
1096       struct buffer *buf = XBUFFER (coding->dst_object);
1097
1098       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1099     }
1100   else
1101     coding_alloc_by_realloc (coding, nbytes);
1102   coding_set_destination (coding);
1103   dst = coding->destination + offset;
1104   return dst;
1105 }
1106
1107 /** Macros for annotations.  */
1108
1109 /* An annotation data is stored in the array coding->charbuf in this
1110    format:
1111      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1112    LENGTH is the number of elements in the annotation.
1113    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1114    NCHARS is the number of characters in the text annotated.
1115
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1120      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1121
1122    NBYTES is the number of bytes specified in the header part of
1123    old-style emacs-mule encoding, or 0 for the other kind of
1124    composition.
1125
1126    METHOD is one of enum composition_method.
1127
1128    Optional COMPOSITION-COMPONENTS are characters and composition
1129    rules.
1130
1131    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1132    follows.
1133
1134    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1135    recover from an invalid annotation, and should be skipped by
1136    produce_annotation.  */
1137
/* Maximum length of the header of annotation data, counted in charbuf
   elements.  The longest header is the one written by
   ADD_COMPOSITION_DATA: the three common elements followed by NBYTES
   and METHOD.  */
#define MAX_ANNOTATION_LENGTH 5
1140
/* Store the three common header elements of an annotation,
   [ -LEN MASK NCHARS ], at BUF, advance BUF past them, and set
   coding->annotated.  BUF must be a modifiable pointer into the
   charbuf, and the caller must have a variable `coding' in scope.

   The definition deliberately does not end in a semicolon, so that an
   invocation followed by `;' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1148
/* Store a composition annotation at BUF and advance BUF past it: the
   common header (with length 5 and CODING_ANNOTATE_COMPOSITION_MASK)
   followed by NBYTES and METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1155
1156
/* Store a charset annotation at BUF and advance BUF past it: the
   common header (with length 4 and CODING_ANNOTATE_CHARSET_MASK)
   followed by the charset ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1162
1163
/* Bitmasks for coding->eol_seen.  The non-zero values are distinct
   bits, so several of them can be OR-ed together when more than one
   end-of-line form has been seen.  */

#define EOL_SEEN_NONE   0       /* No end-of-line form seen.  */
#define EOL_SEEN_LF     1       /* A bare LF ('\n') was seen.  */
#define EOL_SEEN_CR     2       /* A CR ('\r') not followed by LF was seen.  */
#define EOL_SEEN_CRLF   4       /* A CR directly followed by LF was seen.  */
1170
1171 \f
1172 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1173
1174
1175
1176 \f
1177 /*** 3. UTF-8 ***/
1178
1179 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1180    Return true if a text is encoded in UTF-8.  */
1181
/* Classify the octet C by its high-order bits.  Each macro evaluates
   C once.  Note that UTF_8_1_OCTET_P is also true for a negative C;
   callers that can see negative values test for them separately.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)             /* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)   /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)   /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)   /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)   /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)   /* 111110xx */

/* The three octets of the UTF-8 signature (byte order mark), in the
   order in which they appear at the head of a text.  */
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1192
1193 /* Unlike the other detect_coding_XXX, this function counts number of
1194    characters and check EOL format.  */
1195
1196 static bool
1197 detect_coding_utf_8 (struct coding_system *coding,
1198                      struct coding_detection_info *detect_info)
1199 {
1200   const unsigned char *src = coding->source, *src_base;
1201   const unsigned char *src_end = coding->source + coding->src_bytes;
1202   bool multibytep = coding->src_multibyte;
1203   ptrdiff_t consumed_chars = 0;
1204   bool bom_found = 0;
1205   int nchars = coding->head_ascii;
1206   int eol_seen = coding->eol_seen;
1207
1208   detect_info->checked |= CATEGORY_MASK_UTF_8;
1209   /* A coding system of this category is always ASCII compatible.  */
1210   src += nchars;
1211
1212   if (src == coding->source     /* BOM should be at the head.  */
1213       && src + 3 < src_end      /* BOM is 3-byte long.  */
1214       && src[0] == UTF_8_BOM_1
1215       && src[1] == UTF_8_BOM_2
1216       && src[2] == UTF_8_BOM_3)
1217     {
1218       bom_found = 1;
1219       src += 3;
1220       nchars++;
1221     }
1222
1223   while (1)
1224     {
1225       int c, c1, c2, c3, c4;
1226
1227       src_base = src;
1228       ONE_MORE_BYTE (c);
1229       if (c < 0 || UTF_8_1_OCTET_P (c))
1230         {
1231           nchars++;
1232           if (c == '\r')
1233             {
1234               if (src < src_end && *src == '\n')
1235                 {
1236                   eol_seen |= EOL_SEEN_CRLF;
1237                   src++;
1238                   nchars++;
1239                 }
1240               else
1241                 eol_seen |= EOL_SEEN_CR;
1242             }
1243           else if (c == '\n')
1244             eol_seen |= EOL_SEEN_LF;
1245           continue;
1246         }
1247       ONE_MORE_BYTE (c1);
1248       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1249         break;
1250       if (UTF_8_2_OCTET_LEADING_P (c))
1251         {
1252           nchars++;
1253           continue;
1254         }
1255       ONE_MORE_BYTE (c2);
1256       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1257         break;
1258       if (UTF_8_3_OCTET_LEADING_P (c))
1259         {
1260           nchars++;
1261           continue;
1262         }
1263       ONE_MORE_BYTE (c3);
1264       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1265         break;
1266       if (UTF_8_4_OCTET_LEADING_P (c))
1267         {
1268           nchars++;
1269           continue;
1270         }
1271       ONE_MORE_BYTE (c4);
1272       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1273         break;
1274       if (UTF_8_5_OCTET_LEADING_P (c))
1275         {
1276           nchars++;
1277           continue;
1278         }
1279       break;
1280     }
1281   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1282   return 0;
1283
1284  no_more_source:
1285   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1286     {
1287       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1288       return 0;
1289     }
1290   if (bom_found)
1291     {
1292       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1293       detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1294     }
1295   else
1296     {
1297       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1298       if (nchars < src_end - coding->source)
1299         /* The found characters are less than source bytes, which
1300            means that we found a valid non-ASCII characters.  */
1301         detect_info->found |= CATEGORY_MASK_UTF_8_AUTO | CATEGORY_MASK_UTF_8_NOSIG;
1302     }
1303   coding->detected_utf8_chars = nchars;
1304   return 1;
1305 }
1306
1307
1308 static void
1309 decode_coding_utf_8 (struct coding_system *coding)
1310 {
1311   const unsigned char *src = coding->source + coding->consumed;
1312   const unsigned char *src_end = coding->source + coding->src_bytes;
1313   const unsigned char *src_base;
1314   int *charbuf = coding->charbuf + coding->charbuf_used;
1315   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1316   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1317   bool multibytep = coding->src_multibyte;
1318   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1319   bool eol_dos
1320     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1321   int byte_after_cr = -1;
1322
1323   if (bom != utf_without_bom)
1324     {
1325       int c1, c2, c3;
1326
1327       src_base = src;
1328       ONE_MORE_BYTE (c1);
1329       if (! UTF_8_3_OCTET_LEADING_P (c1))
1330         src = src_base;
1331       else
1332         {
1333           ONE_MORE_BYTE (c2);
1334           if (! UTF_8_EXTRA_OCTET_P (c2))
1335             src = src_base;
1336           else
1337             {
1338               ONE_MORE_BYTE (c3);
1339               if (! UTF_8_EXTRA_OCTET_P (c3))
1340                 src = src_base;
1341               else
1342                 {
1343                   if ((c1 != UTF_8_BOM_1)
1344                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1345                     src = src_base;
1346                   else
1347                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1348                 }
1349             }
1350         }
1351     }
1352   CODING_UTF_8_BOM (coding) = utf_without_bom;
1353
1354   while (1)
1355     {
1356       int c, c1, c2, c3, c4, c5;
1357
1358       src_base = src;
1359       consumed_chars_base = consumed_chars;
1360
1361       if (charbuf >= charbuf_end)
1362         {
1363           if (byte_after_cr >= 0)
1364             src_base--;
1365           break;
1366         }
1367
1368       /* In the simple case, rapidly handle ordinary characters */
1369       if (multibytep && ! eol_dos
1370           && charbuf < charbuf_end - 6 && src < src_end - 6)
1371         {
1372           while (charbuf < charbuf_end - 6 && src < src_end - 6)
1373             {
1374               c1 = *src;
1375               if (c1 & 0x80)
1376                 break;
1377               src++;
1378               consumed_chars++;
1379               *charbuf++ = c1;
1380
1381               c1 = *src;
1382               if (c1 & 0x80)
1383                 break;
1384               src++;
1385               consumed_chars++;
1386               *charbuf++ = c1;
1387
1388               c1 = *src;
1389               if (c1 & 0x80)
1390                 break;
1391               src++;
1392               consumed_chars++;
1393               *charbuf++ = c1;
1394
1395               c1 = *src;
1396               if (c1 & 0x80)
1397                 break;
1398               src++;
1399               consumed_chars++;
1400               *charbuf++ = c1;
1401             }
1402           /* If we handled at least one character, restart the main loop.  */
1403           if (src != src_base)
1404             continue;
1405         }
1406
1407       if (byte_after_cr >= 0)
1408         c1 = byte_after_cr, byte_after_cr = -1;
1409       else
1410         ONE_MORE_BYTE (c1);
1411       if (c1 < 0)
1412         {
1413           c = - c1;
1414         }
1415       else if (UTF_8_1_OCTET_P (c1))
1416         {
1417           if (eol_dos && c1 == '\r')
1418             ONE_MORE_BYTE (byte_after_cr);
1419           c = c1;
1420         }
1421       else
1422         {
1423           ONE_MORE_BYTE (c2);
1424           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1425             goto invalid_code;
1426           if (UTF_8_2_OCTET_LEADING_P (c1))
1427             {
1428               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1429               /* Reject overlong sequences here and below.  Encoders
1430                  producing them are incorrect, they can be misleading,
1431                  and they mess up read/write invariance.  */
1432               if (c < 128)
1433                 goto invalid_code;
1434             }
1435           else
1436             {
1437               ONE_MORE_BYTE (c3);
1438               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1439                 goto invalid_code;
1440               if (UTF_8_3_OCTET_LEADING_P (c1))
1441                 {
1442                   c = (((c1 & 0xF) << 12)
1443                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1444                   if (c < 0x800
1445                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1446                     goto invalid_code;
1447                 }
1448               else
1449                 {
1450                   ONE_MORE_BYTE (c4);
1451                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1452                     goto invalid_code;
1453                   if (UTF_8_4_OCTET_LEADING_P (c1))
1454                     {
1455                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1456                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1457                     if (c < 0x10000)
1458                       goto invalid_code;
1459                     }
1460                   else
1461                     {
1462                       ONE_MORE_BYTE (c5);
1463                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1464                         goto invalid_code;
1465                       if (UTF_8_5_OCTET_LEADING_P (c1))
1466                         {
1467                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1468                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1469                                | (c5 & 0x3F));
1470                           if ((c > MAX_CHAR) || (c < 0x200000))
1471                             goto invalid_code;
1472                         }
1473                       else
1474                         goto invalid_code;
1475                     }
1476                 }
1477             }
1478         }
1479
1480       *charbuf++ = c;
1481       continue;
1482
1483     invalid_code:
1484       src = src_base;
1485       consumed_chars = consumed_chars_base;
1486       ONE_MORE_BYTE (c);
1487       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1488       coding->errors++;
1489     }
1490
1491  no_more_source:
1492   coding->consumed_char += consumed_chars_base;
1493   coding->consumed = src_base - coding->source;
1494   coding->charbuf_used = charbuf - coding->charbuf;
1495 }
1496
1497
1498 static bool
1499 encode_coding_utf_8 (struct coding_system *coding)
1500 {
1501   bool multibytep = coding->dst_multibyte;
1502   int *charbuf = coding->charbuf;
1503   int *charbuf_end = charbuf + coding->charbuf_used;
1504   unsigned char *dst = coding->destination + coding->produced;
1505   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1506   ptrdiff_t produced_chars = 0;
1507   int c;
1508
1509   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1510     {
1511       ASSURE_DESTINATION (3);
1512       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1513       CODING_UTF_8_BOM (coding) = utf_without_bom;
1514     }
1515
1516   if (multibytep)
1517     {
1518       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1519
1520       while (charbuf < charbuf_end)
1521         {
1522           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1523
1524           ASSURE_DESTINATION (safe_room);
1525           c = *charbuf++;
1526           if (CHAR_BYTE8_P (c))
1527             {
1528               c = CHAR_TO_BYTE8 (c);
1529               EMIT_ONE_BYTE (c);
1530             }
1531           else
1532             {
1533               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1534               for (p = str; p < pend; p++)
1535                 EMIT_ONE_BYTE (*p);
1536             }
1537         }
1538     }
1539   else
1540     {
1541       int safe_room = MAX_MULTIBYTE_LENGTH;
1542
1543       while (charbuf < charbuf_end)
1544         {
1545           ASSURE_DESTINATION (safe_room);
1546           c = *charbuf++;
1547           if (CHAR_BYTE8_P (c))
1548             *dst++ = CHAR_TO_BYTE8 (c);
1549           else
1550             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1551           produced_chars++;
1552         }
1553     }
1554   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1555   coding->produced_char += produced_chars;
1556   coding->produced = dst - coding->destination;
1557   return 0;
1558 }
1559
1560
1561 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1562    Return true if a text is encoded in one of UTF-16 based coding systems.  */
1563
/* True if the bits of VAL selected by the mask 0xFC00 equal 0xD800,
   which for a 16-bit code unit is the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* True if the bits of VAL selected by the mask 0xFC00 equal 0xDC00,
   which for a 16-bit code unit is the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
1569
1570 /* Check whether the source text of CODING looks like UTF-16 and
        record the outcome in the `checked', `found' and `rejected' masks
        of DETECT_INFO.

        A leading FF FE or FE FF byte pair is taken as a byte order mark.
        Otherwise the distinct byte values seen at the first and at the
        second position of each byte pair are counted, and the matching
        *_NOSIG category is rejected once a count reaches 128.

        The function returns 1 only by reaching the `no_more_source'
        label, which happens after a BOM was recognized and, presumably,
        whenever TWO_MORE_BYTES runs out of input (its definition is not
        visible here -- confirm).  Every other path returns 0.  */
1571 static bool
1572 detect_coding_utf_16 (struct coding_system *coding,
1573                       struct coding_detection_info *detect_info)
1574 {
       /* SRC, SRC_END and MULTIBYTEP are not referenced by name below;
          presumably TWO_MORE_BYTES uses them implicitly.  */
1575   const unsigned char *src = coding->source;
1576   const unsigned char *src_end = coding->source + coding->src_bytes;
1577   bool multibytep = coding->src_multibyte;
1578   int c1, c2;
1579
1580   detect_info->checked |= CATEGORY_MASK_UTF_16;
       /* On the last block, an odd src_chars count rules out every
          UTF-16 category.  */
1581   if (coding->mode & CODING_MODE_LAST_BLOCK
1582       && (coding->src_chars & 1))
1583     {
1584       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1585       return 0;
1586     }
1587
       /* Fetch the first byte pair and look for a byte order mark.  */
1588   TWO_MORE_BYTES (c1, c2);
1589   if ((c1 == 0xFF) && (c2 == 0xFE))
1590     {
           /* FF FE: treated as a little-endian signature.  */
1591       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1592                              | CATEGORY_MASK_UTF_16_AUTO);
1593       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1594                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1595                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1596     }
1597   else if ((c1 == 0xFE) && (c2 == 0xFF))
1598     {
           /* FE FF: treated as a big-endian signature.  */
1599       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1600                              | CATEGORY_MASK_UTF_16_AUTO);
1601       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1602                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1603                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1604     }
1605   else if (c2 < 0)
1606     {
           /* TWO_MORE_BYTES left a negative value in C2; give up on
              UTF-16 altogether.  */
1607       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1608       return 0;
1609     }
1610   else
1611     {
1612       /* We check the dispersion of Eth and Oth bytes where E is even and
1613          O is odd.  If both are high, we assume binary data.*/
           /* E[B] / O[B] is nonzero once byte value B has been seen at
              the first / second position of a pair; E_NUM and O_NUM
              count the distinct values seen so far.  */
1614       unsigned char e[256], o[256];
1615       unsigned e_num = 1, o_num = 1;
1616
1617       memset (e, 0, 256);
1618       memset (o, 0, 256);
1619       e[c1] = 1;
1620       o[c2] = 1;
1621
           /* No signature was found, so the categories that were marked
              `found' in the BOM branches above are rejected here.  */
1622       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1623                                 |CATEGORY_MASK_UTF_16_BE
1624                                 | CATEGORY_MASK_UTF_16_LE);
1625
           /* Keep scanning until every UTF-16 category is rejected, a
              negative C2 is seen, or TWO_MORE_BYTES leaves the loop.  */
1626       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1627              != CATEGORY_MASK_UTF_16)
1628         {
1629           TWO_MORE_BYTES (c1, c2);
1630           if (c2 < 0)
1631             break;
1632           if (! e[c1])
1633             {
1634               e[c1] = 1;
1635               e_num++;
1636               if (e_num >= 128)
1637                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1638             }
1639           if (! o[c2])
1640             {
1641               o[c2] = 1;
1642               o_num++;
1643               if (o_num >= 128)
1644                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1645             }
1646         }
1647       return 0;
1648     }
1649
1650  no_more_source:
1651   return 1;
1652 }
1653 /* Decode the UTF-16 source bytes of CODING, starting at offset
        CODING->consumed, into code points appended to CODING->charbuf.

        A BOM is checked for (and skipped) only when the coding system's
        BOM type is utf_with_bom; for both utf_with_bom and
        utf_detect_bom the type is then switched to utf_without_bom.  A
        high surrogate waiting for its partner is kept in
        CODING_UTF_16_SURROGATE (CODING), so it survives across calls.

        The loop ends when CHARBUF is nearly full or when ONE_MORE_BYTE
        jumps to `no_more_source' (presumably on exhausted input -- the
        macro is defined elsewhere).  On exit CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are updated, the
        first two from the values saved at the start of the last loop
        iteration.  */
1654 static void
1655 decode_coding_utf_16 (struct coding_system *coding)
1656 {
1657   const unsigned char *src = coding->source + coding->consumed;
1658   const unsigned char *src_end = coding->source + coding->src_bytes;
1659   const unsigned char *src_base;
1660   int *charbuf = coding->charbuf + coding->charbuf_used;
1661   /* We may produce at most 3 chars in one loop.  */
1662   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1663   ptrdiff_t consumed_chars = 0, consumed_chars_base = 0;
1664   bool multibytep = coding->src_multibyte;
1665   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1666   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1667   int surrogate = CODING_UTF_16_SURROGATE (coding);
1668   bool eol_dos
1669     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* The two bytes fetched ahead after a CR when EOL_DOS is set;
          -1 when nothing is pending.  */
1670   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1671
1672   if (bom == utf_with_bom)
1673     {
1674       int c, c1, c2;
1675
1676       src_base = src;
1677       ONE_MORE_BYTE (c1);
1678       ONE_MORE_BYTE (c2);
           /* ONE_MORE_BYTE can yield a negative value (the main loop
              below tests for it).  Such a value is never a BOM, and
              left-shifting a negative int is undefined behavior, so map
              it to -1 instead of shifting.  */
1679       c = (c1 < 0 || c2 < 0) ? -1 : ((c1 << 8) | c2);
1680
1681       if (endian == utf_16_big_endian
1682           ? c != 0xFEFF : c != 0xFFFE)
1683         {
1684           /* The first two bytes are not BOM.  Treat them as bytes
1685              for a normal character.  */
1686           src = src_base;
1687           coding->errors++;
1688         }
1689       CODING_UTF_16_BOM (coding) = utf_without_bom;
1690     }
1691   else if (bom == utf_detect_bom)
1692     {
1693       /* We have already tried to detect BOM and failed in
1694          detect_coding.  */
1695       CODING_UTF_16_BOM (coding) = utf_without_bom;
1696     }
1697
1698   while (1)
1699     {
1700       int c, c1, c2;
1701
           /* Remember where this iteration started; these are the values
              reported back at `no_more_source'.  */
1702       src_base = src;
1703       consumed_chars_base = consumed_chars;
1704
1705       if (charbuf >= charbuf_end)
1706         {
               /* Two bytes were fetched ahead after a CR but not yet
                  decoded; step back so they are not counted as
                  consumed.  */
1707           if (byte_after_cr1 >= 0)
1708             src_base -= 2;
1709           break;
1710         }
1711
1712       if (byte_after_cr1 >= 0)
1713         c1 = byte_after_cr1, byte_after_cr1 = -1;
1714       else
1715         ONE_MORE_BYTE (c1);
1716       if (c1 < 0)
1717         {
               /* A negative value from ONE_MORE_BYTE is stored negated
                  as a single element.  */
1718           *charbuf++ = -c1;
1719           continue;
1720         }
1721       if (byte_after_cr2 >= 0)
1722         c2 = byte_after_cr2, byte_after_cr2 = -1;
1723       else
1724         ONE_MORE_BYTE (c2);
1725       if (c2 < 0)
1726         {
               /* C1 has no partner byte: store it alone (as a raw 8-bit
                  character unless it is ASCII), followed by -C2.  */
1727           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1728           *charbuf++ = -c2;
1729           continue;
1730         }
1731       c = (endian == utf_16_big_endian
1732            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1733
1734       if (surrogate)
1735         {
1736           if (! UTF_16_LOW_SURROGATE_P (c))
1737             {
                   /* The pending high surrogate is not followed by a low
                      one.  Store its two bytes in source byte order as
                      separate elements and count an error; then either
                      make C the new pending high surrogate or store C
                      itself.  */
1738               if (endian == utf_16_big_endian)
1739                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1740               else
1741                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1742               *charbuf++ = c1;
1743               *charbuf++ = c2;
1744               coding->errors++;
1745               if (UTF_16_HIGH_SURROGATE_P (c))
1746                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1747               else
1748                 *charbuf++ = c;
1749             }
1750           else
1751             {
                   /* Combine the surrogate pair into one code point at
                      or above 0x10000.  */
1752               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1753               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1754               *charbuf++ = 0x10000 + c;
1755             }
1756         }
1757       else
1758         {
1759           if (UTF_16_HIGH_SURROGATE_P (c))
1760             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1761           else
1762             {
1763               if (eol_dos && c == '\r')
1764                 {
                       /* Fetch the next two bytes before storing the CR;
                          if either fetch jumps to `no_more_source', the
                          CR is not stored and is not counted as
                          consumed.  */
1765                   ONE_MORE_BYTE (byte_after_cr1);
1766                   ONE_MORE_BYTE (byte_after_cr2);
1767                 }
1768               *charbuf++ = c;
1769             }
1770         }
1771     }
1772
1773  no_more_source:
1774   coding->consumed_char += consumed_chars_base;
1775   coding->consumed = src_base - coding->source;
1776   coding->charbuf_used = charbuf - coding->charbuf;
1777 }
1778 /* Encode the code points in CODING->charbuf as UTF-16 and append the
        bytes to CODING->destination, after the CODING->produced bytes
        already there.

        Unless the BOM type is utf_without_bom, a BOM is emitted first and
        the type is then switched to utf_without_bom.  Code points above
        MAX_UNICODE_CHAR are replaced by CODING->default_char; code points
        of 0x10000 and above are written as a surrogate pair.  Byte order
        follows CODING_UTF_16_ENDIAN (CODING).

        Returns 0 on the paths visible here.  ASSURE_DESTINATION is
        defined elsewhere and presumably deals with a full destination
        buffer -- confirm.  */
1779 static bool
1780 encode_coding_utf_16 (struct coding_system *coding)
1781 {
       /* MULTIBYTEP, DST_END and PRODUCED_CHARS are not updated by name
          in this body; presumably the EMIT_* and ASSURE_DESTINATION
          macros use them implicitly.  */
1782   bool multibytep = coding->dst_multibyte;
1783   int *charbuf = coding->charbuf;
1784   int *charbuf_end = charbuf + coding->charbuf_used;
1785   unsigned char *dst = coding->destination + coding->produced;
1786   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Amount passed to ASSURE_DESTINATION before each emit; the
          largest emit below is four bytes.  */
1787   int safe_room = 8;
1788   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1789   bool big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1790   ptrdiff_t produced_chars = 0;
1791   int c;
1792
1793   if (bom != utf_without_bom)
1794     {
1795       ASSURE_DESTINATION (safe_room);
1796       if (big_endian)
1797         EMIT_TWO_BYTES (0xFE, 0xFF);
1798       else
1799         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Record that the BOM has been written.  */
1800       CODING_UTF_16_BOM (coding) = utf_without_bom;
1801     }
1802
1803   while (charbuf < charbuf_end)
1804     {
1805       ASSURE_DESTINATION (safe_room);
1806       c = *charbuf++;
1807       if (c > MAX_UNICODE_CHAR)
1808         c = coding->default_char;
1809
1810       if (c < 0x10000)
1811         {
               /* One 16-bit code unit.  */
1812           if (big_endian)
1813             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1814           else
1815             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1816         }
1817       else
1818         {
               /* Split into a high surrogate C1 and a low surrogate
                  C2.  */
1819           int c1, c2;
1820
1821           c -= 0x10000;
1822           c1 = (c >> 10) + 0xD800;
1823           c2 = (c & 0x3FF) + 0xDC00;
1824           if (big_endian)
1825             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1826           else
1827             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1828         }
1829     }
1830   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1831   coding->produced = dst - coding->destination;
1832   coding->produced_char += produced_chars;
1833   return 0;
1834 }
1835
1836 \f
1837 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1838
1839 /* Emacs' internal format for representation of multiple character
1840    sets is a kind of multi-byte encoding, i.e. characters are
1841    represented by variable-length sequences of one-byte codes.
1842
1843    ASCII characters and control characters (e.g. `tab', `newline') are
1844    represented by one-byte sequences which are their ASCII codes, in
1845    the range 0x00 through 0x7F.
1846
1847    8-bit characters of the range 0x80..0x9F are represented by
1848    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1849    code + 0x20).
1850
1851    8-bit characters of the range 0xA0..0xFF are represented by
1852    one-byte sequences which are their 8-bit code.
1853
1854    The other characters are represented by a sequence of `base
1855    leading-code', optional `extended leading-code', and one or two
1856    `position-code's.  The length of the sequence is determined by the
1857    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1858    whereas extended leading-code and position-code take the range 0xA0
1859    through 0xFF.  See `charset.h' for more details about leading-code
1860    and position-code.
1861
1862    --- CODE RANGE of Emacs' internal format ---
1863    character set        range
1864    -------------        -----
1865    ascii                0x00..0x7F
1866    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1867    eight-bit-graphic    0xA0..0xFF
1868    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1869    ---------------------------------------------
1870
1871    As this is the internal character representation, the format is
1872    usually not used externally (i.e. in a file or in data sent to a
1873    process).  But, it is possible to have a text externally in this
1874    format (e.g. by encoding with the coding system `emacs-mule').
1875
1876    In that case, a sequence of one-byte codes has a slightly different
1877    form.
1878
1879    At first, all characters in eight-bit-control are represented by
1880    one-byte sequences which are their 8-bit code.
1881
1882    Next, character composition data are represented by the byte
1883    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1884    where,
1885         METHOD is 0xF2 plus one of composition method (enum
1886         composition_method),
1887
1888         BYTES is 0xA0 plus a byte length of this composition data,
1889
1890         CHARS is 0xA0 plus a number of characters composed by this
1891         data,
1892
1893         COMPONENTs are characters of multibyte form or composition
1894         rules encoded by two-byte of ASCII codes.
1895
1896    In addition, for backward compatibility, the following formats are
1897    also recognized as composition data on decoding.
1898
1899    0x80 MSEQ ...
1900    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1901
1902    Here,
1903         MSEQ is a multibyte form but in one of these special formats:
1904           ASCII: 0xA0 ASCII_CODE+0x80,
1905           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1906         RULE is a one byte code of the range 0xA0..0xF0 that
1907         represents a composition rule.
1908   */
1909 /* Table indexed by a byte value.  The code in this file treats
        emacs_mule_bytes[B] as the total byte length (1 to 4) of the
        emacs-mule sequence whose first byte is B.  There is no
        initializer here; presumably the table is filled in elsewhere.  */
1910 char emacs_mule_bytes[256];
1911
1912
1913 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1914    Return true if a text is encoded in 'emacs-mule'.

        The scan starts after the CODING->head_ascii leading bytes.  It
        breaks out of the loop, rejects CATEGORY_MASK_EMACS_MULE and
        returns 0 on ESC, SI or SO, on a 0x80-introduced run that is too
        short (SRC - SRC_START <= 4), or on a leading byte that is not
        followed by as many bytes >= 0xA0 as emacs_mule_bytes calls for.
        At `no_more_source' it also rejects when this is the last block
        and bytes were read since SRC_BASE was last set; otherwise it
        merges FOUND into DETECT_INFO->found and returns 1.  */
1915
1916 static bool
1917 detect_coding_emacs_mule (struct coding_system *coding,
1918                           struct coding_detection_info *detect_info)
1919 {
1920   const unsigned char *src = coding->source, *src_base;
1921   const unsigned char *src_end = coding->source + coding->src_bytes;
1922   bool multibytep = coding->src_multibyte;
1923   ptrdiff_t consumed_chars = 0;
1924   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once a well-formed multibyte
          or composition sequence has been seen.  */
1925   int found = 0;
1926
1927   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1928   /* A coding system of this category is always ASCII compatible.  */
1929   src += coding->head_ascii;
1930
1931   while (1)
1932     {
1933       src_base = src;
1934       ONE_MORE_BYTE (c);
1935       if (c < 0)
1936         continue;
1937       if (c == 0x80)
1938         {
1939           /* Perhaps the start of composite character.  We simply skip
1940              it because analyzing it is too heavy for detecting.  But,
1941              at least, we check that the composite character
1942              consists of more than 4 bytes.  */
1943           const unsigned char *src_start;
1944
1945         repeat:
1946           src_start = src;
               /* Skip bytes up to and including the first one below
                  0xA0.  */
1947           do
1948             {
1949               ONE_MORE_BYTE (c);
1950             }
1951           while (c >= 0xA0);
1952
1953           if (src - src_start <= 4)
1954             break;
1955           found = CATEGORY_MASK_EMACS_MULE;
               /* The byte that ended the run starts another
                  composition.  */
1956           if (c == 0x80)
1957             goto repeat;
1958         }
1959
1960       if (c < 0x80)
1961         {
1962           if (c < 0x20
1963               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1964             break;
1965         }
1966       else
1967         {
               /* C is a leading byte; the table gives the total sequence
                  length, so this many bytes must follow.  */
1968           int more_bytes = emacs_mule_bytes[c] - 1;
1969
1970           while (more_bytes > 0)
1971             {
1972               ONE_MORE_BYTE (c);
1973               if (c < 0xA0)
1974                 {
1975                   src--;        /* Unread the last byte.  */
1976                   break;
1977                 }
1978               more_bytes--;
1979             }
               /* Nonzero here means the sequence was cut short by a byte
                  below 0xA0.  */
1980           if (more_bytes != 0)
1981             break;
1982           found = CATEGORY_MASK_EMACS_MULE;
1983         }
1984     }
1985   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1986   return 0;
1987
1988  no_more_source:
1989   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1990     {
1991       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1992       return 0;
1993     }
1994   detect_info->found |= found;
1995   return 1;
1996 }
1997
1998
1999 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2000    character.  If CMP_STATUS indicates that we must expect MSEQ or
2001    RULE described above, decode it and return the negative value of
2002    the decoded character or rule.  If an invalid byte is found, return
2003    -1.  If SRC is too short, return -2.

        On the non-error paths *NBYTES receives the number of bytes read
        from SRC and *NCHARS the value of the local CONSUMED_CHARS
        (presumably advanced by ONE_MORE_BYTE, which is defined
        elsewhere).  If ID is non-null, *ID receives the charset id,
        except on the path that returns a RULE, where it is left
        untouched.  */
2004
2005 static int
2006 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
2007                  int *nbytes, int *nchars, int *id,
2008                  struct composition_status *cmp_status)
2009 {
2010   const unsigned char *src_end = coding->source + coding->src_bytes;
2011   const unsigned char *src_base = src;
2012   bool multibytep = coding->src_multibyte;
2013   int charset_ID;
2014   unsigned code;
2015   int c;
2016   int consumed_chars = 0;
       /* Set when the character was read in the old-form MSEQ
          representation; the result is then returned negated.  */
2017   bool mseq_found = 0;
2018
2019   ONE_MORE_BYTE (c);
2020   if (c < 0)
2021     {
           /* ONE_MORE_BYTE yielded a negative value: its negation is
              returned as the character, with the charset id taken from
              emacs_mule_charset[0].  */
2022       c = -c;
2023       charset_ID = emacs_mule_charset[0];
2024     }
2025   else
2026     {
2027       if (c >= 0xA0)
2028         {
               /* A first byte of 0xA0 or above is accepted only while an
                  old-form composition is being decoded.  */
2029           if (cmp_status->state != COMPOSING_NO
2030               && cmp_status->old_form)
2031             {
2032               if (cmp_status->state == COMPOSING_CHAR)
2033                 {
                       /* MSEQ: 0xA0 is followed by ASCII_CODE + 0x80;
                          any other byte is LEADING_CODE + 0x20.  */
2034                   if (c == 0xA0)
2035                     {
2036                       ONE_MORE_BYTE (c);
2037                       c -= 0x80;
2038                       if (c < 0)
2039                         goto invalid_code;
2040                     }
2041                   else
2042                     c -= 0x20;
2043                   mseq_found = 1;
2044                 }
2045               else
2046                 {
                       /* In any other composing state the byte is
                          returned negated, as a rule.  */
2047                   *nbytes = src - src_base;
2048                   *nchars = consumed_chars;
2049                   return -c;
2050                 }
2051             }
2052           else
2053             goto invalid_code;
2054         }
2055
           /* Dispatch on the total sequence length for leading byte C.
              Every following byte must be 0xA0 or above.  */
2056       switch (emacs_mule_bytes[c])
2057         {
2058         case 2:
               /* Leading byte selects the charset; one position byte.  */
2059           if ((charset_ID = emacs_mule_charset[c]) < 0)
2060             goto invalid_code;
2061           ONE_MORE_BYTE (c);
2062           if (c < 0xA0)
2063             goto invalid_code;
2064           code = c & 0x7F;
2065           break;
2066
2067         case 3:
2068           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2069               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2070             {
                   /* The second byte selects the charset; one position
                      byte follows.  */
2071               ONE_MORE_BYTE (c);
2072               if (c < 0xA0 || (charset_ID = emacs_mule_charset[c]) < 0)
2073                 goto invalid_code;
2074               ONE_MORE_BYTE (c);
2075               if (c < 0xA0)
2076                 goto invalid_code;
2077               code = c & 0x7F;
2078             }
2079           else
2080             {
                   /* Leading byte selects the charset; two position
                      bytes follow.  */
2081               if ((charset_ID = emacs_mule_charset[c]) < 0)
2082                 goto invalid_code;
2083               ONE_MORE_BYTE (c);
2084               if (c < 0xA0)
2085                 goto invalid_code;
2086               code = (c & 0x7F) << 8;
2087               ONE_MORE_BYTE (c);
2088               if (c < 0xA0)
2089                 goto invalid_code;
2090               code |= c & 0x7F;
2091             }
2092           break;
2093
2094         case 4:
               /* The second byte selects the charset; two position
                  bytes follow.  */
2095           ONE_MORE_BYTE (c);
2096           if (c < 0 || (charset_ID = emacs_mule_charset[c]) < 0)
2097             goto invalid_code;
2098           ONE_MORE_BYTE (c);
2099           if (c < 0xA0)
2100             goto invalid_code;
2101           code = (c & 0x7F) << 8;
2102           ONE_MORE_BYTE (c);
2103           if (c < 0xA0)
2104             goto invalid_code;
2105           code |= c & 0x7F;
2106           break;
2107
2108         case 1:
               /* A single byte stands for itself.  */
2109           code = c;
2110           charset_ID = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2111           break;
2112
2113         default:
2114           emacs_abort ();
2115         }
           /* Turn (charset, CODE) into a character in C; a negative
              result is treated as an invalid sequence.  */
2116       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2117                           CHARSET_FROM_ID (charset_ID), code, c);
2118       if (c < 0)
2119         goto invalid_code;
2120     }
2121   *nbytes = src - src_base;
2122   *nchars = consumed_chars;
2123   if (id)
2124     *id = charset_ID;
       /* NOTE(review): if MSEQ_FOUND is set and the decoded character
          were 1 or 2, this result would coincide with the -1 / -2 error
          codes below -- confirm that cannot occur.  */
2125   return (mseq_found ? -c : c);
2126
2127  no_more_source:
2128   return -2;
2129
2130  invalid_code:
2131   return -1;
2132 }
2133
2134
2135 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2136
2137 /* Handle these composition sequence ('|': the end of header elements,
2138    BYTES and CHARS >= 0xA0):
2139
2140    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2141    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2142    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2143
2144    and these old forms:
2145
2146    (4) relative composition: 0x80 | MSEQ ... MSEQ
2147    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2148
2149    When the starter 0x80 and the following header elements are found,
2150    this annotation header is produced.
2151
2152         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2153
2154    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2155    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2156
2157    Then, upon reading the following elements, these codes are produced
2158    until the composition end is found:
2159
2160    (1) CHAR ... CHAR
2161    (2) ALT ... ALT CHAR ... CHAR
2162    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2163    (4) CHAR ... CHAR
2164    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2165
2166    When the composition end is found, LENGTH and NCHARS in the
2167    annotation header are updated as below:
2168
2169    (1) LENGTH: unchanged, NCHARS: unchanged
2170    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2171    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2172    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2173    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2174
2175    If an error is found while composing, the annotation header is
2176    changed to the original composition header (plus filler -1s) as
2177    below:
2178
2179    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2180    (5)          [ 0x80 0xFF -1 -1 -1 ]
2181
2182    and the sequence [ -2 DECODED-RULE ] is changed to the original
2183    byte sequence as below:
2184         o the original byte sequence is B: [ B -1 ]
2185         o the original byte sequence is B1 B2: [ B1 B2 ]
2186
2187    Most of the routines are implemented by macros because many
2188    variables and labels in the caller decode_coding_emacs_mule must be
2189    accessible, and they are usually called just once (thus don't
2190    increase the size of compiled object).  */
2191
2192 /* Decode a composition rule represented by C as a component of
2193    composition sequence of Emacs 20 style.  Set RULE to the decoded
2194    rule.  C is modified: 0xA0 is subtracted from it, and the result
        must lie in 0..80 or control jumps to the `invalid_code' label of
        the enclosing function.  C / 9 and C % 9 give the two values
        passed to COMPOSITION_ENCODE_RULE, with a value of 4 replaced by
        10 in each.  */
2195
2196 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2197   do {                                                  \
2198     int gref, nref;                                     \
2199                                                         \
2200     c -= 0xA0;                                          \
2201     if (c < 0 || c >= 81)                               \
2202       goto invalid_code;                                \
2203     gref = c / 9, nref = c % 9;                         \
2204     if (gref == 4) gref = 10;                           \
2205     if (nref == 4) nref = 10;                           \
2206     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2207   } while (0)
2208
2209
2210 /* Decode a composition rule represented by C and the following byte
2211    at SRC as a component of composition sequence of Emacs 21 style.
2212    Set RULE to the decoded rule.  Each of the two bytes has 0x20
        subtracted and must then lie in 0..80, otherwise control jumps to
        the `invalid_code' label of the enclosing function.  C is
        overwritten with the second byte by ONE_MORE_BYTE.  */
2213
2214 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2215   do {                                                  \
2216     int gref, nref;                                     \
2217                                                         \
2218     gref = c - 0x20;                                    \
2219     if (gref < 0 || gref >= 81)                         \
2220       goto invalid_code;                                \
2221     ONE_MORE_BYTE (c);                                  \
2222     nref = c - 0x20;                                    \
2223     if (nref < 0 || nref >= 81)                         \
2224       goto invalid_code;                                \
2225     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2226   } while (0)
2227
2228
2229 /* Start of Emacs 21 style format.  C already holds the method byte
2230    (0xF2 + METHOD).  The next two bytes at SRC are BYTES (0xA0 + the
2231    byte length of this composition information) and CHARS (0xA0 + the
2232    number of characters composed by this composition).

        Control jumps to `invalid_code' if the BYTES byte is negative, if
        the byte length is below 3 (or, for COMPOSITION_RELATIVE, is not
        exactly 4), or if the character count is not in
        1..MAX_COMPOSITION_COMPONENTS - 1.  Otherwise CMP_STATUS is
        initialized for a new-form composition and ADD_COMPOSITION_DATA
        is called on CHARBUF with the character count, byte length and
        method.  */
2233
2234 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2235   do {                                                                  \
2236     enum composition_method method = c - 0xF2;                          \
2237     int nbytes, nchars;                                                 \
2238                                                                         \
2239     ONE_MORE_BYTE (c);                                                  \
2240     if (c < 0)                                                          \
2241       goto invalid_code;                                                \
2242     nbytes = c - 0xA0;                                                  \
2243     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2244       goto invalid_code;                                                \
2245     ONE_MORE_BYTE (c);                                                  \
2246     nchars = c - 0xA0;                                                  \
2247     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2248       goto invalid_code;                                                \
2249     cmp_status->old_form = 0;                                           \
2250     cmp_status->method = method;                                        \
2251     if (method == COMPOSITION_RELATIVE)                                 \
2252       cmp_status->state = COMPOSING_CHAR;                               \
2253     else                                                                \
2254       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2255     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2256     cmp_status->nchars = nchars;                                        \
2257     cmp_status->ncomps = nbytes - 4;                                    \
2258     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2259   } while (0)
2260
2261
2262 /* Start of Emacs 20 style format for relative composition.  Reads no
        input: it marks CMP_STATUS as an old-form COMPOSITION_RELATIVE
        composition in state COMPOSING_CHAR with zero nchars and ncomps,
        and calls ADD_COMPOSITION_DATA on CHARBUF with a character count
        and byte length of 0.  */
2263
2264 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2265   do {                                                          \
2266     cmp_status->old_form = 1;                                   \
2267     cmp_status->method = COMPOSITION_RELATIVE;                  \
2268     cmp_status->state = COMPOSING_CHAR;                         \
2269     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2270     cmp_status->nchars = cmp_status->ncomps = 0;                \
2271     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2272   } while (0)
2273
2274
2275 /* Start of Emacs 20 style format for rule-base composition.  Reads
        no input: it marks CMP_STATUS as an old-form
        COMPOSITION_WITH_RULE composition in state COMPOSING_CHAR with
        zero nchars and ncomps, and calls ADD_COMPOSITION_DATA on CHARBUF
        with a character count and byte length of 0.  */
2276
2277 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2278   do {                                                          \
2279     cmp_status->old_form = 1;                                   \
2280     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2281     cmp_status->state = COMPOSING_CHAR;                         \
2282     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2283     cmp_status->nchars = cmp_status->ncomps = 0;                \
2284     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2285   } while (0)
2286
2287 /* Read the byte that follows a composition starter into C and
        dispatch on it.  A negative byte, a byte below 0xA0, or any byte
        not matched below jumps to `invalid_code'.
          - 0xF2 + a method in COMPOSITION_RELATIVE ..
            COMPOSITION_WITH_RULE_ALTCHARS: Emacs 21 style header.
          - 0xA0..0xBF: Emacs 20 style relative composition; SRC is
            rewound so the byte is read again as a component.
          - 0xFF: Emacs 20 style rule-base composition.
        The method test comes first, so it takes precedence over the
        range tests.  */
2288 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2289   do {                                                  \
2290     const unsigned char *current_src = src;             \
2291                                                         \
2292     ONE_MORE_BYTE (c);                                  \
2293     if (c < 0)                                          \
2294       goto invalid_code;                                \
2295     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2296         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2297       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2298     else if (c < 0xA0)                                  \
2299       goto invalid_code;                                \
2300     else if (c < 0xC0)                                  \
2301       {                                                 \
2302         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2303         /* Re-read C as a composition component.  */    \
2304         src = current_src;                              \
2305       }                                                 \
2306     else if (c == 0xFF)                                 \
2307       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2308     else                                                \
2309       goto invalid_code;                                \
2310   } while (0)
2311 /* Close the composition being decoded.  The annotation header is
        addressed as CHARBUF[-CMP_STATUS->length] onward.  For an old-form
        composition the third header element is set to
        CMP_STATUS->nchars; for a new-form composition whose method is
        above COMPOSITION_RELATIVE the first header element is set to
        the third element minus CMP_STATUS->length.  The state is then
        reset to COMPOSING_NO.  */
2312 #define EMACS_MULE_COMPOSITION_END()                            \
2313   do {                                                          \
2314     int idx = - cmp_status->length;                             \
2315                                                                 \
2316     if (cmp_status->old_form)                                   \
2317       charbuf[idx + 2] = cmp_status->nchars;                    \
2318     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2319       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2320     cmp_status->state = COMPOSING_NO;                           \
2321   } while (0)
2322
2323 /* Finish the composition described by CMP_STATUS, whose annotation
        header starts CMP_STATUS->length elements before CHARBUF.

        For an old-form composition that already holds at least one
        character, the header's third element is set to the character
        count and the composition is kept.  Otherwise the header
        elements are overwritten with the original header bytes as raw
        8-bit characters plus filler.  In both cases the state is reset
        to COMPOSING_NO.

        Returns NEW_CHARS, which the caller
        (EMACS_MULE_MAYBE_FINISH_COMPOSITION) adds to its character
        offset.  */
2324 static int
2325 emacs_mule_finish_composition (int *charbuf,
2326                                struct composition_status *cmp_status)
2327 {
2328   int idx = - cmp_status->length;
2329   int new_chars;
2330
2331   if (cmp_status->old_form && cmp_status->nchars > 0)
2332     {
2333       charbuf[idx + 2] = cmp_status->nchars;
2334       new_chars = 0;
2335       if (cmp_status->method == COMPOSITION_WITH_RULE
2336           && cmp_status->state == COMPOSING_CHAR)
2337         {
2338           /* The last rule was invalid.  */
               /* Rebuild the rule byte from the last stored element and
                  put it back as a raw 8-bit character, followed by a
                  -1 filler.  */
2339           int rule = charbuf[-1] + 0xA0;
2340
2341           charbuf[-2] = BYTE8_TO_CHAR (rule);
2342           charbuf[-1] = -1;
2343           new_chars = 1;
2344         }
2345     }
2346   else
2347     {
           /* Replace the annotation header by the bytes it was made
              from, beginning with the starter 0x80.  */
2348       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2349
2350       if (cmp_status->method == COMPOSITION_WITH_RULE)
2351         {
               /* NOTE(review): two characters (0x80 and 0xFF) are stored
                  on this path but NEW_CHARS is set to 1 -- confirm this
                  is intended.  */
2352           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2353           charbuf[idx++] = -3;
2354           charbuf[idx++] = 0;
2355           new_chars = 1;
2356         }
2357       else
2358         {
               /* Read the two header elements before the stores below
                  overwrite them.  */
2359           int nchars = charbuf[idx + 1] + 0xA0;
2360           int nbytes = charbuf[idx + 2] + 0xA0;
2361
2362           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2363           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2364           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2365           charbuf[idx++] = -1;
2366           new_chars = 4;
2367         }
2368     }
2369   cmp_status->state = COMPOSING_NO;
2370   return new_chars;
2371 }
2372 /* If a composition is in progress, finish it with
        emacs_mule_finish_composition and add the value it returns to
        CHAR_OFFSET.  */
2373 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2374   do {                                                                    \
2375     if (cmp_status->state != COMPOSING_NO)                                \
2376       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2377   } while (0)
2378
2379
2380 static void
2381 decode_coding_emacs_mule (struct coding_system *coding)
2382 {
2383   const unsigned char *src = coding->source + coding->consumed;
2384   const unsigned char *src_end = coding->source + coding->src_bytes;
2385   const unsigned char *src_base;
2386   int *charbuf = coding->charbuf + coding->charbuf_used;
2387   /* We may produce two annotations (charset and composition) in one
2388      loop and one more charset annotation at the end.  */
2389   int *charbuf_end
2390     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3)
2391       /* We can produce up to 2 characters in a loop.  */
2392       - 1;
2393   ptrdiff_t consumed_chars = 0, consumed_chars_base;
2394   bool multibytep = coding->src_multibyte;
2395   ptrdiff_t char_offset = coding->produced_char;
2396   ptrdiff_t last_offset = char_offset;
2397   int last_id = charset_ascii;
2398   bool eol_dos
2399     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2400   int byte_after_cr = -1;
2401   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2402
2403   if (cmp_status->state != COMPOSING_NO)
2404     {
2405       int i;
2406
2407       if (charbuf_end - charbuf < cmp_status->length)
2408         emacs_abort ();
2409       for (i = 0; i < cmp_status->length; i++)
2410         *charbuf++ = cmp_status->carryover[i];
2411       coding->annotated = 1;
2412     }
2413
2414   while (1)
2415     {
2416       int c, id IF_LINT (= 0);
2417
2418       src_base = src;
2419       consumed_chars_base = consumed_chars;
2420
2421       if (charbuf >= charbuf_end)
2422         {
2423           if (byte_after_cr >= 0)
2424             src_base--;
2425           break;
2426         }
2427
2428       if (byte_after_cr >= 0)
2429         c = byte_after_cr, byte_after_cr = -1;
2430       else
2431         ONE_MORE_BYTE (c);
2432
2433       if (c < 0 || c == 0x80)
2434         {
2435           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2436           if (c < 0)
2437             {
2438               *charbuf++ = -c;
2439               char_offset++;
2440             }
2441           else
2442             DECODE_EMACS_MULE_COMPOSITION_START ();
2443           continue;
2444         }
2445
2446       if (c < 0x80)
2447         {
2448           if (eol_dos && c == '\r')
2449             ONE_MORE_BYTE (byte_after_cr);
2450           id = charset_ascii;
2451           if (cmp_status->state != COMPOSING_NO)
2452             {
2453               if (cmp_status->old_form)
2454                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2455               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2456                 cmp_status->ncomps--;
2457             }
2458         }
2459       else
2460         {
2461           int nchars IF_LINT (= 0), nbytes IF_LINT (= 0);
2462           /* emacs_mule_char can load a charset map from a file, which
2463              allocates a large structure and might cause buffer text
2464              to be relocated as result.  Thus, we need to remember the
2465              original pointer to buffer text, and fix up all related
2466              pointers after the call.  */
2467           const unsigned char *orig = coding->source;
2468           ptrdiff_t offset;
2469
2470           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2471                                cmp_status);
2472           offset = coding->source - orig;
2473           if (offset)
2474             {
2475               src += offset;
2476               src_base += offset;
2477               src_end += offset;
2478             }
2479           if (c < 0)
2480             {
2481               if (c == -1)
2482                 goto invalid_code;
2483               if (c == -2)
2484                 break;
2485             }
2486           src = src_base + nbytes;
2487           consumed_chars = consumed_chars_base + nchars;
2488           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2489             cmp_status->ncomps -= nchars;
2490         }
2491
2492       /* Now if C >= 0, we found a normally encoded character, if C <
2493          0, we found an old-style composition component character or
2494          rule.  */
2495
2496       if (cmp_status->state == COMPOSING_NO)
2497         {
2498           if (last_id != id)
2499             {
2500               if (last_id != charset_ascii)
2501                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2502                                   last_id);
2503               last_id = id;
2504               last_offset = char_offset;
2505             }
2506           *charbuf++ = c;
2507           char_offset++;
2508         }
2509       else if (cmp_status->state == COMPOSING_CHAR)
2510         {
2511           if (cmp_status->old_form)
2512             {
2513               if (c >= 0)
2514                 {
2515                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2516                   *charbuf++ = c;
2517                   char_offset++;
2518                 }
2519               else
2520                 {
2521                   *charbuf++ = -c;
2522                   cmp_status->nchars++;
2523                   cmp_status->length++;
2524                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2525                     EMACS_MULE_COMPOSITION_END ();
2526                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2527                     cmp_status->state = COMPOSING_RULE;
2528                 }
2529             }
2530           else
2531             {
2532               *charbuf++ = c;
2533               cmp_status->length++;
2534               cmp_status->nchars--;
2535               if (cmp_status->nchars == 0)
2536                 EMACS_MULE_COMPOSITION_END ();
2537             }
2538         }
2539       else if (cmp_status->state == COMPOSING_RULE)
2540         {
2541           int rule;
2542
2543           if (c >= 0)
2544             {
2545               EMACS_MULE_COMPOSITION_END ();
2546               *charbuf++ = c;
2547               char_offset++;
2548             }
2549           else
2550             {
2551               c = -c;
2552               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2553               if (rule < 0)
2554                 goto invalid_code;
2555               *charbuf++ = -2;
2556               *charbuf++ = rule;
2557               cmp_status->length += 2;
2558               cmp_status->state = COMPOSING_CHAR;
2559             }
2560         }
2561       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2562         {
2563           *charbuf++ = c;
2564           cmp_status->length++;
2565           if (cmp_status->ncomps == 0)
2566             cmp_status->state = COMPOSING_CHAR;
2567           else if (cmp_status->ncomps > 0)
2568             {
2569               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2570                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2571             }
2572           else
2573             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2574         }
2575       else                      /* COMPOSING_COMPONENT_RULE */
2576         {
2577           int rule;
2578
2579           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2580           if (rule < 0)
2581             goto invalid_code;
2582           *charbuf++ = -2;
2583           *charbuf++ = rule;
2584           cmp_status->length += 2;
2585           cmp_status->ncomps--;
2586           if (cmp_status->ncomps > 0)
2587             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2588           else
2589             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2590         }
2591       continue;
2592
2593     invalid_code:
2594       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2595       src = src_base;
2596       consumed_chars = consumed_chars_base;
2597       ONE_MORE_BYTE (c);
2598       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2599       char_offset++;
2600       coding->errors++;
2601     }
2602
2603  no_more_source:
2604   if (cmp_status->state != COMPOSING_NO)
2605     {
2606       if (coding->mode & CODING_MODE_LAST_BLOCK)
2607         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2608       else
2609         {
2610           int i;
2611
2612           charbuf -= cmp_status->length;
2613           for (i = 0; i < cmp_status->length; i++)
2614             cmp_status->carryover[i] = charbuf[i];
2615         }
2616     }
2617   if (last_id != charset_ascii)
2618     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2619   coding->consumed_char += consumed_chars_base;
2620   coding->consumed = src_base - coding->source;
2621   coding->charbuf_used = charbuf - coding->charbuf;
2622 }
2623
2624
/* Store in CODES (an array of at least two bytes) the emacs-mule
   leading code(s) for the charset whose emacs-mule id is ID.

   An ID below 0xA0 is stored as the only leading code and CODES[1]
   is set to 0.  A larger ID is stored in CODES[1] and CODES[0] gets
   a prefix chosen by range: 0x9A for 0xA0..0xDF, 0x9B for
   0xE0..0xEF, 0x9C for 0xF0..0xF4, and 0x9D otherwise.

   ID may be evaluated more than once, so it must not have side
   effects.  Both arguments are parenthesized in the expansion, and
   the expansion has no trailing semicolon so that a call followed by
   `;' is exactly one statement (safe in front of `else').  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2638
2639
/* Encode the contents of CODING->charbuf (CODING->charbuf_used
   elements: characters interleaved with annotations) in the
   emacs-mule format, writing at CODING->destination starting at
   offset CODING->produced.

   On completion, record CODING_RESULT_SUCCESS, add the local
   produced_chars count to CODING->produced_char, set CODING->produced
   from the final destination pointer, and return 0 (this function
   has no other return value).

   NOTE(review): the helper macros used below (ASSURE_DESTINATION,
   EMIT_ONE_BYTE, EMIT_ONE_ASCII_BYTE, CODING_CHAR_CHARSET, ...) are
   defined elsewhere; they presumably use the locals dst, dst_end,
   charbuf, charbuf_end, multibytep and produced_chars by name, so
   those names should not be changed without checking the macros.  */
2640 static bool
2641 encode_coding_emacs_mule (struct coding_system *coding)
2642 {
2643   bool multibytep = coding->dst_multibyte;
2644   int *charbuf = coding->charbuf;
2645   int *charbuf_end = charbuf + coding->charbuf_used;
2646   unsigned char *dst = coding->destination + coding->produced;
2647   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each element is handled;
          presumably an upper bound on the bytes one iteration emits.  */
2648   int safe_room = 8;
2649   ptrdiff_t produced_chars = 0;
2650   Lisp_Object attrs, charset_list;
2651   int c;
       /* Charset id requested by the most recent charset annotation,
          or -1 when there is no usable preference.  */
2652   int preferred_charset_id = -1;
2653
2654   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode against Vemacs_mule_charset_list; if the coding
          system's attributes name another list, overwrite it.  */
2655   if (! EQ (charset_list, Vemacs_mule_charset_list))
2656     {
2657       charset_list = Vemacs_mule_charset_list;
2658       ASET (attrs, coding_attr_charset_list, charset_list);
2659     }
2660
2661   while (charbuf < charbuf_end)
2662     {
2663       ASSURE_DESTINATION (safe_room);
2664       c = *charbuf++;
2665
2666       if (c < 0)
2667         {
2668           /* Handle an annotation.  */
               /* CHARBUF now points just past the negative element C;
                  the element there selects the annotation type.  */
2669           switch (*charbuf)
2670             {
2671             case CODING_ANNOTATE_COMPOSITION_MASK:
2672               /* Not yet implemented.  */
2673               break;
2674             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Take the charset id from charbuf[3], but keep it
                      only if it is a member of CHARSET_LIST.  */
2675               preferred_charset_id = charbuf[3];
2676               if (preferred_charset_id >= 0
2677                   && NILP (Fmemq (make_number (preferred_charset_id),
2678                                   charset_list)))
2679                 preferred_charset_id = -1;
2680               break;
2681             default:
2682               emacs_abort ();
2683             }
               /* -C is treated as the annotation's total length in
                  charbuf elements; one was consumed above, so this
                  skips the remainder.  */
2684           charbuf += -c - 1;
2685           continue;
2686         }
2687
2688       if (ASCII_CHAR_P (c))
2689         EMIT_ONE_ASCII_BYTE (c);
2690       else if (CHAR_BYTE8_P (c))
2691         {
2692           c = CHAR_TO_BYTE8 (c);
2693           EMIT_ONE_BYTE (c);
2694         }
2695       else
2696         {
2697           struct charset *charset;
2698           unsigned code;
2699           int dimension;
2700           int emacs_mule_id;
2701           unsigned char leading_codes[2];
2702
               /* Try the annotated charset first; fall back to a
                  search of CHARSET_LIST when it cannot encode C.  */
2703           if (preferred_charset_id >= 0)
2704             {
2705               bool result;
2706
2707               charset = CHARSET_FROM_ID (preferred_charset_id);
2708               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
2709               if (result)
2710                 code = ENCODE_CHAR (charset, c);
2711               else
2712                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2713                                      &code, charset);
2714             }
2715           else
2716             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2717                                  &code, charset);
               /* No charset was found for C: substitute the coding
                  system's default character.  */
2718           if (! charset)
2719             {
2720               c = coding->default_char;
2721               if (ASCII_CHAR_P (c))
2722                 {
2723                   EMIT_ONE_ASCII_BYTE (c);
2724                   continue;
2725                 }
2726               CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
2727                                    &code, charset);
                   /* NOTE(review): CHARSET is not re-checked here before
                      it is dereferenced below; confirm the default
                      character is always encodable.  */
2728             }
2729           dimension = CHARSET_DIMENSION (charset);
2730           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2731           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
               /* One or two leading codes, then the code point with the
                  high bit of each byte set (one byte for dimension 1,
                  otherwise high byte followed by low byte).  */
2732           EMIT_ONE_BYTE (leading_codes[0]);
2733           if (leading_codes[1])
2734             EMIT_ONE_BYTE (leading_codes[1]);
2735           if (dimension == 1)
2736             EMIT_ONE_BYTE (code | 0x80);
2737           else
2738             {
2739               code |= 0x8080;
2740               EMIT_ONE_BYTE (code >> 8);
2741               EMIT_ONE_BYTE (code & 0xFF);
2742             }
2743         }
2744     }
2745   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2746   coding->produced_char += produced_chars;
2747   coding->produced = dst - coding->destination;
2748   return 0;
2749 }
2750
2751 \f
2752 /*** 7. ISO2022 handlers ***/
2753
2754 /* The following note describes the coding system ISO2022 briefly.
2755    Since the intention of this note is to help understand the
2756    functions in this file, some parts are NOT ACCURATE or are OVERLY
2757    SIMPLIFIED.  For thorough understanding, please refer to the
2758    original document of ISO2022.  This is equivalent to the standard
2759    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2760
2761    ISO2022 provides many mechanisms to encode several character sets
2762    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2763    is encoded using bytes less than 128.  This may make the encoded
2764    text a little bit longer, but the text passes more easily through
2765    several types of gateway, some of which strip off the MSB (Most
2766    Significant Bit).
2767
2768    There are two kinds of character sets: control character sets and
2769    graphic character sets.  The former contain control characters such
2770    as `newline' and `escape' to provide control functions (control
2771    functions are also provided by escape sequences).  The latter
2772    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2773    two control character sets and many graphic character sets.
2774
2775    Graphic character sets are classified into one of the following
2776    four classes, according to the number of bytes (DIMENSION) and
2777    number of characters in one dimension (CHARS) of the set:
2778    - DIMENSION1_CHARS94
2779    - DIMENSION1_CHARS96
2780    - DIMENSION2_CHARS94
2781    - DIMENSION2_CHARS96
2782
2783    In addition, each character set is assigned an identification tag,
2784    unique for each set, called the "final character" (denoted as <F>
2785    hereafter).  The <F> of each character set is decided by ECMA(*)
2786    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2787    (0x30..0x3F are for private use only).
2788
2789    Note (*): ECMA = European Computer Manufacturers Association
2790
2791    Here are examples of graphic character sets [NAME(<F>)]:
2792         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2793         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2794         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2795         o DIMENSION2_CHARS96 -- none for the moment
2796
2797    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2798         C0 [0x00..0x1F] -- control character plane 0
2799         GL [0x20..0x7F] -- graphic character plane 0
2800         C1 [0x80..0x9F] -- control character plane 1
2801         GR [0xA0..0xFF] -- graphic character plane 1
2802
2803    A control character set is directly designated and invoked to C0 or
2804    C1 by an escape sequence.  The most common case is that:
2805    - ISO646's  control character set is designated/invoked to C0, and
2806    - ISO6429's control character set is designated/invoked to C1,
2807    and usually these designations/invocations are omitted in encoded
2808    text.  In a 7-bit environment, only C0 can be used, and a control
2809    character for C1 is encoded by an appropriate escape sequence to
2810    fit into the environment.  All control characters for C1 are
2811    defined to have corresponding escape sequences.
2812
2813    A graphic character set is at first designated to one of four
2814    graphic registers (G0 through G3), then these graphic registers are
2815    invoked to GL or GR.  These designations and invocations can be
2816    done independently.  The most common case is that G0 is invoked to
2817    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2818    these invocations and designations are omitted in encoded text.
2819    In a 7-bit environment, only GL can be used.
2820
2821    When a graphic character set of CHARS94 is invoked to GL, codes
2822    0x20 and 0x7F of the GL area work as control characters SPACE and
2823    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2824    be used.
2825
2826    There are two ways of invocation: locking-shift and single-shift.
2827    With locking-shift, the invocation lasts until the next different
2828    invocation, whereas with single-shift, the invocation affects the
2829    following character only and doesn't affect the locking-shift
2830    state.  Invocations are done by the following control characters or
2831    escape sequences:
2832
2833    ----------------------------------------------------------------------
2834    abbrev  function                  cntrl escape seq   description
2835    ----------------------------------------------------------------------
2836    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2837    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2838    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2839    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2840    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2841    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2842    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2843    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2844    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2845    ----------------------------------------------------------------------
2846    (*) These are not used by any known coding system.
2847
2848    Control characters for these functions are defined by macros
2849    ISO_CODE_XXX in `coding.h'.
2850
2851    Designations are done by the following escape sequences:
2852    ----------------------------------------------------------------------
2853    escape sequence      description
2854    ----------------------------------------------------------------------
2855    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2856    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2857    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2858    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2859    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2860    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2861    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2862    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2863    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2864    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2865    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2866    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2867    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2868    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2869    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2870    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2871    ----------------------------------------------------------------------
2872
2873    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2874    of dimension 1, chars 94, and final character <F>, etc...
2875
2876    Note (*): Although these designations are not allowed in ISO2022,
2877    Emacs accepts them on decoding, and produces them on encoding
2878    CHARS96 character sets in a coding system which is characterized as
2879    7-bit environment, non-locking-shift, and non-single-shift.
2880
2881    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2882    '(' must be omitted.  We refer to this as "short-form" hereafter.
2883
2884    Now you may notice that there are a lot of ways of encoding the
2885    same multilingual text in ISO2022.  Actually, there exist many
2886    coding systems such as Compound Text (used in X11's inter client
2887    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2888    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2889    localized platforms), and all of these are variants of ISO2022.
2890
2891    In addition to the above, Emacs handles two more kinds of escape
2892    sequences: ISO6429's direction specification and Emacs' private
2893    sequence for specifying character composition.
2894
2895    ISO6429's direction specification takes the following form:
2896         o CSI ']'      -- end of the current direction
2897         o CSI '0' ']'  -- end of the current direction
2898         o CSI '1' ']'  -- start of left-to-right text
2899         o CSI '2' ']'  -- start of right-to-left text
2900    The control character CSI (0x9B: control sequence introducer) is
2901    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2902
2903    Character composition specification takes the following form:
2904         o ESC '0' -- start relative composition
2905         o ESC '1' -- end composition
2906         o ESC '2' -- start rule-base composition (*)
2907         o ESC '3' -- start relative composition with alternate chars  (**)
2908         o ESC '4' -- start rule-base composition with alternate chars  (**)
2909   Since these are not standard escape sequences of any ISO standard,
2910   the use of them with these meanings is restricted to Emacs only.
2911
2912   (*) This form is used only in Emacs 20.7 and older versions,
2913   but newer versions can safely decode it.
2914   (**) This form is used only in Emacs 21.1 and newer versions,
2915   and older versions can't decode it.
2916
2917   Here's a list of example usages of these composition escape
2918   sequences (categorized by `enum composition_method').
2919
2920   COMPOSITION_RELATIVE:
2921         ESC 0 CHAR [ CHAR ] ESC 1
2922   COMPOSITION_WITH_RULE:
2923         ESC 2 CHAR [ RULE CHAR ] ESC 1
2924   COMPOSITION_WITH_ALTCHARS:
2925         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2926   COMPOSITION_WITH_RULE_ALTCHARS:
2927         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2928
/* Table of 256 `enum iso_code_class_type' values.
   NOTE(review): presumably indexed by byte value and filled in
   elsewhere in this file; confirm against its initialization.  */
2929 static enum iso_code_class_type iso_code_class[256];
2930
/* Nonzero if charset ID does not exceed CODING->max_charset_id and
   its byte in CODING->safe_charsets is not 255.  ID is evaluated up
   to twice; the table is not consulted when ID is beyond
   max_charset_id.  */
#define SAFE_CHARSET_P(coding, id)                      \
  (! ((id) > (coding)->max_charset_id                   \
      || (coding)->safe_charsets[id] == 255))
2934
2935 static void
2936 setup_iso_safe_charsets (Lisp_Object attrs)
2937 {
2938   Lisp_Object charset_list, safe_charsets;
2939   Lisp_Object request;
2940   Lisp_Object reg_usage;
2941   Lisp_Object tail;
2942   EMACS_INT reg94, reg96;
2943   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2944   int max_charset_id;
2945
2946   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2947   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2948       && ! EQ (charset_list, Viso_2022_charset_list))
2949     {
2950       charset_list = Viso_2022_charset_list;
2951       ASET (attrs, coding_attr_charset_list, charset_list);
2952       ASET (attrs, coding_attr_safe_charsets, Qnil);
2953     }
2954
2955   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2956     return;
2957
2958   max_charset_id = 0;
2959   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2960     {
2961       int id = XINT (XCAR (tail));
2962       if (max_charset_id < id)
2963         max_charset_id = id;
2964     }
2965
2966   safe_charsets = make_uninit_string (max_charset_id + 1);
2967   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2968   request = AREF (attrs, coding_attr_iso_request);
2969   reg_usage = AREF (attrs, coding_attr_iso_usage);
2970   reg94 = XINT (XCAR (reg_usage));
2971   reg96 = XINT (XCDR (reg_usage));
2972
2973   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2974     {
2975       Lisp_Object id;
2976       Lisp_Object reg;
2977       struct charset *charset;
2978
2979       id = XCAR (tail);
2980       charset = CHARSET_FROM_ID (XINT (id));
2981       reg = Fcdr (Fassq (id, request));
2982       if (! NILP (reg))
2983         SSET (safe_charsets, XINT (id), XINT (reg));
2984       else if (charset->iso_chars_96)
2985         {
2986           if (reg96 < 4)
2987             SSET (safe_charsets, XINT (id), reg96);
2988         }
2989       else
2990         {
2991           if (reg94 < 4)
2992             SSET (safe_charsets, XINT (id), reg94);
2993         }
2994     }
2995   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2996 }
2997
2998
2999 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3000    Return true if a text is encoded in one of ISO-2022 based coding
3001    systems.

        The source bytes of CODING are scanned starting after the first
        CODING->head_ascii bytes.  CATEGORY_MASK_ISO is always added to
        DETECT_INFO->checked.

        When the scan ends at the `no_more_source' label (presumably
        jumped to by ONE_MORE_BYTE once the source is exhausted), the
        accumulated REJECTED mask is added to DETECT_INFO->rejected, the
        part of FOUND that was not rejected is added to
        DETECT_INFO->found, and 1 is returned.  If instead every ISO
        category has been rejected, all of CATEGORY_MASK_ISO is marked
        rejected and 0 is returned.  */
3002
3003 static bool
3004 detect_coding_iso_2022 (struct coding_system *coding,
3005                         struct coding_detection_info *detect_info)
3006 {
3007   const unsigned char *src = coding->source, *src_base = src;
3008   const unsigned char *src_end = coding->source + coding->src_bytes;
3009   bool multibytep = coding->src_multibyte;
3010   bool single_shifting = 0;
3011   int id;
3012   int c, c1;
3013   ptrdiff_t consumed_chars = 0;
3014   int i;
       /* Bit masks of the ISO categories ruled out / seen so far.  */
3015   int rejected = 0;
3016   int found = 0;
       /* -1 while outside a composition sequence; otherwise the number
          of components counted since the composition start escape.  */
3017   int composition_count = -1;
3018
3019   detect_info->checked |= CATEGORY_MASK_ISO;
3020
       /* Load each configured ISO category's safe-charsets string into
          its max_charset_id / safe_charsets fields, which
          SAFE_CHARSET_P consults below.  */
3021   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3022     {
3023       struct coding_system *this = &(coding_categories[i]);
3024       Lisp_Object attrs, val;
3025
3026       if (this->id < 0)
3027         continue;
3028       attrs = CODING_ID_ATTRS (this->id);
3029       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3030           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3031         setup_iso_safe_charsets (attrs);
3032       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3033       this->max_charset_id = SCHARS (val) - 1;
3034       this->safe_charsets = SDATA (val);
3035     }
3036
3037   /* A coding system of this category is always ASCII compatible.  */
3038   src += coding->head_ascii;
3039
3040   while (rejected != CATEGORY_MASK_ISO)
3041     {
3042       src_base = src;
3043       ONE_MORE_BYTE (c);
3044       switch (c)
3045         {
3046         case ISO_CODE_ESC:
3047           if (inhibit_iso_escape_detection)
3048             break;
3049           single_shifting = 0;
3050           ONE_MORE_BYTE (c);
3051           if (c == 'N' || c == 'O')
3052             {
3053               /* ESC <Fe> for SS2 or SS3.  */
3054               single_shifting = 1;
3055               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3056             }
3057           else if (c == '1')
3058             {
3059               /* End of composition.  */
3060               if (composition_count < 0
3061                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3062                 /* Invalid */
3063                 break;
3064               composition_count = -1;
3065               found |= CATEGORY_MASK_ISO;
3066             }
3067           else if (c >= '0' && c <= '4')
3068             {
3069               /* ESC <Fp> for start/end composition.  */
3070               composition_count = 0;
3071             }
3072           else
3073             {
3074               if (c >= '(' && c <= '/')
3075                 {
3076                   /* Designation sequence for a charset of dimension 1.  */
3077                   ONE_MORE_BYTE (c1);
3078                   if (c1 < ' ' || c1 >= 0x80
3079                       || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3080                     /* Invalid designation sequence.  Just ignore.  */
3081                     break;
3082                 }
3083               else if (c == '$')
3084                 {
3085                   /* Designation sequence for a charset of dimension 2.  */
3086                   ONE_MORE_BYTE (c);
3087                   if (c >= '@' && c <= 'B')
3088                     /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
                         /* NOTE(review): unlike the other branches, ID is
                            not checked for < 0 here before it is used by
                            SAFE_CHARSET_P below; confirm these three
                            table entries are always defined.  */
3089                     id = iso_charset_table[1][0][c];
3090                   else if (c >= '(' && c <= '/')
3091                     {
3092                       ONE_MORE_BYTE (c1);
3093                       if (c1 < ' ' || c1 >= 0x80
3094                           || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3095                         /* Invalid designation sequence.  Just ignore.  */
3096                         break;
3097                     }
3098                   else
3099                     /* Invalid designation sequence.  Just ignore it.  */
3100                     break;
3101                 }
3102               else
3103                 {
3104                   /* Invalid escape sequence.  Just ignore it.  */
3105                   break;
3106                 }
3107
3108               /* We found a valid designation sequence for CHARSET.  */
                   /* The 8-bit categories are rejected; each of the four
                      categories tested below is marked found or rejected
                      according to whether it treats ID as safe.  */
3109               rejected |= CATEGORY_MASK_ISO_8BIT;
3110               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3111                                   id))
3112                 found |= CATEGORY_MASK_ISO_7;
3113               else
3114                 rejected |= CATEGORY_MASK_ISO_7;
3115               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3116                                   id))
3117                 found |= CATEGORY_MASK_ISO_7_TIGHT;
3118               else
3119                 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3120               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3121                                   id))
3122                 found |= CATEGORY_MASK_ISO_7_ELSE;
3123               else
3124                 rejected |= CATEGORY_MASK_ISO_7_ELSE;
3125               if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3126                                   id))
3127                 found |= CATEGORY_MASK_ISO_8_ELSE;
3128               else
3129                 rejected |= CATEGORY_MASK_ISO_8_ELSE;
3130             }
3131           break;
3132
3133         case ISO_CODE_SO:
3134         case ISO_CODE_SI:
3135           /* Locking shift out/in.  */
3136           if (inhibit_iso_escape_detection)
3137             break;
3138           single_shifting = 0;
3139           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3140           break;
3141
3142         case ISO_CODE_CSI:
3143           /* Control sequence introducer.  */
3144           single_shifting = 0;
3145           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3146           found |= CATEGORY_MASK_ISO_8_ELSE;
3147           goto check_extra_latin;
3148
3149         case ISO_CODE_SS2:
3150         case ISO_CODE_SS3:
3151           /* Single shift.   */
3152           if (inhibit_iso_escape_detection)
3153             break;
3154           single_shifting = 0;
3155           rejected |= CATEGORY_MASK_ISO_7BIT;
3156           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3157               & CODING_ISO_FLAG_SINGLE_SHIFT)
3158             {
3159               found |= CATEGORY_MASK_ISO_8_1;
3160               single_shifting = 1;
3161             }
3162           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3163               & CODING_ISO_FLAG_SINGLE_SHIFT)
3164             {
3165               found |= CATEGORY_MASK_ISO_8_2;
3166               single_shifting = 1;
3167             }
3168           if (single_shifting)
3169             break;
               /* Neither 8-bit category uses single shifts, so treat the
                  byte as a candidate extra Latin code.  */
3170           goto check_extra_latin;
3171
3172         default:
               /* NOTE(review): a negative C is skipped; presumably
                  ONE_MORE_BYTE yields one for a non-byte character in
                  multibyte source -- confirm against the macro.  */
3173           if (c < 0)
3174             continue;
3175           if (c < 0x80)
3176             {
3177               if (composition_count >= 0)
3178                 composition_count++;
3179               single_shifting = 0;
3180               break;
3181             }
3182           if (c >= 0xA0)
3183             {
3184               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3185               found |= CATEGORY_MASK_ISO_8_1;
3186               /* Check the length of succeeding codes of the range
3187                  0xA0..0xFF.  If the byte length is even, we include
3188                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3189                  only when we are not single shifting.  */
3190               if (! single_shifting
3191                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3192                 {
3193                   int len = 1;
3194                   while (src < src_end)
3195                     {
3196                       src_base = src;
3197                       ONE_MORE_BYTE (c);
3198                       if (c < 0xA0)
3199                         {
                               /* Put back the byte that ended the run.  */
3200                           src = src_base;
3201                           break;
3202                         }
3203                       len++;
3204                     }
3205
                       /* An odd-length run rejects ISO_8_2 only when it
                          did not extend to the end of the source.  */
3206                   if (len & 1 && src < src_end)
3207                     {
3208                       rejected |= CATEGORY_MASK_ISO_8_2;
3209                       if (composition_count >= 0)
3210                         composition_count += len;
3211                     }
3212                   else
3213                     {
3214                       found |= CATEGORY_MASK_ISO_8_2;
3215                       if (composition_count >= 0)
3216                         composition_count += len / 2;
3217                     }
3218                 }
3219               break;
3220             }
               /* Reached by falling through for bytes 0x80..0x9F that
                  have no case of their own, and by the gotos above: the
                  byte is acceptable only if Vlatin_extra_code_table has
                  a non-nil entry for it.  */
3221         check_extra_latin:
3222           if (! VECTORP (Vlatin_extra_code_table)
3223               || NILP (AREF (Vlatin_extra_code_table, c)))
3224             {
3225               rejected = CATEGORY_MASK_ISO;
3226               break;
3227             }
3228           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3229               & CODING_ISO_FLAG_LATIN_EXTRA)
3230             found |= CATEGORY_MASK_ISO_8_1;
3231           else
3232             rejected |= CATEGORY_MASK_ISO_8_1;
3233           rejected |= CATEGORY_MASK_ISO_8_2;
3234           break;
3235         }
3236     }
       /* The loop ended because every ISO category was rejected.  */
3237   detect_info->rejected |= CATEGORY_MASK_ISO;
3238   return 0;
3239
3240  no_more_source:
3241   detect_info->rejected |= rejected;
3242   detect_info->found |= (found & ~rejected);
3243   return 1;
3244 }
3245
3246
3247 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3248    escape sequence should be kept.

        REG selects the designation slot to update; DIM, CHARS_96 and the
        final byte FINAL are the key handed to ISO_CHARSET_TABLE.  The
        expansion refers to a variable named coding that must be in scope
        where the macro is used, and CHARS_96 must be an lvalue because
        it is assigned to.

        If FINAL is below '0' or not below 128, if the table lookup gives
        a negative id, or if SAFE_CHARSET_P rejects the id, the slot is
        set to -2, CHARS_96 is set to -1, and the rest of the macro is
        skipped (the break leaves the enclosing do-while).

        Otherwise the slot is set to the id found, except that
        charset_jisx0201_roman is replaced by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.  */
3249 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3250   do {                                                                  \
3251     int id, prev;                                                       \
3252                                                                         \
3253     if (final < '0' || final >= 128                                     \
3254         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3255         || !SAFE_CHARSET_P (coding, id))                                \
3256       {                                                                 \
3257         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3258         chars_96 = -1;                                                  \
3259         break;                                                          \
3260       }                                                                 \
3261     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3262     if (id == charset_jisx0201_roman)                                   \
3263       {                                                                 \
3264         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3265           id = charset_ascii;                                           \
3266       }                                                                 \
3267     else if (id == charset_jisx0208_1978)                               \
3268       {                                                                 \
3269         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3270           id = charset_jisx0208;                                        \
3271       }                                                                 \
3272     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3273     /* If there was an invalid designation to REG previously, and this  \
3274        designation is ASCII to REG, we should keep this designation     \
3275        sequence.  */                                                    \
3276     if (prev == -2 && id == charset_ascii)                              \
3277       chars_96 = -1;                                                    \
3278   } while (0)
3279
3280
3281 /* Handle these composition sequences (ALT: alternate char):
3282
3283    (1) relative composition: ESC 0 CHAR ... ESC 1
3284    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3285    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3286    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3287
3288    When the start sequence (ESC 0/2/3/4) is found, this annotation
3289    header is produced.
3290
3291         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3292
3293    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3294    produced until the end sequence (ESC 1) is found:
3295
3296    (1) CHAR ... CHAR
3297    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3298    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3299    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3300
3301    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3302    annotation header are updated as below:
3303
3304    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3305    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3306    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3307    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3308
3309    If an error is found while composing, the annotation header is
3310    changed to:
3311
3312         [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3313
3314    and the sequence [ -2 DECODED-RULE ] is changed to the original
3315    byte sequence as below:
3316         o the original byte sequence is B: [ B -1 ]
3317         o the original byte sequence is B1 B2: [ B1 B2 ]
3318    and the sequence [ -1 -1 ] is changed to the original byte
3319    sequence:
3320         [ ESC '0' ]
3321 */
3322
3323 /* Decode a composition rule C1 and maybe one more byte from the
3324    source, and set RULE to the encoded composition rule.  If the rule
3325    is invalid, goto invalid_code.

        C1 is not a macro parameter: the expansion reads a variable named
        c1 and jumps to a label named invalid_code, so both must exist
        where the macro is used.  RULE is assigned to several times and
        must therefore be an int lvalue.

        The value c1 - 32 selects the format.  A negative value is
        invalid.  A value in 0..80 is the one-byte old format: it is
        split into gref = value / 9 and nref = value % 9, a 4 in either
        one is replaced by 10, and the pair is packed with
        COMPOSITION_ENCODE_RULE.  A value of 81 or more is the two-byte
        new format: a second byte B is read with ONE_MORE_BYTE, the pair
        (value - 81, B - 32) is checked with
        COMPOSITION_ENCODE_RULE_VALID and packed with
        COMPOSITION_ENCODE_RULE, and 0x100 is added to the result.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
        leaves the macro through another label when the source runs
        out -- confirm at the definition.  */
3326
3327 #define DECODE_COMPOSITION_RULE(rule)                                   \
3328   do {                                                                  \
3329     rule = c1 - 32;                                                     \
3330     if (rule < 0)                                                       \
3331       goto invalid_code;                                                \
3332     if (rule < 81)              /* old format (before ver.21) */        \
3333       {                                                                 \
3334         int gref = (rule) / 9;                                          \
3335         int nref = (rule) % 9;                                          \
3336         if (gref == 4) gref = 10;                                       \
3337         if (nref == 4) nref = 10;                                       \
3338         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3339       }                                                                 \
3340     else                        /* new format (after ver.21) */         \
3341       {                                                                 \
3342         int b;                                                          \
3343                                                                         \
3344         ONE_MORE_BYTE (b);                                              \
3345         if (! COMPOSITION_ENCODE_RULE_VALID (rule - 81, b - 32))        \
3346           goto invalid_code;                                            \
3347         rule = COMPOSITION_ENCODE_RULE (rule - 81, b - 32);             \
3348         rule += 0x100;   /* Distinguish it from the old format.  */     \
3349       }                                                                 \
3350   } while (0)
3351
     /* Turn the decoded composition rule RULE back into the one or two
        elements it was read from, storing them in charbuf[idx] and
        charbuf[idx + 1].  The expansion uses the variables charbuf, idx
        and new_chars of the function it is expanded in.

        RULE % 0x100 is split into gref = value / 12 and
        nref = value % 12.  If RULE is below 0x100 (old format), a 10 in
        gref or nref is mapped back to 4, the single value
        32 + gref * 9 + nref is stored followed by -1, and new_chars is
        incremented by one.  Otherwise (new format) 32 + 81 + gref and
        32 + nref are stored and new_chars is incremented by two.

        Everything that depends on RULE (gref, nref and the format test)
        is evaluated before either slot is written, so RULE may be
        charbuf[idx + 1] itself, which is how finish_composition calls
        the macro.

        NOTE(review): the divisor 12 presumably mirrors the packing done
        by COMPOSITION_ENCODE_RULE, which is defined elsewhere --
        confirm.  */
3352 #define ENCODE_COMPOSITION_RULE(rule)                           \
3353   do {                                                          \
3354     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3355                                                                 \
3356     if (rule < 0x100)           /* old format */                \
3357       {                                                         \
3358         if (gref == 10) gref = 4;                               \
3359         if (nref == 10) nref = 4;                               \
3360         charbuf[idx] = 32 + gref * 9 + nref;                    \
3361         charbuf[idx + 1] = -1;                                  \
3362         new_chars++;                                            \
3363       }                                                         \
3364     else                                /* new format */        \
3365       {                                                         \
3366         charbuf[idx] = 32 + 81 + gref;                          \
3367         charbuf[idx + 1] = 32 + nref;                           \
3368         new_chars += 2;                                         \
3369       }                                                         \
3370   } while (0)
3371
3372 /* Finish the current composition as invalid.

        CHARBUF must point just past the elements stored so far for the
        composition; its annotation header then starts at
        CHARBUF[- CMP_STATUS->length].  The five header slots are
        overwritten with ESC, the digit of the start sequence that
        matches CMP_STATUS->method, -2, 0 and -1.  For methods
        COMPOSITION_WITH_RULE and above the remaining elements are then
        scanned: each [ -2 RULE ] pair is turned back into rule bytes by
        ENCODE_COMPOSITION_RULE, and a -1 together with the element
        after it is replaced by ESC '0'.

        CMP_STATUS->state is set to COMPOSING_NO.  The return value is
        CMP_STATUS->nchars plus one or two for every rule restored and
        two for every ESC '0' restored; MAYBE_FINISH_COMPOSITION adds it
        to char_offset.  */
3373
3374 static int
3375 finish_composition (int *charbuf, struct composition_status *cmp_status)
3376 {
       /* charbuf, idx and new_chars are referenced by name inside
          ENCODE_COMPOSITION_RULE, so they must keep these names.  */
3377   int idx = - cmp_status->length;
3378   int new_chars;
3379
3380   /* Recover the original ESC sequence */
3381   charbuf[idx++] = ISO_CODE_ESC;
3382   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3383                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3384                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3385                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3386                     : '4');
       /* Fill the three header slots that are left.  */
3387   charbuf[idx++] = -2;
3388   charbuf[idx++] = 0;
3389   charbuf[idx++] = -1;
       /* Start from the characters already counted for the composition.  */
3390   new_chars = cmp_status->nchars;
       /* Only these methods can have stored [ -2 RULE ] or [ -1 -1 ].  */
3391   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3392     for (; idx < 0; idx++)
3393       {
3394         int elt = charbuf[idx];
3395
3396         if (elt == -2)
3397           {
               /* [ -2 RULE ]: the macro rewrites both slots and updates
                  new_chars; step over the second slot here so that it is
                  not examined as an element of its own.  */
3398             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3399             idx++;
3400           }
3401         else if (elt == -1)
3402           {
               /* [ -1 -1 ] separator: put back the ESC '0' it stands for.  */
3403             charbuf[idx++] = ISO_CODE_ESC;
3404             charbuf[idx] = '0';
3405             new_chars += 2;
3406           }
3407       }
3408   cmp_status->state = COMPOSING_NO;
3409   return new_chars;
3410 }
3411
3412 /* If characters are under composition, finish the composition.
        That is, when cmp_status->state is not COMPOSING_NO, call
        finish_composition (which finishes the composition as invalid)
        and add its return value to char_offset.  The expansion uses the
        variables cmp_status, charbuf and char_offset of the function it
        is expanded in.  */
3413 #define MAYBE_FINISH_COMPOSITION()                              \
3414   do {                                                          \
3415     if (cmp_status->state != COMPOSING_NO)                      \
3416       char_offset += finish_composition (charbuf, cmp_status);  \
3417   } while (0)
3418
3419 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3420
3421    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3422    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3423    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3424    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3425
3426    Produce this annotation sequence now:
3427
3428    [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

        This is the five-element header shown in the overview comment
        before DECODE_COMPOSITION_RULE, and finish_composition rewrites
        five header slots.  NOTE(review): the element count is taken
        from those two places; ADD_COMPOSITION_DATA and
        MAX_ANNOTATION_LENGTH are defined elsewhere -- confirm.

        C1 is the byte that followed ESC.  There are two cases:

        - C1 is '0' while the alternate chars of an ESC 3 or ESC 4
          composition are being collected (state
          COMPOSING_COMPONENT_CHAR with method COMPOSITION_WITH_ALTCHARS,
          or state COMPOSING_COMPONENT_RULE with method
          COMPOSITION_WITH_RULE_ALTCHARS).  This does not start a new
          composition: the separator [ -1 -1 ] is stored, the length is
          increased by two and the state becomes COMPOSING_CHAR.

        - Otherwise any composition still in progress is finished as
          invalid, the method is chosen from C1, the state becomes
          COMPOSING_CHAR for '0' and '2' and COMPOSING_COMPONENT_CHAR for
          '3' and '4', the header is emitted, and the length and the
          nchars/ncomps counters are reset.

        The expansion uses the variables charbuf, cmp_status and coding
        (and, through MAYBE_FINISH_COMPOSITION, char_offset) of the
        function it is expanded in.
3429 */
3430
3431 #define DECODE_COMPOSITION_START(c1)                                       \
3432   do {                                                                     \
3433     if (c1 == '0'                                                          \
3434         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3435              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3436             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3437                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3438       {                                                                    \
3439         *charbuf++ = -1;                                                   \
3440         *charbuf++= -1;                                                    \
3441         cmp_status->state = COMPOSING_CHAR;                                \
3442         cmp_status->length += 2;                                           \
3443       }                                                                    \
3444     else                                                                   \
3445       {                                                                    \
3446         MAYBE_FINISH_COMPOSITION ();                                       \
3447         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3448                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3449                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3450                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3451         cmp_status->state                                                  \
3452           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3453         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3454         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3455         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3456         coding->annotated = 1;                                             \
3457       }                                                                    \
3458   } while (0)
3459
3460
3461 /* Handle composition end sequence ESC 1.

        The end sequence is rejected when no composed character has been
        stored (nchars is 0), or when the state does not fit the method:
        the equality test is true for a COMPOSITION_WITH_RULE composition
        whose state is COMPOSING_CHAR, and for any other method whose
        state is not COMPOSING_CHAR.  (STORE_COMPOSITION_CHAR advances
        the state after every char of a rule-based composition, so a
        rule-based composition that ends right after a char is not in
        state COMPOSING_CHAR.)  On rejection the composition is finished
        as invalid and control goes to the label invalid_code.

        Otherwise the annotation header, which starts at
        charbuf[- cmp_status->length], is completed: its first element
        (the negated length) is decreased by ncomps + 2 for
        COMPOSITION_WITH_ALTCHARS or by ncomps * 3 for
        COMPOSITION_WITH_RULE_ALTCHARS, its third element is set to
        nchars, char_offset is advanced by nchars, and the state returns
        to COMPOSING_NO.

        The expansion uses the variables charbuf, cmp_status and
        char_offset and the label invalid_code of the function it is
        expanded in.  */
3462
3463 #define DECODE_COMPOSITION_END()                                        \
3464   do {                                                                  \
3465     if (cmp_status->nchars == 0                                         \
3466         || ((cmp_status->state == COMPOSING_CHAR)                       \
3467             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3468       {                                                                 \
3469         MAYBE_FINISH_COMPOSITION ();                                    \
3470         goto invalid_code;                                              \
3471       }                                                                 \
3472     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3473       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3474     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3475       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3476     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3477     char_offset += cmp_status->nchars;                                  \
3478     cmp_status->state = COMPOSING_NO;                                   \
3479   } while (0)
3480
3481 /* Store a composition rule RULE in charbuf, and update cmp_status.
        The rule is stored as the pair [ -2 RULE ], the recorded length
        grows by two, and the state is decremented by one.  The
        expansion uses the variables charbuf and cmp_status of the
        function it is expanded in.

        NOTE(review): the decrement presumably moves from a RULE state
        back to the CHAR state that STORE_COMPOSITION_CHAR incremented
        from; this relies on the order of the state enumeration, which
        is defined elsewhere -- confirm.  */
3482
3483 #define STORE_COMPOSITION_RULE(rule)    \
3484   do {                                  \
3485     *charbuf++ = -2;                    \
3486     *charbuf++ = rule;                  \
3487     cmp_status->length += 2;            \
3488     cmp_status->state--;                \
3489   } while (0)
3490
3491 /* Store a composed char or a component char C in charbuf, and update
3492    cmp_status.

        C is appended and the recorded length grows by one.  The char is
        counted in nchars when the state is COMPOSING_CHAR and in ncomps
        otherwise.  Finally the state is incremented by one when the
        method is COMPOSITION_WITH_RULE, or when the method is
        COMPOSITION_WITH_RULE_ALTCHARS and the state is
        COMPOSING_COMPONENT_CHAR.  The expansion uses the variables
        charbuf and cmp_status of the function it is expanded in.

        NOTE(review): the increment presumably selects the matching RULE
        state, so that the next item is read as a rule; this relies on
        the order of the state enumeration, which is defined elsewhere
        -- confirm.  */
3493
3494 #define STORE_COMPOSITION_CHAR(c)                                       \
3495   do {                                                                  \
3496     *charbuf++ = (c);                                                   \
3497     cmp_status->length++;                                               \
3498     if (cmp_status->state == COMPOSING_CHAR)                            \
3499       cmp_status->nchars++;                                             \
3500     else                                                                \
3501       cmp_status->ncomps++;                                             \
3502     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3503         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3504             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3505       cmp_status->state++;                                              \
3506   } while (0)
3507
3508
3509 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3510
3511 static void
3512 decode_coding_iso_2022 (struct coding_system *coding)
3513 {
3514   const unsigned char *src = coding->source + coding->consumed;
3515   const unsigned char *src_end = coding->source + coding->src_bytes;
3516   const unsigned char *src_base;
3517   int *charbuf = coding->charbuf + coding->charbuf_used;
3518   /* We may produce two annotations (charset and composition) in one
3519      loop and one more charset annotation at the end.  */
3520   int *charbuf_end
3521     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3522   ptrdiff_t consumed_chars = 0, consumed_chars_base;
3523   bool multibytep = coding->src_multibyte;
3524   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3525   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3526   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3527   int charset_id_2, charset_id_3;
3528   struct charset *charset;
3529   int c;
3530   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3531   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
3532   ptrdiff_t char_offset = coding->produced_char;
3533   ptrdiff_t last_offset = char_offset;
3534   int last_id = charset_ascii;
3535   bool eol_dos
3536     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3537   int byte_after_cr = -1;
3538   int i;
3539
3540   setup_iso_safe_charsets (attrs);
3541   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3542
3543   if (cmp_status->state != COMPOSING_NO)
3544     {
3545       if (charbuf_end - charbuf < cmp_status->length)
3546         emacs_abort ();
3547       for (i = 0; i < cmp_status->length; i++)
3548         *charbuf++ = cmp_status->carryover[i];
3549       coding->annotated = 1;
3550     }
3551
3552   while (1)
3553     {
3554       int c1, c2, c3;
3555
3556       src_base = src;
3557       consumed_chars_base = consumed_chars;
3558
3559       if (charbuf >= charbuf_end)
3560         {
3561           if (byte_after_cr >= 0)
3562             src_base--;
3563           break;
3564         }
3565
3566       if (byte_after_cr >= 0)
3567         c1 = byte_after_cr, byte_after_cr = -1;
3568       else
3569         ONE_MORE_BYTE (c1);
3570       if (c1 < 0)
3571         goto invalid_code;
3572
3573       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3574         {
3575           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3576           char_offset++;
3577           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3578           continue;
3579         }
3580
3581       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3582         {
3583           if (c1 == ISO_CODE_ESC)
3584             {
3585               if (src + 1 >= src_end)
3586                 goto no_more_source;
3587               *charbuf++ = ISO_CODE_ESC;
3588               char_offset++;
3589               if (src[0] == '%' && src[1] == '@')
3590                 {
3591                   src += 2;
3592                   consumed_chars += 2;
3593                   char_offset += 2;
3594                   /* We are sure charbuf can contain two more chars. */
3595                   *charbuf++ = '%';
3596                   *charbuf++ = '@';
3597                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3598                 }
3599             }
3600           else
3601             {
3602               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3603               char_offset++;
3604             }
3605           continue;
3606         }
3607
3608       if ((cmp_status->state == COMPOSING_RULE
3609            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3610           && c1 != ISO_CODE_ESC)
3611         {
3612           int rule;
3613
3614           DECODE_COMPOSITION_RULE (rule);
3615           STORE_COMPOSITION_RULE (rule);
3616           continue;
3617         }
3618
3619       /* We produce at most one character.  */
3620       switch (iso_code_class [c1])
3621         {
3622         case ISO_0x20_or_0x7F:
3623           if (charset_id_0 < 0
3624               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3625             /* This is SPACE or DEL.  */
3626             charset = CHARSET_FROM_ID (charset_ascii);
3627           else
3628             charset = CHARSET_FROM_ID (charset_id_0);
3629           break;
3630
3631         case ISO_graphic_plane_0:
3632           if (charset_id_0 < 0)
3633             charset = CHARSET_FROM_ID (charset_ascii);
3634           else
3635             charset = CHARSET_FROM_ID (charset_id_0);
3636           break;
3637
3638         case ISO_0xA0_or_0xFF:
3639           if (charset_id_1 < 0
3640               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3641               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3642             goto invalid_code;
3643           /* This is a graphic character, we fall down ... */
3644
3645         case ISO_graphic_plane_1:
3646           if (charset_id_1 < 0)
3647             goto invalid_code;
3648           charset = CHARSET_FROM_ID (charset_id_1);
3649           break;
3650
3651         case ISO_control_0:
3652           if (eol_dos && c1 == '\r')
3653             ONE_MORE_BYTE (byte_after_cr);
3654           MAYBE_FINISH_COMPOSITION ();
3655           charset = CHARSET_FROM_ID (charset_ascii);
3656           break;
3657
3658         case ISO_control_1:
3659           goto invalid_code;
3660
3661         case ISO_shift_out:
3662           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3663               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3664             goto invalid_code;
3665           CODING_ISO_INVOCATION (coding, 0) = 1;
3666           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3667           continue;
3668
3669         case ISO_shift_in:
3670           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3671             goto invalid_code;
3672           CODING_ISO_INVOCATION (coding, 0) = 0;
3673           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3674           continue;
3675
3676         case ISO_single_shift_2_7:
3677           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3678             goto invalid_code;
3679         case ISO_single_shift_2:
3680           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3681             goto invalid_code;
3682           /* SS2 is handled as an escape sequence of ESC 'N' */
3683           c1 = 'N';
3684           goto label_escape_sequence;
3685
3686         case ISO_single_shift_3:
3687           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3688             goto invalid_code;
3689           /* SS3 is handled as an escape sequence of ESC 'O' */
3690           c1 = 'O';
3691           goto label_escape_sequence;
3692
3693         case ISO_control_sequence_introducer:
3694           /* CSI is handled as an escape sequence of ESC '[' ...  */
3695           c1 = '[';
3696           goto label_escape_sequence;
3697
3698         case ISO_escape:
3699           ONE_MORE_BYTE (c1);
3700         label_escape_sequence:
3701           /* Escape sequences handled here are invocation,
3702              designation, direction specification, and character
3703              composition specification.  */
3704           switch (c1)
3705             {
3706             case '&':           /* revision of following character set */
3707               ONE_MORE_BYTE (c1);
3708               if (!(c1 >= '@' && c1 <= '~'))
3709                 goto invalid_code;
3710               ONE_MORE_BYTE (c1);
3711               if (c1 != ISO_CODE_ESC)
3712                 goto invalid_code;
3713               ONE_MORE_BYTE (c1);
3714               goto label_escape_sequence;
3715
3716             case '$':           /* designation of 2-byte character set */
3717               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3718                 goto invalid_code;
3719               {
3720                 int reg, chars96;
3721
3722                 ONE_MORE_BYTE (c1);
3723                 if (c1 >= '@' && c1 <= 'B')
3724                   {     /* designation of JISX0208.1978, GB2312.1980,
3725                            or JISX0208.1980 */
3726                     reg = 0, chars96 = 0;
3727                   }
3728                 else if (c1 >= 0x28 && c1 <= 0x2B)
3729                   { /* designation of DIMENSION2_CHARS94 character set */
3730                     reg = c1 - 0x28, chars96 = 0;
3731                     ONE_MORE_BYTE (c1);
3732                   }
3733                 else if (c1 >= 0x2C && c1 <= 0x2F)
3734                   { /* designation of DIMENSION2_CHARS96 character set */
3735                     reg = c1 - 0x2C, chars96 = 1;
3736                     ONE_MORE_BYTE (c1);
3737                   }
3738                 else
3739                   goto invalid_code;
3740                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3741                 /* We must update these variables now.  */
3742                 if (reg == 0)
3743                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3744                 else if (reg == 1)
3745                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3746                 if (chars96 < 0)
3747                   goto invalid_code;
3748               }
3749               continue;
3750
3751             case 'n':           /* invocation of locking-shift-2 */
3752               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3753                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3754                 goto invalid_code;
3755               CODING_ISO_INVOCATION (coding, 0) = 2;
3756               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3757               continue;
3758
3759             case 'o':           /* invocation of locking-shift-3 */
3760               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3761                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3762                 goto invalid_code;
3763               CODING_ISO_INVOCATION (coding, 0) = 3;
3764               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3765               continue;
3766
3767             case 'N':           /* invocation of single-shift-2 */
3768               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3769                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3770                 goto invalid_code;
3771               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3772               if (charset_id_2 < 0)
3773                 charset = CHARSET_FROM_ID (charset_ascii);
3774               else
3775                 charset = CHARSET_FROM_ID (charset_id_2);
3776               ONE_MORE_BYTE (c1);
3777               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3778                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3779                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3780                           ? c1 >= 0x80 : c1 < 0x80)))
3781                 goto invalid_code;
3782               break;
3783
3784             case 'O':           /* invocation of single-shift-3 */
3785               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3786                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3787                 goto invalid_code;
3788               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3789               if (charset_id_3 < 0)
3790                 charset = CHARSET_FROM_ID (charset_ascii);
3791               else
3792                 charset = CHARSET_FROM_ID (charset_id_3);
3793               ONE_MORE_BYTE (c1);
3794               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0)
3795                   || (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3796                       && ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LEVEL_4)
3797                           ? c1 >= 0x80 : c1 < 0x80)))
3798                 goto invalid_code;
3799               break;
3800
3801             case '0': case '2': case '3': case '4': /* start composition */
3802               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3803                 goto invalid_code;
3804               if (last_id != charset_ascii)
3805                 {
3806                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3807                   last_id = charset_ascii;
3808                   last_offset = char_offset;
3809                 }
3810               DECODE_COMPOSITION_START (c1);
3811               continue;
3812
3813             case '1':           /* end composition */
3814               if (cmp_status->state == COMPOSING_NO)
3815                 goto invalid_code;
3816               DECODE_COMPOSITION_END ();
3817               continue;
3818
3819             case '[':           /* specification of direction */
3820               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3821                 goto invalid_code;
3822               /* For the moment, nested direction is not supported.
3823                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3824                  left-to-right, and nonzero means right-to-left.  */
3825               ONE_MORE_BYTE (c1);
3826               switch (c1)
3827                 {
3828                 case ']':       /* end of the current direction */
3829                   coding->mode &= ~CODING_MODE_DIRECTION;
3830
3831                 case '0':       /* end of the current direction */
3832                 case '1':       /* start of left-to-right direction */
3833                   ONE_MORE_BYTE (c1);
3834                   if (c1 == ']')
3835                     coding->mode &= ~CODING_MODE_DIRECTION;
3836                   else
3837                     goto invalid_code;
3838                   break;
3839
3840                 case '2':       /* start of right-to-left direction */
3841                   ONE_MORE_BYTE (c1);
3842                   if (c1 == ']')
3843                     coding->mode |= CODING_MODE_DIRECTION;
3844                   else
3845                     goto invalid_code;
3846                   break;
3847
3848                 default:
3849                   goto invalid_code;
3850                 }
3851               continue;
3852
3853             case '%':
3854               ONE_MORE_BYTE (c1);
3855               if (c1 == '/')
3856                 {
3857                   /* CTEXT extended segment:
3858                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3859                      We keep these bytes as is for the moment.
3860                      They may be decoded by post-read-conversion.  */
3861                   int dim, M, L;
3862                   int size;
3863
3864                   ONE_MORE_BYTE (dim);
3865                   if (dim < '0' || dim > '4')
3866                     goto invalid_code;
3867                   ONE_MORE_BYTE (M);
3868                   if (M < 128)
3869                     goto invalid_code;
3870                   ONE_MORE_BYTE (L);
3871                   if (L < 128)
3872                     goto invalid_code;
3873                   size = ((M - 128) * 128) + (L - 128);
3874                   if (charbuf + 6 > charbuf_end)
3875                     goto break_loop;
3876                   *charbuf++ = ISO_CODE_ESC;
3877                   *charbuf++ = '%';
3878                   *charbuf++ = '/';
3879                   *charbuf++ = dim;
3880                   *charbuf++ = BYTE8_TO_CHAR (M);
3881                   *charbuf++ = BYTE8_TO_CHAR (L);
3882                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3883                 }
3884               else if (c1 == 'G')
3885                 {
3886                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3887                      ESC % G --UTF-8-BYTES-- ESC % @
3888                      We keep these bytes as is for the moment.
3889                      They may be decoded by post-read-conversion.  */
3890                   if (charbuf + 3 > charbuf_end)
3891                     goto break_loop;
3892                   *charbuf++ = ISO_CODE_ESC;
3893                   *charbuf++ = '%';
3894                   *charbuf++ = 'G';
3895                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3896                 }
3897               else
3898                 goto invalid_code;
3899               continue;
3900               break;
3901
3902             default:
3903               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3904                 goto invalid_code;
3905               {
3906                 int reg, chars96;
3907
3908                 if (c1 >= 0x28 && c1 <= 0x2B)
3909                   { /* designation of DIMENSION1_CHARS94 character set */
3910                     reg = c1 - 0x28, chars96 = 0;
3911                     ONE_MORE_BYTE (c1);
3912                   }
3913                 else if (c1 >= 0x2C && c1 <= 0x2F)
3914                   { /* designation of DIMENSION1_CHARS96 character set */
3915                     reg = c1 - 0x2C, chars96 = 1;
3916                     ONE_MORE_BYTE (c1);
3917                   }
3918                 else
3919                   goto invalid_code;
3920                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3921                 /* We must update these variables now.  */
3922                 if (reg == 0)
3923                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3924                 else if (reg == 1)
3925                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3926                 if (chars96 < 0)
3927                   goto invalid_code;
3928               }
3929               continue;
3930             }
3931           break;
3932
3933         default:
3934           emacs_abort ();
3935         }
3936
3937       if (cmp_status->state == COMPOSING_NO
3938           && charset->id != charset_ascii
3939           && last_id != charset->id)
3940         {
3941           if (last_id != charset_ascii)
3942             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3943           last_id = charset->id;
3944           last_offset = char_offset;
3945         }
3946
3947       /* Now we know CHARSET and 1st position code C1 of a character.
3948          Produce a decoded character while getting 2nd and 3rd
3949          position codes C2, C3 if necessary.  */
3950       if (CHARSET_DIMENSION (charset) > 1)
3951         {
3952           ONE_MORE_BYTE (c2);
3953           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3954               || ((c1 & 0x80) != (c2 & 0x80)))
3955             /* C2 is not in a valid range.  */
3956             goto invalid_code;
3957           if (CHARSET_DIMENSION (charset) == 2)
3958             c1 = (c1 << 8) | c2;
3959           else
3960             {
3961               ONE_MORE_BYTE (c3);
3962               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3963                   || ((c1 & 0x80) != (c3 & 0x80)))
3964                 /* C3 is not in a valid range.  */
3965                 goto invalid_code;
3966               c1 = (c1 << 16) | (c2 << 8) | c2;
3967             }
3968         }
3969       c1 &= 0x7F7F7F;
3970       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3971       if (c < 0)
3972         {
3973           MAYBE_FINISH_COMPOSITION ();
3974           for (; src_base < src; src_base++, char_offset++)
3975             {
3976               if (ASCII_BYTE_P (*src_base))
3977                 *charbuf++ = *src_base;
3978               else
3979                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3980             }
3981         }
3982       else if (cmp_status->state == COMPOSING_NO)
3983         {
3984           *charbuf++ = c;
3985           char_offset++;
3986         }
3987       else if ((cmp_status->state == COMPOSING_CHAR
3988                 ? cmp_status->nchars
3989                 : cmp_status->ncomps)
3990                >= MAX_COMPOSITION_COMPONENTS)
3991         {
3992           /* Too long composition.  */
3993           MAYBE_FINISH_COMPOSITION ();
3994           *charbuf++ = c;
3995           char_offset++;
3996         }
3997       else
3998         STORE_COMPOSITION_CHAR (c);
3999       continue;
4000
4001     invalid_code:
4002       MAYBE_FINISH_COMPOSITION ();
4003       src = src_base;
4004       consumed_chars = consumed_chars_base;
4005       ONE_MORE_BYTE (c);
4006       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4007       char_offset++;
4008       coding->errors++;
4009       /* Reset the invocation and designation status to the safest
4010          one; i.e. designate ASCII to the graphic register 0, and
4011          invoke that register to the graphic plane 0.  This typically
4012          helps the case that an designation sequence for ASCII "ESC (
4013          B" is somehow broken (e.g. broken by a newline).  */
4014       CODING_ISO_INVOCATION (coding, 0) = 0;
4015       CODING_ISO_DESIGNATION (coding, 0) = charset_ascii;
4016       charset_id_0 = charset_ascii;
4017       continue;
4018
4019     break_loop:
4020       break;
4021     }
4022
4023  no_more_source:
4024   if (cmp_status->state != COMPOSING_NO)
4025     {
4026       if (coding->mode & CODING_MODE_LAST_BLOCK)
4027         MAYBE_FINISH_COMPOSITION ();
4028       else
4029         {
4030           charbuf -= cmp_status->length;
4031           for (i = 0; i < cmp_status->length; i++)
4032             cmp_status->carryover[i] = charbuf[i];
4033         }
4034     }
4035   else if (last_id != charset_ascii)
4036     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4037   coding->consumed_char += consumed_chars_base;
4038   coding->consumed = src_base - coding->source;
4039   coding->charbuf_used = charbuf - coding->charbuf;
4040 }
4041
4042
4043 /* ISO2022 encoding stuff.  */
4044
4045 /*
4046    It is not enough to say just "ISO2022" for encoding; we have to
4047    specify more details.  In Emacs, each coding system of the ISO2022
4048    variant has the following specifications:
4049         1. Initial designation to G0 thru G3.
4050         2. Allows short-form designation?
4051         3. ASCII should be designated to G0 before control characters?
4052         4. ASCII should be designated to G0 at end of line?
4053         5. 7-bit environment or 8-bit environment?
4054         6. Use locking-shift?
4055         7. Use Single-shift?
4056    And the following two are only for Japanese:
4057         8. Use ASCII in place of JIS0201-1976-Roman?
4058         9. Use JISX0208-1983 in place of JISX0208-1978?
4059    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4060    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4061    details.
4062 */
4063
4064 /* Produce codes (escape sequence) for designating CHARSET to graphic
4065    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4066    '@', 'A', or 'B' and the coding system CODING allows, produce
4067    designation sequence of short-form.

        The sequence is emitted in this order:
          1. If CODING has CODING_ISO_FLAG_REVISION and CHARSET has a
             non-negative ISO revision: ESC '&' followed by the byte
             '@' + revision.
          2. ESC.
          3. For a 1-dimensional CHARSET, one intermediate byte selected
             by REG from "()*+" (94-character set) or ",-./"
             (96-character set).  Otherwise '$' followed by the
             REG-selected intermediate byte; for a 94-character set that
             byte is left out (the short form) unless
             CODING_ISO_FLAG_LONG_FORM is set, REG is not 0, or the final
             char is outside '@'..'B'.
          4. The final char of CHARSET.
        Afterwards the id of CHARSET is recorded as the designation of
        REG in CODING.

        REG indexes the four-character intermediate tables, so it is
        expected to be in 0..3.  The EMIT_* helpers are defined elsewhere;
        presumably they write through the DST and related output
        variables of the expansion site -- confirm against their
        definitions.  */
4068
4069 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4070   do {                                                                  \
4071     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4072     const char *intermediate_char_94 = "()*+";                          \
4073     const char *intermediate_char_96 = ",-./";                          \
4074     int revision = -1;                                                  \
4075                                                                         \
4076     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4077       revision = CHARSET_ISO_REVISION (charset);                        \
4078                                                                         \
4079     if (revision >= 0)                                                  \
4080       {                                                                 \
4081         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4082         EMIT_ONE_BYTE ('@' + revision);                                 \
4083       }                                                                 \
4084     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4085     if (CHARSET_DIMENSION (charset) == 1)                               \
4086       {                                                                 \
4087         int b;                                                          \
4088         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4089           b = intermediate_char_94[reg];                                \
4090         else                                                            \
4091           b = intermediate_char_96[reg];                                \
4092         EMIT_ONE_ASCII_BYTE (b);                                        \
4093       }                                                                 \
4094     else                                                                \
4095       {                                                                 \
4096         EMIT_ONE_ASCII_BYTE ('$');                                      \
4097         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4098           {                                                             \
4099             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4100                 || reg != 0                                             \
4101                 || final_char < '@' || final_char > 'B')                \
4102               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4103           }                                                             \
4104         else                                                            \
4105           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4106       }                                                                 \
4107     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4108                                                                         \
4109     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4110   } while (0)
4111
4112
4113 /* The following two macros produce codes (control character or escape
4114    sequence) for ISO2022 single-shift functions (single-shift-2 and
4115    single-shift-3).

        ENCODE_SINGLE_SHIFT_2 emits ESC 'N' when CODING_ISO_FLAG_SEVEN_BITS
        is set and the single byte ISO_CODE_SS2 otherwise, then marks the
        coding system as single-shifting.  ENCODE_ISO_CHARACTER_DIMENSION1
        and ENCODE_ISO_CHARACTER_DIMENSION2 test and clear that mark when
        they emit the next character.  `coding' is not a parameter; it is
        taken from the scope where the macro is expanded.  */
4116
4117 #define ENCODE_SINGLE_SHIFT_2                                           \
4118   do {                                                                  \
4119     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4120       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4121     else                                                                \
4122       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4123     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4124   } while (0)
4125
4126
     /* Single-shift-3: emit ESC 'O' when CODING_ISO_FLAG_SEVEN_BITS is set
        and the single byte ISO_CODE_SS3 otherwise, then mark the coding
        system as single-shifting.  `coding' is not a parameter; it is
        taken from the scope where the macro is expanded.  */
4127 #define ENCODE_SINGLE_SHIFT_3                                           \
4128   do {                                                                  \
4129     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4130       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4131     else                                                                \
4132       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4133     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4134   } while (0)
4135
4136
4137 /* The following four macros produce codes (control character or
4138    escape sequence) for ISO2022 locking-shift functions (shift-in,
4139    shift-out, locking-shift-2, and locking-shift-3).

        Each one also records, through CODING_ISO_INVOCATION (coding, 0),
        which graphic register is now invoked to graphic plane 0.
        `coding' is not a parameter; it is taken from the scope where the
        macro is expanded.

        ENCODE_SHIFT_IN emits ISO_CODE_SI and records register 0.  */
4140
4141 #define ENCODE_SHIFT_IN                                 \
4142   do {                                                  \
4143     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4144     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4145   } while (0)
4146
4147
     /* Shift-out: emit ISO_CODE_SO and record that graphic register 1 is
        now invoked to graphic plane 0.  `coding' is taken from the scope
        where the macro is expanded.  */
4148 #define ENCODE_SHIFT_OUT                                \
4149   do {                                                  \
4150     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4151     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4152   } while (0)
4153
4154
     /* Locking-shift-2: emit ESC 'n' and record that graphic register 2
        is now invoked to graphic plane 0.  `coding' is taken from the
        scope where the macro is expanded.  */
4155 #define ENCODE_LOCKING_SHIFT_2                          \
4156   do {                                                  \
4157     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4158     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4159   } while (0)
4160
4161
     /* Locking-shift-3: emit ESC 'o' and record that graphic register 3
        is now invoked to graphic plane 0.  ISO 2022 assigns ESC 'o' to
        locking-shift-3; ESC 'n' is locking-shift-2 (see
        ENCODE_LOCKING_SHIFT_2), so emitting 'n' here would make a decoder
        invoke register 2 while the encoder believes register 3 is
        active.  `coding' is taken from the scope where the macro is
        expanded.  */
4162 #define ENCODE_LOCKING_SHIFT_3                          \
4163   do {                                                  \
4164     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
4165     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4166   } while (0)
4167
4168
4169 /* Produce codes for a DIMENSION1 character whose character set is
4170    CHARSET and whose position-code is C1.  Designation and invocation
4171    sequences are also produced in advance if necessary.

        The body is a retry loop (`do ... while (1)'): a pass either emits
        the byte and leaves with `break', or calls
        encode_invocation_designation and goes around again.  If a single
        shift is pending, the byte is emitted at once (C1 & 0x7F under
        CODING_ISO_FLAG_SEVEN_BITS, C1 | 0x80 otherwise) and the pending
        mark is cleared.  Otherwise the byte is C1 & 0x7F when CHARSET is
        the charset invoked to graphic plane 0, and C1 | 0x80 when it is
        the charset invoked to graphic plane 1.

        When CODING_ISO_FLAG_USE_ROMAN is set and CHARSET is ASCII, CHARSET
        is reassigned to the charset whose id is charset_jisx0201_roman,
        so the CHARSET argument must be an assignable lvalue.  C1 is
        expanded without parentheses, so pass a simple expression.
        `coding', `dst' and `produced_chars' are not parameters; they are
        taken from the scope where the macro is expanded.  */
4172
4173 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4174   do {                                                                  \
4175     int id = CHARSET_ID (charset);                                      \
4176                                                                         \
4177     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4178         && id == charset_ascii)                                         \
4179       {                                                                 \
4180         id = charset_jisx0201_roman;                                    \
4181         charset = CHARSET_FROM_ID (id);                                 \
4182       }                                                                 \
4183                                                                         \
4184     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4185       {                                                                 \
4186         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4187           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4188         else                                                            \
4189           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4190         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4191         break;                                                          \
4192       }                                                                 \
4193     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4194       {                                                                 \
4195         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4196         break;                                                          \
4197       }                                                                 \
4198     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4199       {                                                                 \
4200         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4201         break;                                                          \
4202       }                                                                 \
4203     else                                                                \
4204       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4205          must invoke it, or, at first, designate it to some graphic     \
4206          register.  Then repeat the loop to actually produce the        \
4207          character.  */                                                 \
4208       dst = encode_invocation_designation (charset, coding, dst,        \
4209                                            &produced_chars);            \
4210   } while (1)
4211
4212
4213 /* Produce codes for a DIMENSION2 character whose character set is
4214    CHARSET and whose position-codes are C1 and C2.  Designation and
4215    invocation codes are also produced in advance if necessary.

        The body is a retry loop (`do ... while (1)'): a pass either emits
        the two bytes and leaves with `break', or calls
        encode_invocation_designation and goes around again.  If a single
        shift is pending, the bytes are emitted at once (both masked with
        0x7F under CODING_ISO_FLAG_SEVEN_BITS, both or-ed with 0x80
        otherwise) and the pending mark is cleared.  Otherwise both bytes
        are masked with 0x7F when CHARSET is the charset invoked to
        graphic plane 0, and or-ed with 0x80 when it is the charset
        invoked to graphic plane 1.

        When CODING_ISO_FLAG_USE_OLDJIS is set and CHARSET is the charset
        whose id is charset_jisx0208, CHARSET is reassigned to the one
        whose id is charset_jisx0208_1978, so the CHARSET argument must be
        an assignable lvalue.  `coding', `dst' and `produced_chars' are
        not parameters; they are taken from the scope where the macro is
        expanded.  */
4216
4217 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4218   do {                                                                  \
4219     int id = CHARSET_ID (charset);                                      \
4220                                                                         \
4221     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4222         && id == charset_jisx0208)                                      \
4223       {                                                                 \
4224         id = charset_jisx0208_1978;                                     \
4225         charset = CHARSET_FROM_ID (id);                                 \
4226       }                                                                 \
4227                                                                         \
4228     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4229       {                                                                 \
4230         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4231           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4232         else                                                            \
4233           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4234         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4235         break;                                                          \
4236       }                                                                 \
4237     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4238       {                                                                 \
4239         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4240         break;                                                          \
4241       }                                                                 \
4242     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4243       {                                                                 \
4244         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4245         break;                                                          \
4246       }                                                                 \
4247     else                                                                \
4248       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4249          must invoke it, or, at first, designate it to some graphic     \
4250          register.  Then repeat the loop to actually produce the        \
4251          character.  */                                                 \
4252       dst = encode_invocation_designation (charset, coding, dst,        \
4253                                            &produced_chars);            \
4254   } while (1)
4255
4256
     /* Produce codes for character C of charset CHARSET.  The code point
        is obtained with CODING_ENCODE_CHAR (defined elsewhere) into the
        local `code'.  A CHARSET of dimension 1 is passed on to
        ENCODE_ISO_CHARACTER_DIMENSION1 with the whole code; any other
        dimension is passed on to ENCODE_ISO_CHARACTER_DIMENSION2 with
        `code >> 8' and `code & 0xFF' as the two position-codes.
        CHARSET must be an assignable lvalue because those macros may
        reassign it.  `coding', `dst' and `dst_end' are taken from the
        scope where the macro is expanded.  */
4257 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4258   do {                                                                     \
4259     unsigned code;                                                         \
4260     CODING_ENCODE_CHAR (coding, dst, dst_end, (charset), (c), code);       \
4261                                                                            \
4262     if (CHARSET_DIMENSION (charset) == 1)                                  \
4263       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4264     else                                                                   \
4265       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4266   } while (0)
4267
4268
4269 /* Produce designation and invocation codes at a place pointed by DST
4270    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4271    Return new DST.

        If CHARSET is not designated to any of the four graphic registers,
        it is first designated to the register it requests
        (CODING_ISO_REQUEST), or to register 0 when it requests none.
        Then, unless that register is already invoked to graphic plane 0
        or 1, a shift is emitted for it: shift-in / shift-out for
        registers 0 / 1, and for registers 2 / 3 a single shift when
        CODING_ISO_FLAG_SINGLE_SHIFT is set or a locking shift otherwise.

        *P_NCHARS is read on entry as the running count of produced
        characters and is written back before returning.  */
4272
4273 static unsigned char *
4274 encode_invocation_designation (struct charset *charset,
4275                                struct coding_system *coding,
4276                                unsigned char *dst, ptrdiff_t *p_nchars)
4277 {
       /* MULTIBYTEP is not referenced by name below; presumably it is read
          by the EMIT_* macros, as is PRODUCED_CHARS -- confirm against
          their definitions before renaming either.  */
4278   bool multibytep = coding->dst_multibyte;
4279   ptrdiff_t produced_chars = *p_nchars;
4280   int reg;                      /* graphic register number */
4281   int id = CHARSET_ID (charset);
4282
4283   /* At first, check designations.  */
4284   for (reg = 0; reg < 4; reg++)
4285     if (id == CODING_ISO_DESIGNATION (coding, reg))
4286       break;
4287
4288   if (reg >= 4)
4289     {
4290       /* CHARSET is not yet designated to any graphic registers.  */
4291       /* At first check the requested designation.  */
4292       reg = CODING_ISO_REQUEST (coding, id);
4293       if (reg < 0)
4294         /* Since CHARSET requests no special designation, designate it
4295            to graphic register 0.  */
4296         reg = 0;
4297
4298       ENCODE_DESIGNATION (charset, reg, coding);
4299     }
4300
4301   if (CODING_ISO_INVOCATION (coding, 0) != reg
4302       && CODING_ISO_INVOCATION (coding, 1) != reg)
4303     {
4304       /* Since the graphic register REG is not invoked to any graphic
4305          planes, invoke it to graphic plane 0.  */
4306       switch (reg)
4307         {
4308         case 0:                 /* graphic register 0 */
4309           ENCODE_SHIFT_IN;
4310           break;
4311
4312         case 1:                 /* graphic register 1 */
4313           ENCODE_SHIFT_OUT;
4314           break;
4315
4316         case 2:                 /* graphic register 2 */
4317           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4318             ENCODE_SINGLE_SHIFT_2;
4319           else
4320             ENCODE_LOCKING_SHIFT_2;
4321           break;
4322
4323         case 3:                 /* graphic register 3 */
4324           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4325             ENCODE_SINGLE_SHIFT_3;
4326           else
4327             ENCODE_LOCKING_SHIFT_3;
4328           break;
4329         }
4330     }
4331
       /* Hand the updated character count back to the caller.  */
4332   *p_nchars = produced_chars;
4333   return dst;
4334 }
4335
4336
4337 /* Produce codes for designation and invocation to reset the graphic
4338    planes and registers to initial state.

        First, if graphic plane 0 does not have register 0 invoked, emit
        shift-in.  Then, for each of the four graphic registers that has an
        initial charset (CODING_ISO_INITIAL >= 0) different from its
        current designation, emit a designation of that initial charset.
        Registers without an initial charset are left as they are.
        `coding' is taken from the scope where the macro is expanded.  */
4339 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4340   do {                                                                  \
4341     int reg;                                                            \
4342     struct charset *charset;                                            \
4343                                                                         \
4344     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4345       ENCODE_SHIFT_IN;                                                  \
4346     for (reg = 0; reg < 4; reg++)                                       \
4347       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4348           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4349               != CODING_ISO_INITIAL (coding, reg)))                     \
4350         {                                                               \
4351           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4352           ENCODE_DESIGNATION (charset, reg, coding);                    \
4353         }                                                               \
4354   } while (0)
4355
4356
4357 /* Produce designation sequences of charsets in the line started from
4358    CHARBUF to a place pointed by DST, and return the number of
4359    produced bytes.  DST should not directly point a buffer text area
4360    which may be relocated by char_charset call.
4361
4362    If the current block ends before any end-of-line, we may fail to
4363    find all the necessary designations.

        Scanning stops at the first '\n', at CHARBUF_END, or once four
        registers have been claimed.  For each scanned character the
        charset chosen by char_charset is looked up with
        CODING_ISO_REQUEST; the first charset that requests a given
        register claims it.  A designation is emitted only for a claimed
        register whose current designation in CODING differs.  */
4364
4365 static ptrdiff_t
4366 encode_designation_at_bol (struct coding_system *coding,
4367                            int *charbuf, int *charbuf_end,
4368                            unsigned char *dst)
4369 {
4370   unsigned char *orig = dst;
4371   struct charset *charset;
4372   /* Table of charsets to be designated to each graphic register.  */
4373   int r[4];
4374   int c, found = 0, reg;
       /* PRODUCED_CHARS and MULTIBYTEP are not referenced by name below;
          presumably the EMIT_* macros inside ENCODE_DESIGNATION use them
          -- confirm against their definitions before renaming.  */
4375   ptrdiff_t produced_chars = 0;
4376   bool multibytep = coding->dst_multibyte;
4377   Lisp_Object attrs;
4378   Lisp_Object charset_list;
4379
4380   attrs = CODING_ID_ATTRS (coding->id);
4381   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
       /* The symbol Qiso_2022 stands in for the list held in
          Viso_2022_charset_list.  */
4382   if (EQ (charset_list, Qiso_2022))
4383     charset_list = Viso_2022_charset_list;
4384
       /* -1 means no charset has claimed the register yet.  */
4385   for (reg = 0; reg < 4; reg++)
4386     r[reg] = -1;
4387
4388   while (charbuf < charbuf_end && found < 4)
4389     {
4390       int id;
4391
4392       c = *charbuf++;
4393       if (c == '\n')
4394         break;
4395       charset = char_charset (c, charset_list, NULL);
4396       id = CHARSET_ID (charset);
4397       reg = CODING_ISO_REQUEST (coding, id);
           /* Only the first charset requesting a register is recorded.  */
4398       if (reg >= 0 && r[reg] < 0)
4399         {
4400           found++;
4401           r[reg] = id;
4402         }
4403     }
4404
4405   if (found)
4406     {
4407       for (reg = 0; reg < 4; reg++)
4408         if (r[reg] >= 0
4409             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4410           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4411     }
4412
4413   return dst - orig;
4414 }
4415
4416 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the CODING->charbuf_used elements of CODING->charbuf into
        ISO 2022 byte sequences written from
        CODING->destination + CODING->produced.  A negative element starts
        an annotation rather than a character.  Control characters, ASCII
        characters, raw bytes (CHAR_BYTE8_P) and all other characters are
        handled by separate branches below.

        Before returning, the result is recorded as
        CODING_RESULT_SUCCESS, the beginning-of-line designation state is
        saved with CODING_ISO_BOL, CODING->produced_char is increased by
        the number of characters produced here, and CODING->produced is
        set to the offset of the write position from CODING->destination.
        The return statement visible here always yields 0.
        ASSURE_DESTINATION is defined elsewhere and may leave the function
        by another path -- confirm against its definition.  */
4417
4418 static bool
4419 encode_coding_iso_2022 (struct coding_system *coding)
4420 {
4421   bool multibytep = coding->dst_multibyte;
4422   int *charbuf = coding->charbuf;
4423   int *charbuf_end = charbuf + coding->charbuf_used;
4424   unsigned char *dst = coding->destination + coding->produced;
4425   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Room requested from ASSURE_DESTINATION before each character;
          the same size is used for desig_buf below.  */
4426   int safe_room = 16;
       /* True when designation sequences must be produced before the next
          character.  */
4427   bool bol_designation
4428     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4429        && CODING_ISO_BOL (coding));
4430   ptrdiff_t produced_chars = 0;
4431   Lisp_Object attrs, eol_type, charset_list;
4432   bool ascii_compatible;
4433   int c;
       /* Charset id taken from the latest charset annotation, or -1.  */
4434   int preferred_charset_id = -1;
4435
4436   CODING_GET_INFO (coding, attrs, charset_list);
4437   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
       /* A vector-valued EOL type is treated as Qunix here.  */
4438   if (VECTORP (eol_type))
4439     eol_type = Qunix;
4440
4441   setup_iso_safe_charsets (attrs);
4442   /* Charset list may have been changed.  */
4443   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4444   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4445
       /* ASCII may be emitted as is only when the coding system is marked
          ASCII compatible and uses neither designation nor locking
          shift.  */
4446   ascii_compatible
4447     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4448        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4449                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4450
4451   while (charbuf < charbuf_end)
4452     {
4453       ASSURE_DESTINATION (safe_room);
4454
4455       if (bol_designation)
4456         {
4457           /* We have to produce designation sequences if any now.  */
4458           unsigned char desig_buf[16];
4459           int nbytes;
4460           ptrdiff_t offset;
4461
               /* The sequences go to a local buffer first; if a charset
                  map was loaded meanwhile and the destination moved, DST
                  and DST_END are shifted by the reported offset before
                  the bytes are copied.  */
4462           charset_map_loaded = 0;
4463           nbytes = encode_designation_at_bol (coding, charbuf, charbuf_end,
4464                                               desig_buf);
4465           if (charset_map_loaded
4466               && (offset = coding_change_destination (coding)))
4467             {
4468               dst += offset;
4469               dst_end += offset;
4470             }
4471           memcpy (dst, desig_buf, nbytes);
4472           dst += nbytes;
4473           /* We are sure that designation sequences are all ASCII bytes.  */
4474           produced_chars += nbytes;
4475           bol_designation = 0;
4476           ASSURE_DESTINATION (safe_room);
4477         }
4478
4479       c = *charbuf++;
4480
4481       if (c < 0)
4482         {
4483           /* Handle an annotation.  */
               /* CHARBUF now points at the element after the negative
                  one; that element selects the annotation kind.  */
4484           switch (*charbuf)
4485             {
4486             case CODING_ANNOTATE_COMPOSITION_MASK:
4487               /* Not yet implemented.  */
4488               break;
4489             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Keep the annotated charset only if it is a member of
                      this coding system's charset list.  */
4490               preferred_charset_id = charbuf[2];
4491               if (preferred_charset_id >= 0
4492                   && NILP (Fmemq (make_number (preferred_charset_id),
4493                                   charset_list)))
4494                 preferred_charset_id = -1;
4495               break;
4496             default:
4497               emacs_abort ();
4498             }
               /* Skip the rest of the annotation: -C elements in total,
                  one of which has already been consumed.  */
4499           charbuf += -c - 1;
4500           continue;
4501         }
4502
4503       /* Now encode the character C.  */
4504       if (c < 0x20 || c == 0x7F)
4505         {
4506           if (c == '\n'
4507               || (c == '\r' && EQ (eol_type, Qmac)))
4508             {
4509               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4510                 ENCODE_RESET_PLANE_AND_REGISTER ();
4511               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4512                 {
4513                   int i;
4514
                       /* Only the recorded designations are reset here;
                          no bytes are emitted.  */
4515                   for (i = 0; i < 4; i++)
4516                     CODING_ISO_DESIGNATION (coding, i)
4517                       = CODING_ISO_INITIAL (coding, i);
4518                 }
4519               bol_designation = ((CODING_ISO_FLAGS (coding)
4520                                   & CODING_ISO_FLAG_DESIGNATE_AT_BOL)
4521                                  != 0);
4522             }
4523           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4524             ENCODE_RESET_PLANE_AND_REGISTER ();
4525           EMIT_ONE_ASCII_BYTE (c);
4526         }
4527       else if (ASCII_CHAR_P (c))
4528         {
4529           if (ascii_compatible)
4530             EMIT_ONE_ASCII_BYTE (c);
4531           else
4532             {
4533               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4534               ENCODE_ISO_CHARACTER (charset, c);
4535             }
4536         }
4537       else if (CHAR_BYTE8_P (c))
4538         {
               /* A raw byte is emitted as that byte.  */
4539           c = CHAR_TO_BYTE8 (c);
4540           EMIT_ONE_BYTE (c);
4541         }
4542       else
4543         {
4544           struct charset *charset;
4545
               /* Try the annotated charset first; fall back to a search
                  of the charset list when it does not contain C.  */
4546           if (preferred_charset_id >= 0)
4547             {
4548               bool result;
4549
4550               charset = CHARSET_FROM_ID (preferred_charset_id);
4551               CODING_CHAR_CHARSET_P (coding, dst, dst_end, c, charset, result);
4552               if (! result)
4553                 CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4554                                      NULL, charset);
4555             }
4556           else
4557             CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4558                                  NULL, charset);
4559           if (!charset)
4560             {
                   /* No charset found for C: substitute
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION in ASCII under
                      safe encoding, otherwise the coding system's default
                      character in whatever charset is found for it.  */
4561               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4562                 {
4563                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4564                   charset = CHARSET_FROM_ID (charset_ascii);
4565                 }
4566               else
4567                 {
4568                   c = coding->default_char;
4569                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
4570                                        charset_list, NULL, charset);
4571                 }
4572             }
4573           ENCODE_ISO_CHARACTER (charset, c);
4574         }
4575     }
4576
       /* At the end of the last block, reset planes and registers when
          the coding system resets at end of line.  */
4577   if (coding->mode & CODING_MODE_LAST_BLOCK
4578       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4579     {
4580       ASSURE_DESTINATION (safe_room);
4581       ENCODE_RESET_PLANE_AND_REGISTER ();
4582     }
4583   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4584   CODING_ISO_BOL (coding) = bol_designation;
4585   coding->produced_char += produced_chars;
4586   coding->produced = dst - coding->destination;
4587   return 0;
4588 }
4589
4590 \f
/*** 8. SJIS and BIG5 handlers ***/
4592
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them in the bare
   C code.  But, in the future, they may be supported only by CCL.  */
4596
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
4603
4604    --- CODE RANGE of SJIS ---
4605    (character set)      (range)
4606    ASCII                0x00 .. 0x7F
4607    KATAKANA-JISX0201    0xA0 .. 0xDF
4608    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4609             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4610    -------------------------------
4611
4612 */
4613
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4617
4618    --- CODE RANGE of BIG5 ---
4619    (character set)      (range)
4620    ASCII                0x00 .. 0x7F
4621    Big5 (1st byte)      0xA1 .. 0xFE
4622         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4623    --------------------------
4624
4625   */
4626
4627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4628    Return true if a text is encoded in SJIS.  */
4629
4630 static bool
4631 detect_coding_sjis (struct coding_system *coding,
4632                     struct coding_detection_info *detect_info)
4633 {
4634   const unsigned char *src = coding->source, *src_base;
4635   const unsigned char *src_end = coding->source + coding->src_bytes;
4636   bool multibytep = coding->src_multibyte;
4637   ptrdiff_t consumed_chars = 0;
4638   int found = 0;
4639   int c;
4640   Lisp_Object attrs, charset_list;
4641   int max_first_byte_of_2_byte_code;
4642
4643   CODING_GET_INFO (coding, attrs, charset_list);
4644   max_first_byte_of_2_byte_code
4645     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4646
4647   detect_info->checked |= CATEGORY_MASK_SJIS;
4648   /* A coding system of this category is always ASCII compatible.  */
4649   src += coding->head_ascii;
4650
4651   while (1)
4652     {
4653       src_base = src;
4654       ONE_MORE_BYTE (c);
4655       if (c < 0x80)
4656         continue;
4657       if ((c >= 0x81 && c <= 0x9F)
4658           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4659         {
4660           ONE_MORE_BYTE (c);
4661           if (c < 0x40 || c == 0x7F || c > 0xFC)
4662             break;
4663           found = CATEGORY_MASK_SJIS;
4664         }
4665       else if (c >= 0xA0 && c < 0xE0)
4666         found = CATEGORY_MASK_SJIS;
4667       else
4668         break;
4669     }
4670   detect_info->rejected |= CATEGORY_MASK_SJIS;
4671   return 0;
4672
4673  no_more_source:
4674   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4675     {
4676       detect_info->rejected |= CATEGORY_MASK_SJIS;
4677       return 0;
4678     }
4679   detect_info->found |= found;
4680   return 1;
4681 }
4682
4683 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4684    Return true if a text is encoded in BIG5.  */
4685
4686 static bool
4687 detect_coding_big5 (struct coding_system *coding,
4688                     struct coding_detection_info *detect_info)
4689 {
4690   const unsigned char *src = coding->source, *src_base;
4691   const unsigned char *src_end = coding->source + coding->src_bytes;
4692   bool multibytep = coding->src_multibyte;
4693   ptrdiff_t consumed_chars = 0;
4694   int found = 0;
4695   int c;
4696
4697   detect_info->checked |= CATEGORY_MASK_BIG5;
4698   /* A coding system of this category is always ASCII compatible.  */
4699   src += coding->head_ascii;
4700
4701   while (1)
4702     {
4703       src_base = src;
4704       ONE_MORE_BYTE (c);
4705       if (c < 0x80)
4706         continue;
4707       if (c >= 0xA1)
4708         {
4709           ONE_MORE_BYTE (c);
4710           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4711             return 0;
4712           found = CATEGORY_MASK_BIG5;
4713         }
4714       else
4715         break;
4716     }
4717   detect_info->rejected |= CATEGORY_MASK_BIG5;
4718   return 0;
4719
4720  no_more_source:
4721   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4722     {
4723       detect_info->rejected |= CATEGORY_MASK_BIG5;
4724       return 0;
4725     }
4726   detect_info->found |= found;
4727   return 1;
4728 }
4729
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode SJIS bytes of CODING's source, starting at coding->consumed,
   into character codes appended to coding->charbuf.  A byte sequence
   that is not valid SJIS is passed through one byte at a time and
   counted in coding->errors.  On return coding->consumed,
   coding->consumed_char and coding->charbuf_used are updated.  */

static void
decode_coding_sjis (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code being decoded; decoding is rolled back to here
     on an invalid code, and this is what gets reported as consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  /* Character position of the next character to produce, and the
     position where the current run of charset LAST_ID started.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte already read after a '\r' when EOL_DOS, or -1 if none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  /* The charset list holds, in this order: the charset for bytes
     below 0x80, the one-byte kana charset, the two-byte kanji charset,
     and optionally a second two-byte kanji charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The look-ahead byte has not been decoded yet; do not
             report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      /* NOTE(review): a negative value presumably means ONE_MORE_BYTE
         met a multibyte character that is not a raw byte -- confirm
         against the macro definition.  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* Read the byte following a CR right away; if the source
             ends here, the CR itself is left unconsumed.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          /* Here C is in 0x81..0x9F or 0xE0..0xEF.  */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          /* Here C is in 0xF0..0xFC; only valid when a fourth charset
             was given.  */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* On a switch to a different non-ASCII charset, close the
         annotation for the previous run and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending code and pass through
         just its first byte: as a raw-byte character, or as the
         negated value when ONE_MORE_BYTE yielded a negative one.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by the `break' above and, presumably, by ONE_MORE_BYTE
     when the source is exhausted.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4849
/* Decode BIG5 bytes of CODING's source, starting at coding->consumed,
   into character codes appended to coding->charbuf.  A byte sequence
   that is not valid BIG5 is passed through one byte at a time and
   counted in coding->errors.  On return coding->consumed,
   coding->consumed_char and coding->charbuf_used are updated.  */
static void
decode_coding_big5 (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the code being decoded; decoding is rolled back to here
     on an invalid code, and this is what gets reported as consumed.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  /* Character position of the next character to produce, and the
     position where the current run of charset LAST_ID started.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte already read after a '\r' when EOL_DOS, or -1 if none.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* The charset list holds the charset for bytes below 0x80 followed
     by the two-byte Big5 charset.  */
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* The look-ahead byte has not been decoded yet; do not
             report it as consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      /* NOTE(review): a negative value presumably means ONE_MORE_BYTE
         met a multibyte character that is not a raw byte -- confirm
         against the macro definition.  */
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* Read the byte following a CR right away; if the source
             ends here, the CR itself is left unconsumed.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          /* First byte must be in 0xA1..0xFE, second byte in
             0x40..0x7E or 0xA1..0xFE.  */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* On a switch to a different non-ASCII charset, close the
         annotation for the previous run and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Rewind to the start of the offending code and pass through
         just its first byte: as a raw-byte character, or as the
         negated value when ONE_MORE_BYTE yielded a negative one.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* Reached by the `break' above and, presumably, by ONE_MORE_BYTE
     when the source is exhausted.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4945
4946 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4947    This function can encode charsets `ascii', `katakana-jisx0201',
4948    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4949    are sure that all these charsets are registered as official charset
4950    (i.e. do not have extended leading-codes).  Characters of other
4951    charsets are produced without any encoding.  */
4952
4953 static bool
4954 encode_coding_sjis (struct coding_system *coding)
4955 {
4956   bool multibytep = coding->dst_multibyte;
4957   int *charbuf = coding->charbuf;
4958   int *charbuf_end = charbuf + coding->charbuf_used;
4959   unsigned char *dst = coding->destination + coding->produced;
4960   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4961   int safe_room = 4;
4962   ptrdiff_t produced_chars = 0;
4963   Lisp_Object attrs, charset_list, val;
4964   bool ascii_compatible;
4965   struct charset *charset_kanji, *charset_kana;
4966   struct charset *charset_kanji2;
4967   int c;
4968
4969   CODING_GET_INFO (coding, attrs, charset_list);
4970   val = XCDR (charset_list);
4971   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4972   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4973   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4974
4975   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4976
4977   while (charbuf < charbuf_end)
4978     {
4979       ASSURE_DESTINATION (safe_room);
4980       c = *charbuf++;
4981       /* Now encode the character C.  */
4982       if (ASCII_CHAR_P (c) && ascii_compatible)
4983         EMIT_ONE_ASCII_BYTE (c);
4984       else if (CHAR_BYTE8_P (c))
4985         {
4986           c = CHAR_TO_BYTE8 (c);
4987           EMIT_ONE_BYTE (c);
4988         }
4989       else
4990         {
4991           unsigned code;
4992           struct charset *charset;
4993           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
4994                                &code, charset);
4995
4996           if (!charset)
4997             {
4998               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4999                 {
5000                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5001                   charset = CHARSET_FROM_ID (charset_ascii);
5002                 }
5003               else
5004                 {
5005                   c = coding->default_char;
5006                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5007                                        charset_list, &code, charset);
5008                 }
5009             }
5010           if (code == CHARSET_INVALID_CODE (charset))
5011             emacs_abort ();
5012           if (charset == charset_kanji)
5013             {
5014               int c1, c2;
5015               JIS_TO_SJIS (code);
5016               c1 = code >> 8, c2 = code & 0xFF;
5017               EMIT_TWO_BYTES (c1, c2);
5018             }
5019           else if (charset == charset_kana)
5020             EMIT_ONE_BYTE (code | 0x80);
5021           else if (charset_kanji2 && charset == charset_kanji2)
5022             {
5023               int c1, c2;
5024
5025               c1 = code >> 8;
5026               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5027                   || c1 == 0x28
5028                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5029                 {
5030                   JIS_TO_SJIS2 (code);
5031                   c1 = code >> 8, c2 = code & 0xFF;
5032                   EMIT_TWO_BYTES (c1, c2);
5033                 }
5034               else
5035                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5036             }
5037           else
5038             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5039         }
5040     }
5041   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5042   coding->produced_char += produced_chars;
5043   coding->produced = dst - coding->destination;
5044   return 0;
5045 }
5046
5047 static bool
5048 encode_coding_big5 (struct coding_system *coding)
5049 {
5050   bool multibytep = coding->dst_multibyte;
5051   int *charbuf = coding->charbuf;
5052   int *charbuf_end = charbuf + coding->charbuf_used;
5053   unsigned char *dst = coding->destination + coding->produced;
5054   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5055   int safe_room = 4;
5056   ptrdiff_t produced_chars = 0;
5057   Lisp_Object attrs, charset_list, val;
5058   bool ascii_compatible;
5059   struct charset *charset_big5;
5060   int c;
5061
5062   CODING_GET_INFO (coding, attrs, charset_list);
5063   val = XCDR (charset_list);
5064   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5065   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5066
5067   while (charbuf < charbuf_end)
5068     {
5069       ASSURE_DESTINATION (safe_room);
5070       c = *charbuf++;
5071       /* Now encode the character C.  */
5072       if (ASCII_CHAR_P (c) && ascii_compatible)
5073         EMIT_ONE_ASCII_BYTE (c);
5074       else if (CHAR_BYTE8_P (c))
5075         {
5076           c = CHAR_TO_BYTE8 (c);
5077           EMIT_ONE_BYTE (c);
5078         }
5079       else
5080         {
5081           unsigned code;
5082           struct charset *charset;
5083           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5084                                &code, charset);
5085
5086           if (! charset)
5087             {
5088               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5089                 {
5090                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5091                   charset = CHARSET_FROM_ID (charset_ascii);
5092                 }
5093               else
5094                 {
5095                   c = coding->default_char;
5096                   CODING_CHAR_CHARSET (coding, dst, dst_end, c,
5097                                        charset_list, &code, charset);
5098                 }
5099             }
5100           if (code == CHARSET_INVALID_CODE (charset))
5101             emacs_abort ();
5102           if (charset == charset_big5)
5103             {
5104               int c1, c2;
5105
5106               c1 = code >> 8, c2 = code & 0xFF;
5107               EMIT_TWO_BYTES (c1, c2);
5108             }
5109           else
5110             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5111         }
5112     }
5113   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5114   coding->produced_char += produced_chars;
5115   coding->produced = dst - coding->destination;
5116   return 0;
5117 }
5118
5119 \f
/*** 9. CCL handlers ***/
5121
5122 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5123    Return true if a text is encoded in a coding system of which
5124    encoder/decoder are written in CCL program.  */
5125
5126 static bool
5127 detect_coding_ccl (struct coding_system *coding,
5128                    struct coding_detection_info *detect_info)
5129 {
5130   const unsigned char *src = coding->source, *src_base;
5131   const unsigned char *src_end = coding->source + coding->src_bytes;
5132   bool multibytep = coding->src_multibyte;
5133   ptrdiff_t consumed_chars = 0;
5134   int found = 0;
5135   unsigned char *valids;
5136   ptrdiff_t head_ascii = coding->head_ascii;
5137   Lisp_Object attrs;
5138
5139   detect_info->checked |= CATEGORY_MASK_CCL;
5140
5141   coding = &coding_categories[coding_category_ccl];
5142   valids = CODING_CCL_VALIDS (coding);
5143   attrs = CODING_ID_ATTRS (coding->id);
5144   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5145     src += head_ascii;
5146
5147   while (1)
5148     {
5149       int c;
5150
5151       src_base = src;
5152       ONE_MORE_BYTE (c);
5153       if (c < 0 || ! valids[c])
5154         break;
5155       if ((valids[c] > 1))
5156         found = CATEGORY_MASK_CCL;
5157     }
5158   detect_info->rejected |= CATEGORY_MASK_CCL;
5159   return 0;
5160
5161  no_more_source:
5162   detect_info->found |= found;
5163   return 1;
5164 }
5165
/* Decode CODING's source, starting at coding->consumed, by running
   the coding system's CCL program over it in chunks of at most 1024
   characters.  The output is appended to coding->charbuf.  The
   conversion result recorded in CODING is derived from the final CCL
   status, and coding->consumed, coding->consumed_char and
   coding->charbuf_used are updated.  */
static void
decode_coding_ccl (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  ptrdiff_t consumed_chars = 0;
  bool multibytep = coding->src_multibyte;
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  /* Input handed to ccl_driver: one element per source character.  */
  int source_charbuf[1024];
  /* For a multibyte source, source_byteidx[N] is the byte offset from
     SRC of the N-th character in source_charbuf; the extra element
     holds the offset just past the last character.  */
  int source_byteidx[1025];
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);

  while (1)
    {
      const unsigned char *p = src;
      ptrdiff_t offset;
      int i = 0;

      /* Load the next chunk of the source into source_charbuf.  */
      if (multibytep)
        {
          while (i < 1024 && p < src_end)
            {
              source_byteidx[i] = p - src;
              source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
            }
          source_byteidx[i] = p - src;
        }
      else
        while (i < 1024 && p < src_end)
          source_charbuf[i++] = *p++;

      /* Flag the last block only once the chunk reaches the end of
         the source.  */
      if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
        ccl->last_block = true;
      /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
      charset_map_loaded = 0;
      ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
                  charset_list);
      /* Shift the source pointers by the offset coding_change_source
         reports, if any.  */
      if (charset_map_loaded
          && (offset = coding_change_source (coding)))
        {
          p += offset;
          src += offset;
          src_end += offset;
        }
      charbuf += ccl->produced;
      /* Advance SRC by the bytes that correspond to the characters the
         CCL program consumed.  */
      if (multibytep)
        src += source_byteidx[ccl->consumed];
      else
        src += ccl->consumed;
      consumed_chars += ccl->consumed;
      /* Go on only if source remains and the program stopped just
         because this chunk ran out.  */
      if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
        break;
    }

  /* Translate the final CCL status into a conversion result.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }
  coding->consumed_char += consumed_chars;
  coding->consumed = src - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5244
/* Encode the characters in coding->charbuf by running the coding
   system's CCL program over them, and append the low eight bits of
   each value it produces to the destination.  The conversion result
   recorded in CODING is derived from the final CCL status.  Always
   returns false.  */
static bool
encode_coding_ccl (struct coding_system *coding)
{
  struct ccl_program *ccl = &coding->spec.ccl->ccl;
  bool multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Output of one ccl_driver run; at most 1024 values.  */
  int destination_charbuf[1024];
  ptrdiff_t produced_chars = 0;
  int i;
  Lisp_Object attrs, charset_list;

  CODING_GET_INFO (coding, attrs, charset_list);
  /* Flag the last block only when the whole source has already been
     consumed.  */
  if (coding->consumed_char == coding->src_chars
      && coding->mode & CODING_MODE_LAST_BLOCK)
    ccl->last_block = true;

  do
    {
      ptrdiff_t offset;

      /* As ccl_driver calls DECODE_CHAR, buffer may be relocated.  */
      charset_map_loaded = 0;
      ccl_driver (ccl, charbuf, destination_charbuf,
                  charbuf_end - charbuf, 1024, charset_list);
      /* Shift DST by the offset coding_change_destination reports, if
         any.  */
      if (charset_map_loaded
          && (offset = coding_change_destination (coding)))
        dst += offset;
      if (multibytep)
        {
          /* Room for two bytes per produced value is requested here.
             EMIT_ONE_BYTE is expected to count the produced
             characters itself -- confirm against the macro.  */
          ASSURE_DESTINATION (ccl->produced * 2);
          for (i = 0; i < ccl->produced; i++)
            EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
        }
      else
        {
          ASSURE_DESTINATION (ccl->produced);
          for (i = 0; i < ccl->produced; i++)
            *dst++ = destination_charbuf[i] & 0xFF;
          produced_chars += ccl->produced;
        }
      charbuf += ccl->consumed;
      /* Stop at once if the program quit or hit an invalid command.  */
      if (ccl->status == CCL_STAT_QUIT
          || ccl->status == CCL_STAT_INVALID_CMD)
        break;
    }
  while (charbuf < charbuf_end);

  /* Translate the final CCL status into a conversion result.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      record_conversion_result (coding, CODING_RESULT_INTERRUPT);
      break;
    default:
      record_conversion_result (coding, CODING_RESULT_SUCCESS);
      break;
    }

  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5316
5317 \f
5318 /*** 10, 11. no-conversion handlers ***/
5319
5320 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5321
5322 static void
5323 decode_coding_raw_text (struct coding_system *coding)
5324 {
5325   bool eol_dos
5326     = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5327
5328   coding->chars_at_source = 1;
5329   coding->consumed_char = coding->src_chars;
5330   coding->consumed = coding->src_bytes;
5331   if (eol_dos && coding->source[coding->src_bytes - 1] == '\r')
5332     {
5333       coding->consumed_char--;
5334       coding->consumed--;
5335       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5336     }
5337   else
5338     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5339 }
5340
5341 static bool
5342 encode_coding_raw_text (struct coding_system *coding)
5343 {
5344   bool multibytep = coding->dst_multibyte;
5345   int *charbuf = coding->charbuf;
5346   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5347   unsigned char *dst = coding->destination + coding->produced;
5348   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5349   ptrdiff_t produced_chars = 0;
5350   int c;
5351
5352   if (multibytep)
5353     {
5354       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5355
5356       if (coding->src_multibyte)
5357         while (charbuf < charbuf_end)
5358           {
5359             ASSURE_DESTINATION (safe_room);
5360             c = *charbuf++;
5361             if (ASCII_CHAR_P (c))
5362               EMIT_ONE_ASCII_BYTE (c);
5363             else if (CHAR_BYTE8_P (c))
5364               {
5365                 c = CHAR_TO_BYTE8 (c);
5366                 EMIT_ONE_BYTE (c);
5367               }
5368             else
5369               {
5370                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5371
5372                 CHAR_STRING_ADVANCE (c, p1);
5373                 do
5374                   {
5375                     EMIT_ONE_BYTE (*p0);
5376                     p0++;
5377                   }
5378                 while (p0 < p1);
5379               }
5380           }
5381       else
5382         while (charbuf < charbuf_end)
5383           {
5384             ASSURE_DESTINATION (safe_room);
5385             c = *charbuf++;
5386             EMIT_ONE_BYTE (c);
5387           }
5388     }
5389   else
5390     {
5391       if (coding->src_multibyte)
5392         {
5393           int safe_room = MAX_MULTIBYTE_LENGTH;
5394
5395           while (charbuf < charbuf_end)
5396             {
5397               ASSURE_DESTINATION (safe_room);
5398               c = *charbuf++;
5399               if (ASCII_CHAR_P (c))
5400                 *dst++ = c;
5401               else if (CHAR_BYTE8_P (c))
5402                 *dst++ = CHAR_TO_BYTE8 (c);
5403               else
5404                 CHAR_STRING_ADVANCE (c, dst);
5405             }
5406         }
5407       else
5408         {
5409           ASSURE_DESTINATION (charbuf_end - charbuf);
5410           while (charbuf < charbuf_end && dst < dst_end)
5411             *dst++ = *charbuf++;
5412         }
5413       produced_chars = dst - (coding->destination + coding->produced);
5414     }
5415   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5416   coding->produced_char += produced_chars;
5417   coding->produced = dst - coding->destination;
5418   return 0;
5419 }
5420
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Return true if a text is encoded in a charset-based coding system.

   The bytes examined are those of CODING's source, but the table of
   valid bytes is taken from the coding system currently registered
   in coding_categories[coding_category_charset].

   DETECT_INFO->checked always gets CATEGORY_MASK_CHARSET.  When the
   function returns true, DETECT_INFO->found gets that mask as well,
   but only if at least one byte >= 0x80 was accepted.  When it
   returns false, DETECT_INFO->rejected gets the mask.  */

static bool
detect_coding_charset (struct coding_system *coding,
                       struct coding_detection_info *detect_info)
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  bool multibytep = coding->src_multibyte;
  ptrdiff_t consumed_chars = 0;
  Lisp_Object attrs, valids, name;
  int found = 0;
  ptrdiff_t head_ascii = coding->head_ascii;
  bool check_latin_extra = 0;

  detect_info->checked |= CATEGORY_MASK_CHARSET;

  /* From here on CODING designates the coding system of the charset
     category, not the caller's object.  Everything needed from the
     caller (source pointers, multibyteness, head_ascii) was copied
     into locals above.  */
  coding = &coding_categories[coding_category_charset];
  attrs = CODING_ID_ATTRS (coding->id);
  valids = AREF (attrs, coding_attr_charset_valids);
  name = CODING_ID_NAME (coding->id);
  /* Coding systems named iso-8859-* or iso-latin-* get an extra check
     on bytes in the range 0x80..0x9F (see the loop below).  */
  if (strncmp (SSDATA (SYMBOL_NAME (name)),
               "iso-8859-", sizeof ("iso-8859-") - 1) == 0
      || strncmp (SSDATA (SYMBOL_NAME (name)),
                  "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
    check_latin_extra = 1;

  /* For an ASCII compatible coding system, skip the leading bytes
     that were counted in head_ascii.  */
  if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
    src += head_ascii;

  /* Each iteration validates one character: a leading byte looked up
     in VALIDS, followed by as many trailing bytes as the dimension of
     the charset requires.  Leaving the loop with `break' or through
     `too_short' rejects the category.
     NOTE(review): the only way to reach `no_more_source' appears to
     be ONE_MORE_BYTE (defined elsewhere in this file) jumping there
     when SRC reaches SRC_END -- confirm against the macro.  */
  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim, idx;

      src_base = src;
      ONE_MORE_BYTE (c);
      /* NOTE(review): a negative C is presumably how ONE_MORE_BYTE
         reports a multibyte character that is not a raw byte; it is
         skipped without validation -- confirm against the macro.  */
      if (c < 0)
        continue;
      val = AREF (valids, c);
      /* A nil entry means C can't start any character.  */
      if (NILP (val))
        break;
      if (c >= 0x80)
        {
          /* With CHECK_LATIN_EXTRA set, a byte in 0x80..0x9F is
             accepted only if Vlatin_extra_code_table is a vector with
             a non-nil entry for it.  */
          if (c < 0xA0
              && check_latin_extra
              && (!VECTORP (Vlatin_extra_code_table)
                  || NILP (AREF (Vlatin_extra_code_table, c))))
            break;
          found = CATEGORY_MASK_CHARSET;
        }
      if (INTEGERP (val))
        {
          /* VAL is a single charset ID.  Check each trailing byte
             against the code space bounds of its position; a source
             that ends in the middle of a character is rejected.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          for (idx = 1; idx < dim; idx++)
            {
              if (src == src_end)
                goto too_short;
              ONE_MORE_BYTE (c);
              if (c < charset->code_space[(dim - 1 - idx) * 4]
                  || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                break;
            }
          if (idx < dim)
            break;
        }
      else
        {
          /* VAL is a list of candidate charset IDs.  IDX is not reset
             between candidates, so the count of trailing bytes that
             were already accepted carries over to the next one.  */
          idx = 1;
          for (; CONSP (val); val = XCDR (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              while (idx < dim)
                {
                  if (src == src_end)
                    goto too_short;
                  ONE_MORE_BYTE (c);
                  if (c < charset->code_space[(dim - 1 - idx) * 4]
                      || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
                    break;
                  idx++;
                }
              if (idx == dim)
                {
                  /* This candidate matched.  */
                  val = Qnil;
                  break;
                }
            }
          /* NOTE(review): every exit from the loop above leaves VAL a
             non-cons (Qnil after a match, the end of the list
             otherwise), so this test appears never to succeed and an
             exhausted candidate list is not rejected.  Confirm
             whether rejection was intended here.  */
          if (CONSP (val))
            break;
        }
    }
 too_short:
  detect_info->rejected |= CATEGORY_MASK_CHARSET;
  return 0;

 no_more_source:
  detect_info->found |= found;
  return 1;
}
5526
/* Decode the source of CODING, which is encoded in a charset-based
   coding system, into CODING->charbuf.

   Decoding starts at CODING->source + CODING->consumed and appends to
   CODING->charbuf at index CODING->charbuf_used.  Each leading byte
   is looked up in the `coding_attr_charset_valids' attribute, which
   yields either a charset ID or a list of candidate charset IDs.  A
   byte sequence that can't be decoded contributes only its first
   byte (see the label `invalid_code') and increments CODING->errors.
   A charset annotation is added for each run of characters that
   belong to the same non-ASCII charset.

   On return, CODING->consumed, CODING->consumed_char, and
   CODING->charbuf_used are updated.  */

static void
decode_coding_charset (struct coding_system *coding)
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  ptrdiff_t consumed_chars = 0, consumed_chars_base;
  bool multibytep = coding->src_multibyte;
  Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
  Lisp_Object valids;
  /* CHAR_OFFSET counts the characters produced so far; LAST_OFFSET is
     its value where the current run of charset LAST_ID started.  */
  ptrdiff_t char_offset = coding->produced_char;
  ptrdiff_t last_offset = char_offset;
  int last_id = charset_ascii;
  bool eol_dos
    = !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* The byte read ahead after a CR, or -1 if there is none pending.  */
  int byte_after_cr = -1;

  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      /* Number of bytes accumulated in CODE so far.  */
      int len = 1;
      unsigned code;

      /* Remember where this character starts, so that the state can
         be restored at `invalid_code' and reported at
         `no_more_source'.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No room left in CHARBUF.  A pending look-ahead byte was
             already read from the source, so step back over it to
             leave it unconsumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          /* Use the byte that was read ahead in the previous
             iteration.  */
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          /* With DOS end-of-line conversion, fetch the byte following
             a CR right away and keep it for the next iteration.
             NOTE(review): presumably this leaves a CR at the very end
             of the available source unconsumed (ONE_MORE_BYTE would
             jump to `no_more_source' while SRC_BASE still points at
             the CR) -- confirm against the macro.  */
          if (eol_dos && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          /* VAL is a single charset ID.  Collect as many bytes as the
             dimension of the charset, most significant byte first.  */
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE are kept from the previous candidate, so
                 only the additional bytes are read here.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* A non-negative C means this candidate decoded CODE.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      /* When a run of a different non-ASCII charset starts, close the
         previous run with a charset annotation (unless that run was
         ASCII) and start counting anew.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and consume
         just one byte of it.  A negative C is stored negated, an
         ASCII byte as is, and any other byte through
         BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the last run if it was non-ASCII, then report how much was
     consumed up to the start of the last incomplete character.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5654
5655 static bool
5656 encode_coding_charset (struct coding_system *coding)
5657 {
5658   bool multibytep = coding->dst_multibyte;
5659   int *charbuf = coding->charbuf;
5660   int *charbuf_end = charbuf + coding->charbuf_used;
5661   unsigned char *dst = coding->destination + coding->produced;
5662   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5663   int safe_room = MAX_MULTIBYTE_LENGTH;
5664   ptrdiff_t produced_chars = 0;
5665   Lisp_Object attrs, charset_list;
5666   bool ascii_compatible;
5667   int c;
5668
5669   CODING_GET_INFO (coding, attrs, charset_list);
5670   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5671
5672   while (charbuf < charbuf_end)
5673     {
5674       struct charset *charset;
5675       unsigned code;
5676
5677       ASSURE_DESTINATION (safe_room);
5678       c = *charbuf++;
5679       if (ascii_compatible && ASCII_CHAR_P (c))
5680         EMIT_ONE_ASCII_BYTE (c);
5681       else if (CHAR_BYTE8_P (c))
5682         {
5683           c = CHAR_TO_BYTE8 (c);
5684           EMIT_ONE_BYTE (c);
5685         }
5686       else
5687         {
5688           CODING_CHAR_CHARSET (coding, dst, dst_end, c, charset_list,
5689                                &code, charset);
5690
5691           if (charset)
5692             {
5693               if (CHARSET_DIMENSION (charset) == 1)
5694                 EMIT_ONE_BYTE (code);
5695               else if (CHARSET_DIMENSION (charset) == 2)
5696                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5697               else if (CHARSET_DIMENSION (charset) == 3)
5698                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5699               else
5700                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5701                                  (code >> 8) & 0xFF, code & 0xFF);
5702             }
5703           else
5704             {
5705               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5706                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5707               else
5708                 c = coding->default_char;
5709               EMIT_ONE_BYTE (c);
5710             }
5711         }
5712     }
5713
5714   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5715   coding->produced_char += produced_chars;
5716   coding->produced = dst - coding->destination;
5717   return 0;
5718 }
5719
5720 \f
/*** 10. C library functions ***/
5722
5723 /* Setup coding context CODING from information about CODING_SYSTEM.
5724    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5725    CODING_SYSTEM is invalid, signal an error.  */
5726
5727 void
5728 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5729 {
5730   Lisp_Object attrs;
5731   Lisp_Object eol_type;
5732   Lisp_Object coding_type;
5733   Lisp_Object val;
5734
5735   if (NILP (coding_system))
5736     coding_system = Qundecided;
5737
5738   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5739
5740   attrs = CODING_ID_ATTRS (coding->id);
5741   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5742
5743   coding->mode = 0;
5744   if (VECTORP (eol_type))
5745     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5746                             | CODING_REQUIRE_DETECTION_MASK);
5747   else if (! EQ (eol_type, Qunix))
5748     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5749                             | CODING_REQUIRE_ENCODING_MASK);
5750   else
5751     coding->common_flags = 0;
5752   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5753     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5754   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5755     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5756   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5757     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5758
5759   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5760   coding->max_charset_id = SCHARS (val) - 1;
5761   coding->safe_charsets = SDATA (val);
5762   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5763   coding->carryover_bytes = 0;
5764   coding->raw_destination = 0;
5765
5766   coding_type = CODING_ATTR_TYPE (attrs);
5767   if (EQ (coding_type, Qundecided))
5768     {
5769       coding->detector = NULL;
5770       coding->decoder = decode_coding_raw_text;
5771       coding->encoder = encode_coding_raw_text;
5772       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5773       coding->spec.undecided.inhibit_nbd
5774         = (encode_inhibit_flag
5775            (AREF (attrs, coding_attr_undecided_inhibit_null_byte_detection)));
5776       coding->spec.undecided.inhibit_ied
5777         = (encode_inhibit_flag
5778            (AREF (attrs, coding_attr_undecided_inhibit_iso_escape_detection)));
5779       coding->spec.undecided.prefer_utf_8
5780         = ! NILP (AREF (attrs, coding_attr_undecided_prefer_utf_8));
5781     }
5782   else if (EQ (coding_type, Qiso_2022))
5783     {
5784       int i;
5785       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5786
5787       /* Invoke graphic register 0 to plane 0.  */
5788       CODING_ISO_INVOCATION (coding, 0) = 0;
5789       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5790       CODING_ISO_INVOCATION (coding, 1)
5791         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5792       /* Setup the initial status of designation.  */
5793       for (i = 0; i < 4; i++)
5794         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5795       /* Not single shifting initially.  */
5796       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5797       /* Beginning of buffer should also be regarded as bol. */
5798       CODING_ISO_BOL (coding) = 1;
5799       coding->detector = detect_coding_iso_2022;
5800       coding->decoder = decode_coding_iso_2022;
5801       coding->encoder = encode_coding_iso_2022;
5802       if (flags & CODING_ISO_FLAG_SAFE)
5803         coding->mode |= CODING_MODE_SAFE_ENCODING;
5804       coding->common_flags
5805         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5806             | CODING_REQUIRE_FLUSHING_MASK);
5807       if (flags & CODING_ISO_FLAG_COMPOSITION)
5808         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5809       if (flags & CODING_ISO_FLAG_DESIGNATION)
5810         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5811       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5812         {
5813           setup_iso_safe_charsets (attrs);
5814           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5815           coding->max_charset_id = SCHARS (val) - 1;
5816           coding->safe_charsets = SDATA (val);
5817         }
5818       CODING_ISO_FLAGS (coding) = flags;
5819       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5820       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5821       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5822       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5823     }
5824   else if (EQ (coding_type, Qcharset))
5825     {
5826       coding->detector = detect_coding_charset;
5827       coding->decoder = decode_coding_charset;
5828       coding->encoder = encode_coding_charset;
5829       coding->common_flags
5830         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5831     }
5832   else if (EQ (coding_type, Qutf_8))
5833     {
5834       val = AREF (attrs, coding_attr_utf_bom);
5835       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5836                                    : EQ (val, Qt) ? utf_with_bom
5837                                    : utf_without_bom);
5838       coding->detector = detect_coding_utf_8;
5839       coding->decoder = decode_coding_utf_8;
5840       coding->encoder = encode_coding_utf_8;
5841       coding->common_flags
5842         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5843       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5844         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5845     }
5846   else if (EQ (coding_type, Qutf_16))
5847     {
5848       val = AREF (attrs, coding_attr_utf_bom);
5849       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5850                                     : EQ (val, Qt) ? utf_with_bom
5851                                     : utf_without_bom);
5852       val = AREF (attrs, coding_attr_utf_16_endian);
5853       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5854                                        : utf_16_little_endian);
5855       CODING_UTF_16_SURROGATE (coding) = 0;
5856       coding->detector = detect_coding_utf_16;
5857       coding->decoder = decode_coding_utf_16;
5858       coding->encoder = encode_coding_utf_16;
5859       coding->common_flags
5860         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5861       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5862         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5863     }
5864   else if (EQ (coding_type, Qccl))
5865     {
5866       coding->detector = detect_coding_ccl;
5867       coding->decoder = decode_coding_ccl;
5868       coding->encoder = encode_coding_ccl;
5869       coding->common_flags
5870         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5871             | CODING_REQUIRE_FLUSHING_MASK);
5872     }
5873   else if (EQ (coding_type, Qemacs_mule))
5874     {
5875       coding->detector = detect_coding_emacs_mule;
5876       coding->decoder = decode_coding_emacs_mule;
5877       coding->encoder = encode_coding_emacs_mule;
5878       coding->common_flags
5879         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5880       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5881           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5882         {
5883           Lisp_Object tail, safe_charsets;
5884           int max_charset_id = 0;
5885
5886           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5887                tail = XCDR (tail))
5888             if (max_charset_id < XFASTINT (XCAR (tail)))
5889               max_charset_id = XFASTINT (XCAR (tail));
5890           safe_charsets = make_uninit_string (max_charset_id + 1);
5891           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5892           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5893                tail = XCDR (tail))
5894             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5895           coding->max_charset_id = max_charset_id;
5896           coding->safe_charsets = SDATA (safe_charsets);
5897         }
5898       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5899       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5900     }
5901   else if (EQ (coding_type, Qshift_jis))
5902     {
5903       coding->detector = detect_coding_sjis;
5904       coding->decoder = decode_coding_sjis;
5905       coding->encoder = encode_coding_sjis;
5906       coding->common_flags
5907         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5908     }
5909   else if (EQ (coding_type, Qbig5))
5910     {
5911       coding->detector = detect_coding_big5;
5912       coding->decoder = decode_coding_big5;
5913       coding->encoder = encode_coding_big5;
5914       coding->common_flags
5915         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5916     }
5917   else                          /* EQ (coding_type, Qraw_text) */
5918     {
5919       coding->detector = NULL;
5920       coding->decoder = decode_coding_raw_text;
5921       coding->encoder = encode_coding_raw_text;
5922       if (! EQ (eol_type, Qunix))
5923         {
5924           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5925           if (! VECTORP (eol_type))
5926             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5927         }
5928
5929     }
5930
5931   return;
5932 }
5933
5934 /* Return a list of charsets supported by CODING.  */
5935
5936 Lisp_Object
5937 coding_charset_list (struct coding_system *coding)
5938 {
5939   Lisp_Object attrs, charset_list;
5940
5941   CODING_GET_INFO (coding, attrs, charset_list);
5942   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5943     {
5944       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5945
5946       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5947         charset_list = Viso_2022_charset_list;
5948     }
5949   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5950     {
5951       charset_list = Vemacs_mule_charset_list;
5952     }
5953   return charset_list;
5954 }
5955
5956
5957 /* Return a list of charsets supported by CODING-SYSTEM.  */
5958
5959 Lisp_Object
5960 coding_system_charset_list (Lisp_Object coding_system)
5961 {
5962   ptrdiff_t id;
5963   Lisp_Object attrs, charset_list;
5964
5965   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5966   attrs = CODING_ID_ATTRS (id);
5967
5968   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5969     {
5970       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5971
5972       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5973         charset_list = Viso_2022_charset_list;
5974       else
5975         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5976     }
5977   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5978     {
5979       charset_list = Vemacs_mule_charset_list;
5980     }
5981   else
5982     {
5983       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5984     }
5985   return charset_list;
5986 }
5987
5988
5989 /* Return raw-text or one of its subsidiaries that has the same
5990    eol_type as CODING-SYSTEM.  */
5991
5992 Lisp_Object
5993 raw_text_coding_system (Lisp_Object coding_system)
5994 {
5995   Lisp_Object spec, attrs;
5996   Lisp_Object eol_type, raw_text_eol_type;
5997
5998   if (NILP (coding_system))
5999     return Qraw_text;
6000   spec = CODING_SYSTEM_SPEC (coding_system);
6001   attrs = AREF (spec, 0);
6002
6003   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6004     return coding_system;
6005
6006   eol_type = AREF (spec, 2);
6007   if (VECTORP (eol_type))
6008     return Qraw_text;
6009   spec = CODING_SYSTEM_SPEC (Qraw_text);
6010   raw_text_eol_type = AREF (spec, 2);
6011   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6012           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6013           : AREF (raw_text_eol_type, 2));
6014 }
6015
6016
6017 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
6018    the subsidiary that has the same eol-spec as PARENT (if it is not
6019    nil and specifies end-of-line format) or the system's setting
6020    (system_eol_type).  */
6021
6022 Lisp_Object
6023 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
6024 {
6025   Lisp_Object spec, eol_type;
6026
6027   if (NILP (coding_system))
6028     coding_system = Qraw_text;
6029   spec = CODING_SYSTEM_SPEC (coding_system);
6030   eol_type = AREF (spec, 2);
6031   if (VECTORP (eol_type))
6032     {
6033       Lisp_Object parent_eol_type;
6034
6035       if (! NILP (parent))
6036         {
6037           Lisp_Object parent_spec;
6038
6039           parent_spec = CODING_SYSTEM_SPEC (parent);
6040           parent_eol_type = AREF (parent_spec, 2);
6041           if (VECTORP (parent_eol_type))
6042             parent_eol_type = system_eol_type;
6043         }
6044       else
6045         parent_eol_type = system_eol_type;
6046       if (EQ (parent_eol_type, Qunix))
6047         coding_system = AREF (eol_type, 0);
6048       else if (EQ (parent_eol_type, Qdos))
6049         coding_system = AREF (eol_type, 1);
6050       else if (EQ (parent_eol_type, Qmac))
6051         coding_system = AREF (eol_type, 2);
6052     }
6053   return coding_system;
6054 }
6055
6056
6057 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
6058    decided for writing to a process.  If not, complement them, and
6059    return a new coding system.  */
6060
6061 Lisp_Object
6062 complement_process_encoding_system (Lisp_Object coding_system)
6063 {
6064   Lisp_Object coding_base = Qnil, eol_base = Qnil;
6065   Lisp_Object spec, attrs;
6066   int i;
6067
6068   for (i = 0; i < 3; i++)
6069     {
6070       if (i == 1)
6071         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6072       else if (i == 2)
6073         coding_system = preferred_coding_system ();
6074       spec = CODING_SYSTEM_SPEC (coding_system);
6075       if (NILP (spec))
6076         continue;
6077       attrs = AREF (spec, 0);
6078       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6079         coding_base = CODING_ATTR_BASE_NAME (attrs);
6080       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6081         eol_base = coding_system;
6082       if (! NILP (coding_base) && ! NILP (eol_base))
6083         break;
6084     }
6085
6086   if (i > 0)
6087     /* The original CODING_SYSTEM didn't specify text-conversion or
6088        eol-conversion.  Be sure that we return a fully complemented
6089        coding system.  */
6090     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6091   return coding_system;
6092 }
6093
6094
6095 /* Emacs has a mechanism to automatically detect a coding system if it
6096    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6097    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are classified into the following categories:
6100
6101    o coding-category-emacs-mule
6102
6103         The category for a coding system which has the same code range
6104         as Emacs' internal format.  Assigned the coding-system (Lisp
6105         symbol) `emacs-mule' by default.
6106
6107    o coding-category-sjis
6108
6109         The category for a coding system which has the same code range
6110         as SJIS.  Assigned the coding-system (Lisp
6111         symbol) `japanese-shift-jis' by default.
6112
6113    o coding-category-iso-7
6114
6115         The category for a coding system which has the same code range
6116         as ISO2022 of 7-bit environment.  This doesn't use any locking
6117         shift and single shift functions.  This can encode/decode all
6118         charsets.  Assigned the coding-system (Lisp symbol)
6119         `iso-2022-7bit' by default.
6120
6121    o coding-category-iso-7-tight
6122
6123         Same as coding-category-iso-7 except that this can
6124         encode/decode only the specified charsets.
6125
6126    o coding-category-iso-8-1
6127
6128         The category for a coding system which has the same code range
6129         as ISO2022 of 8-bit environment and graphic plane 1 used only
6130         for DIMENSION1 charset.  This doesn't use any locking shift
6131         and single shift functions.  Assigned the coding-system (Lisp
6132         symbol) `iso-latin-1' by default.
6133
6134    o coding-category-iso-8-2
6135
6136         The category for a coding system which has the same code range
6137         as ISO2022 of 8-bit environment and graphic plane 1 used only
6138         for DIMENSION2 charset.  This doesn't use any locking shift
6139         and single shift functions.  Assigned the coding-system (Lisp
6140         symbol) `japanese-iso-8bit' by default.
6141
6142    o coding-category-iso-7-else
6143
6144         The category for a coding system which has the same code range
6145         as ISO2022 of 7-bit environment but uses locking shift or
6146         single shift functions.  Assigned the coding-system (Lisp
6147         symbol) `iso-2022-7bit-lock' by default.
6148
6149    o coding-category-iso-8-else
6150
6151         The category for a coding system which has the same code range
6152         as ISO2022 of 8-bit environment but uses locking shift or
6153         single shift functions.  Assigned the coding-system (Lisp
6154         symbol) `iso-2022-8bit-ss2' by default.
6155
6156    o coding-category-big5
6157
6158         The category for a coding system which has the same code range
6159         as BIG5.  Assigned the coding-system (Lisp symbol)
6160         `cn-big5' by default.
6161
6162    o coding-category-utf-8
6163
6164         The category for a coding system which has the same code range
6165         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6166         symbol) `utf-8' by default.
6167
6168    o coding-category-utf-16-be
6169
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in big-endian byte
        order at the head.  Assigned the coding-system (Lisp symbol)
        `utf-16-be' by default.
6174
6175    o coding-category-utf-16-le
6176
        The category for a coding system in which a text has a
        Unicode signature (cf. Unicode Standard) in little-endian
        byte order at the head.  Assigned the coding-system (Lisp
        symbol) `utf-16-le' by default.
6181
6182    o coding-category-ccl
6183
6184         The category for a coding system of which encoder/decoder is
6185         written in CCL programs.  The default value is nil, i.e., no
6186         coding system is assigned.
6187
6188    o coding-category-binary
6189
6190         The category for a coding system not categorized in any of the
6191         above.  Assigned the coding-system (Lisp symbol)
6192         `no-conversion' by default.
6193
   Each of them is a Lisp symbol whose value is an actual
   `coding-system' (also a Lisp symbol) assigned by a user.  What
   Emacs actually does is detect a category of coding system and
   then use the `coding-system' assigned to it.  If Emacs can't
   narrow the possibilities down to a single category, it selects
   the category of the highest priority.  Priorities of categories
   are also specified by a user in the Lisp variable
   `coding-category-list'.

*/
6201
6202 */
6203
6204 static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
6205                                            int eol_seen);
6206
6207
6208 /* Return the number of ASCII characters at the head of the source.
6209    By side effects, set coding->head_ascii and update
6210    coding->eol_seen.  The value of coding->eol_seen is "logical or" of
6211    EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is
6212    reliable only when all the source bytes are ASCII.  */
6213
6214 static int
6215 check_ascii (struct coding_system *coding)
6216 {
6217   const unsigned char *src, *end;
6218   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6219   int eol_seen = coding->eol_seen;
6220
6221   coding_set_source (coding);
6222   src = coding->source;
6223   end = src + coding->src_bytes;
6224
6225   if (inhibit_eol_conversion
6226       || SYMBOLP (eol_type))
6227     {
6228       /* We don't have to check EOL format.  */
6229       while (src < end && !( *src & 0x80))
6230         {
6231           if (*src++ == '\n')
6232             eol_seen |= EOL_SEEN_LF;
6233         }
6234     }
6235   else
6236     {
6237       end--;                /* We look ahead one byte for "CR LF".  */
6238       while (src < end)
6239         {
6240           int c = *src;
6241
6242           if (c & 0x80)
6243             break;
6244           src++;
6245           if (c == '\r')
6246             {
6247               if (*src == '\n')
6248                 {
6249                   eol_seen |= EOL_SEEN_CRLF;
6250                   src++;
6251                 }
6252               else
6253                 eol_seen |= EOL_SEEN_CR;
6254             }
6255           else if (c == '\n')
6256             eol_seen |= EOL_SEEN_LF;
6257         }
6258       if (src == end)
6259         {
6260           int c = *src;
6261
6262           /* All bytes but the last one C are ASCII.  */
6263           if (! (c & 0x80))
6264             {
6265               if (c == '\r')
6266                 eol_seen |= EOL_SEEN_CR;
6267               else if (c  == '\n')
6268                 eol_seen |= EOL_SEEN_LF;
6269               src++;
6270             }
6271         }
6272     }
6273   coding->head_ascii = src - coding->source;
6274   coding->eol_seen = eol_seen;
6275   return (coding->head_ascii);
6276 }
6277
6278
6279 /* Return the number of characters at the source if all the bytes are
6280    valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
6281    effects, update coding->eol_seen.  The value of coding->eol_seen is
6282    "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
6283    the value is reliable only when all the source bytes are valid
6284    UTF-8.  */
6285
6286 static int
6287 check_utf_8 (struct coding_system *coding)
6288 {
6289   const unsigned char *src, *end;
6290   int eol_seen;
6291   int nchars = coding->head_ascii;
6292
6293   if (coding->head_ascii < 0)
6294     check_ascii (coding);
6295   else
6296     coding_set_source (coding);
6297   src = coding->source + coding->head_ascii;
6298   /* We look ahead one byte for CR LF.  */
6299   end = coding->source + coding->src_bytes - 1;
6300   eol_seen = coding->eol_seen;
6301   while (src < end)
6302     {
6303       int c = *src;
6304
6305       if (UTF_8_1_OCTET_P (*src))
6306         {
6307           src++;
6308           if (c < 0x20)
6309             {
6310               if (c == '\r')
6311                 {
6312                   if (*src == '\n')
6313                     {
6314                       eol_seen |= EOL_SEEN_CRLF;
6315                       src++;
6316                       nchars++;
6317                     }
6318                   else
6319                     eol_seen |= EOL_SEEN_CR;
6320                 }
6321               else if (c == '\n')
6322                 eol_seen |= EOL_SEEN_LF;
6323             }
6324         }
6325       else if (UTF_8_2_OCTET_LEADING_P (c))
6326         {
6327           if (c < 0xC2          /* overlong sequence */
6328               || src + 1 >= end
6329               || ! UTF_8_EXTRA_OCTET_P (src[1]))
6330             return -1;
6331           src += 2;
6332         }
6333       else if (UTF_8_3_OCTET_LEADING_P (c))
6334         {
6335           if (src + 2 >= end
6336               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6337                     && UTF_8_EXTRA_OCTET_P (src[2])))
6338             return -1;
6339           c = (((c & 0xF) << 12)
6340                | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
6341           if (c < 0x800                       /* overlong sequence */
6342               || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
6343             return -1;
6344           src += 3;
6345         }
6346       else if (UTF_8_4_OCTET_LEADING_P (c))
6347         {
6348           if (src + 3 >= end
6349               || ! (UTF_8_EXTRA_OCTET_P (src[1])
6350                     && UTF_8_EXTRA_OCTET_P (src[2])
6351                     && UTF_8_EXTRA_OCTET_P (src[3])))
6352             return -1;
6353           c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
6354                | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
6355           if (c < 0x10000       /* overlong sequence */
6356               || c >= 0x110000) /* non-Unicode character  */
6357             return -1;
6358           src += 4;
6359         }
6360       else
6361         return -1;
6362       nchars++;
6363     }
6364
6365   if (src == end)
6366     {
6367       if (! UTF_8_1_OCTET_P (*src))
6368         return -1;
6369       nchars++;
6370       if (*src == '\r')
6371         eol_seen |= EOL_SEEN_CR;
6372       else if (*src  == '\n')
6373         eol_seen |= EOL_SEEN_LF;
6374     }
6375   coding->eol_seen = eol_seen;
6376   return nchars;
6377 }
6378
6379
6380 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6381    SOURCE is encoded.  If CATEGORY is one of
6382    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6383    two-byte, else they are encoded by one-byte.
6384
6385    Return one of EOL_SEEN_XXX.  */
6386
6387 #define MAX_EOL_CHECK_COUNT 3
6388
6389 static int
6390 detect_eol (const unsigned char *source, ptrdiff_t src_bytes,
6391             enum coding_category category)
6392 {
6393   const unsigned char *src = source, *src_end = src + src_bytes;
6394   unsigned char c;
6395   int total  = 0;
6396   int eol_seen = EOL_SEEN_NONE;
6397
6398   if ((1 << category) & CATEGORY_MASK_UTF_16)
6399     {
6400       bool msb = category == (coding_category_utf_16_le
6401                               | coding_category_utf_16_le_nosig);
6402       bool lsb = !msb;
6403
6404       while (src + 1 < src_end)
6405         {
6406           c = src[lsb];
6407           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6408             {
6409               int this_eol;
6410
6411               if (c == '\n')
6412                 this_eol = EOL_SEEN_LF;
6413               else if (src + 3 >= src_end
6414                        || src[msb + 2] != 0
6415                        || src[lsb + 2] != '\n')
6416                 this_eol = EOL_SEEN_CR;
6417               else
6418                 {
6419                   this_eol = EOL_SEEN_CRLF;
6420                   src += 2;
6421                 }
6422
6423               if (eol_seen == EOL_SEEN_NONE)
6424                 /* This is the first end-of-line.  */
6425                 eol_seen = this_eol;
6426               else if (eol_seen != this_eol)
6427                 {
6428                   /* The found type is different from what found before.
6429                      Allow for stray ^M characters in DOS EOL files.  */
6430                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6431                       || (eol_seen == EOL_SEEN_CRLF
6432                           && this_eol == EOL_SEEN_CR))
6433                     eol_seen = EOL_SEEN_CRLF;
6434                   else
6435                     {
6436                       eol_seen = EOL_SEEN_LF;
6437                       break;
6438                     }
6439                 }
6440               if (++total == MAX_EOL_CHECK_COUNT)
6441                 break;
6442             }
6443           src += 2;
6444         }
6445     }
6446   else
6447     while (src < src_end)
6448       {
6449         c = *src++;
6450         if (c == '\n' || c == '\r')
6451           {
6452             int this_eol;
6453
6454             if (c == '\n')
6455               this_eol = EOL_SEEN_LF;
6456             else if (src >= src_end || *src != '\n')
6457               this_eol = EOL_SEEN_CR;
6458             else
6459               this_eol = EOL_SEEN_CRLF, src++;
6460
6461             if (eol_seen == EOL_SEEN_NONE)
6462               /* This is the first end-of-line.  */
6463               eol_seen = this_eol;
6464             else if (eol_seen != this_eol)
6465               {
6466                 /* The found type is different from what found before.
6467                    Allow for stray ^M characters in DOS EOL files.  */
6468                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6469                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6470                   eol_seen = EOL_SEEN_CRLF;
6471                 else
6472                   {
6473                     eol_seen = EOL_SEEN_LF;
6474                     break;
6475                   }
6476               }
6477             if (++total == MAX_EOL_CHECK_COUNT)
6478               break;
6479           }
6480       }
6481   return eol_seen;
6482 }
6483
6484
6485 static Lisp_Object
6486 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6487 {
6488   Lisp_Object eol_type;
6489
6490   eol_type = CODING_ID_EOL_TYPE (coding->id);
6491   if (! VECTORP (eol_type))
6492     /* Already adjusted.  */
6493     return eol_type;
6494   if (eol_seen & EOL_SEEN_LF)
6495     {
6496       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6497       eol_type = Qunix;
6498     }
6499   else if (eol_seen & EOL_SEEN_CRLF)
6500     {
6501       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6502       eol_type = Qdos;
6503     }
6504   else if (eol_seen & EOL_SEEN_CR)
6505     {
6506       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6507       eol_type = Qmac;
6508     }
6509   return eol_type;
6510 }
6511
6512 /* Detect how a text specified in CODING is encoded.  If a coding
6513    system is detected, update fields of CODING by the detected coding
6514    system.  */
6515
6516 static void
6517 detect_coding (struct coding_system *coding)
6518 {
6519   const unsigned char *src, *src_end;
6520   unsigned int saved_mode = coding->mode;
6521   Lisp_Object found = Qnil;
6522   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
6523
6524   coding->consumed = coding->consumed_char = 0;
6525   coding->produced = coding->produced_char = 0;
6526   coding_set_source (coding);
6527
6528   src_end = coding->source + coding->src_bytes;
6529
6530   coding->eol_seen = EOL_SEEN_NONE;
6531   /* If we have not yet decided the text encoding type, detect it
6532      now.  */
6533   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6534     {
6535       int c, i;
6536       struct coding_detection_info detect_info;
6537       bool null_byte_found = 0, eight_bit_found = 0;
6538       bool inhibit_nbd = inhibit_flag (coding->spec.undecided.inhibit_nbd,
6539                                        inhibit_null_byte_detection);
6540       bool inhibit_ied = inhibit_flag (coding->spec.undecided.inhibit_ied,
6541                                        inhibit_iso_escape_detection);
6542       bool prefer_utf_8 = coding->spec.undecided.prefer_utf_8;
6543
6544       coding->head_ascii = 0;
6545       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6546       for (src = coding->source; src < src_end; src++)
6547         {
6548           c = *src;
6549           if (c & 0x80)
6550             {
6551               eight_bit_found = 1;
6552               if (null_byte_found)
6553                 break;
6554             }
6555           else if (c < 0x20)
6556             {
6557               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6558                   && ! inhibit_ied
6559                   && ! detect_info.checked)
6560                 {
6561                   if (detect_coding_iso_2022 (coding, &detect_info))
6562                     {
6563                       /* We have scanned the whole data.  */
6564                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6565                         {
6566                           /* We didn't find an 8-bit code.  We may
6567                              have found a null-byte, but it's very
6568                              rare that a binary file conforms to
6569                              ISO-2022.  */
6570                           src = src_end;
6571                           coding->head_ascii = src - coding->source;
6572                         }
6573                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6574                       break;
6575                     }
6576                 }
6577               else if (! c && !inhibit_nbd)
6578                 {
6579                   null_byte_found = 1;
6580                   if (eight_bit_found)
6581                     break;
6582                 }
6583               else if (! disable_ascii_optimization
6584                        && ! inhibit_eol_conversion)
6585                 {
6586                   if (c == '\r')
6587                     {
6588                       if (src < src_end && src[1] == '\n')
6589                         {
6590                           coding->eol_seen |= EOL_SEEN_CRLF;
6591                           src++;
6592                           if (! eight_bit_found)
6593                             coding->head_ascii++;
6594                         }
6595                       else
6596                         coding->eol_seen |= EOL_SEEN_CR;
6597                     }
6598                   else if (c == '\n')
6599                     {
6600                       coding->eol_seen |= EOL_SEEN_LF;
6601                     }
6602                 }
6603
6604               if (! eight_bit_found)
6605                 coding->head_ascii++;
6606             }
6607           else if (! eight_bit_found)
6608             coding->head_ascii++;
6609         }
6610
6611       if (null_byte_found || eight_bit_found
6612           || coding->head_ascii < coding->src_bytes
6613           || detect_info.found)
6614         {
6615           enum coding_category category;
6616           struct coding_system *this;
6617
6618           if (coding->head_ascii == coding->src_bytes)
6619             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6620             for (i = 0; i < coding_category_raw_text; i++)
6621               {
6622                 category = coding_priorities[i];
6623                 this = coding_categories + category;
6624                 if (detect_info.found & (1 << category))
6625                   break;
6626               }
6627           else
6628             {
6629               if (null_byte_found)
6630                 {
6631                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6632                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6633                 }
6634               else if (prefer_utf_8
6635                        && detect_coding_utf_8 (coding, &detect_info))
6636                 {
6637                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
6638                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
6639                 }
6640               for (i = 0; i < coding_category_raw_text; i++)
6641                 {
6642                   category = coding_priorities[i];
6643                   this = coding_categories + category;
6644                   /* Some of this->detector (e.g. detect_coding_sjis)
6645                      require this information.  */
6646                   coding->id = this->id;
6647                   if (this->id < 0)
6648                     {
6649                       /* No coding system of this category is defined.  */
6650                       detect_info.rejected |= (1 << category);
6651                     }
6652                   else if (category >= coding_category_raw_text)
6653                     continue;
6654                   else if (detect_info.checked & (1 << category))
6655                     {
6656                       if (detect_info.found & (1 << category))
6657                         break;
6658                     }
6659                   else if ((*(this->detector)) (coding, &detect_info)
6660                            && detect_info.found & (1 << category))
6661                     break;
6662                 }
6663             }
6664
6665           if (i < coding_category_raw_text)
6666             {
6667               if (category == coding_category_utf_8_auto)
6668                 {
6669                   Lisp_Object coding_systems;
6670
6671                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6672                                          coding_attr_utf_bom);
6673                   if (CONSP (coding_systems))
6674                     {
6675                       if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6676                         found = XCAR (coding_systems);
6677                       else
6678                         found = XCDR (coding_systems);
6679                     }
6680                   else
6681                     found = CODING_ID_NAME (this->id);
6682                 }
6683               else if (category == coding_category_utf_16_auto)
6684                 {
6685                   Lisp_Object coding_systems;
6686
6687                   coding_systems = AREF (CODING_ID_ATTRS (this->id),
6688                                          coding_attr_utf_bom);
6689                   if (CONSP (coding_systems))
6690                     {
6691                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6692                         found = XCAR (coding_systems);
6693                       else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6694                         found = XCDR (coding_systems);
6695                     }
6696                   else
6697                     found = CODING_ID_NAME (this->id);
6698                 }
6699               else
6700                 found = CODING_ID_NAME (this->id);
6701             }
6702           else if (null_byte_found)
6703             found = Qno_conversion;
6704           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6705                    == CATEGORY_MASK_ANY)
6706             found = Qraw_text;
6707           else if (detect_info.rejected)
6708             for (i = 0; i < coding_category_raw_text; i++)
6709               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6710                 {
6711                   this = coding_categories + coding_priorities[i];
6712                   found = CODING_ID_NAME (this->id);
6713                   break;
6714                 }
6715         }
6716     }
6717   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6718            == coding_category_utf_8_auto)
6719     {
6720       Lisp_Object coding_systems;
6721       struct coding_detection_info detect_info;
6722
6723       coding_systems
6724         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6725       detect_info.found = detect_info.rejected = 0;
6726       if (check_ascii (coding) == coding->src_bytes)
6727         {
6728           if (CONSP (coding_systems))
6729             found = XCDR (coding_systems);
6730         }
6731       else
6732         {
6733           if (CONSP (coding_systems)
6734               && detect_coding_utf_8 (coding, &detect_info))
6735             {
6736               if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6737                 found = XCAR (coding_systems);
6738               else
6739                 found = XCDR (coding_systems);
6740             }
6741         }
6742     }
6743   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6744            == coding_category_utf_16_auto)
6745     {
6746       Lisp_Object coding_systems;
6747       struct coding_detection_info detect_info;
6748
6749       coding_systems
6750         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6751       detect_info.found = detect_info.rejected = 0;
6752       coding->head_ascii = 0;
6753       if (CONSP (coding_systems)
6754           && detect_coding_utf_16 (coding, &detect_info))
6755         {
6756           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6757             found = XCAR (coding_systems);
6758           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6759             found = XCDR (coding_systems);
6760         }
6761     }
6762
6763   if (! NILP (found))
6764     {
6765       int specified_eol = (VECTORP (eol_type) ? EOL_SEEN_NONE
6766                            : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
6767                            : EQ (eol_type, Qmac) ? EOL_SEEN_CR
6768                            : EOL_SEEN_LF);
6769
6770       setup_coding_system (found, coding);
6771       if (specified_eol != EOL_SEEN_NONE)
6772         adjust_coding_eol_type (coding, specified_eol);
6773     }
6774
6775   coding->mode = saved_mode;
6776 }
6777
6778
6779 static void
6780 decode_eol (struct coding_system *coding)
6781 {
6782   Lisp_Object eol_type;
6783   unsigned char *p, *pbeg, *pend;
6784
6785   eol_type = CODING_ID_EOL_TYPE (coding->id);
6786   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6787     return;
6788
6789   if (NILP (coding->dst_object))
6790     pbeg = coding->destination;
6791   else
6792     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6793   pend = pbeg + coding->produced;
6794
6795   if (VECTORP (eol_type))
6796     {
6797       int eol_seen = EOL_SEEN_NONE;
6798
6799       for (p = pbeg; p < pend; p++)
6800         {
6801           if (*p == '\n')
6802             eol_seen |= EOL_SEEN_LF;
6803           else if (*p == '\r')
6804             {
6805               if (p + 1 < pend && *(p + 1) == '\n')
6806                 {
6807                   eol_seen |= EOL_SEEN_CRLF;
6808                   p++;
6809                 }
6810               else
6811                 eol_seen |= EOL_SEEN_CR;
6812             }
6813         }
6814       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6815       if ((eol_seen & EOL_SEEN_CRLF) != 0
6816           && (eol_seen & EOL_SEEN_CR) != 0
6817           && (eol_seen & EOL_SEEN_LF) == 0)
6818         eol_seen = EOL_SEEN_CRLF;
6819       else if (eol_seen != EOL_SEEN_NONE
6820           && eol_seen != EOL_SEEN_LF
6821           && eol_seen != EOL_SEEN_CRLF
6822           && eol_seen != EOL_SEEN_CR)
6823         eol_seen = EOL_SEEN_LF;
6824       if (eol_seen != EOL_SEEN_NONE)
6825         eol_type = adjust_coding_eol_type (coding, eol_seen);
6826     }
6827
6828   if (EQ (eol_type, Qmac))
6829     {
6830       for (p = pbeg; p < pend; p++)
6831         if (*p == '\r')
6832           *p = '\n';
6833     }
6834   else if (EQ (eol_type, Qdos))
6835     {
6836       ptrdiff_t n = 0;
6837
6838       if (NILP (coding->dst_object))
6839         {
6840           /* Start deleting '\r' from the tail to minimize the memory
6841              movement.  */
6842           for (p = pend - 2; p >= pbeg; p--)
6843             if (*p == '\r')
6844               {
6845                 memmove (p, p + 1, pend-- - p - 1);
6846                 n++;
6847               }
6848         }
6849       else
6850         {
6851           ptrdiff_t pos_byte = coding->dst_pos_byte;
6852           ptrdiff_t pos = coding->dst_pos;
6853           ptrdiff_t pos_end = pos + coding->produced_char - 1;
6854
6855           while (pos < pos_end)
6856             {
6857               p = BYTE_POS_ADDR (pos_byte);
6858               if (*p == '\r' && p[1] == '\n')
6859                 {
6860                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6861                   n++;
6862                   pos_end--;
6863                 }
6864               pos++;
6865               if (coding->dst_multibyte)
6866                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6867               else
6868                 pos_byte++;
6869             }
6870         }
6871       coding->produced -= n;
6872       coding->produced_char -= n;
6873     }
6874 }
6875
6876
6877 /* Return a translation table (or list of them) from coding system
6878    attribute vector ATTRS for encoding (if ENCODEP) or decoding (if
6879    not ENCODEP). */
6880
6881 static Lisp_Object
6882 get_translation_table (Lisp_Object attrs, bool encodep, int *max_lookup)
6883 {
6884   Lisp_Object standard, translation_table;
6885   Lisp_Object val;
6886
6887   if (NILP (Venable_character_translation))
6888     {
6889       if (max_lookup)
6890         *max_lookup = 0;
6891       return Qnil;
6892     }
6893   if (encodep)
6894     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6895       standard = Vstandard_translation_table_for_encode;
6896   else
6897     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6898       standard = Vstandard_translation_table_for_decode;
6899   if (NILP (translation_table))
6900     translation_table = standard;
6901   else
6902     {
6903       if (SYMBOLP (translation_table))
6904         translation_table = Fget (translation_table, Qtranslation_table);
6905       else if (CONSP (translation_table))
6906         {
6907           translation_table = Fcopy_sequence (translation_table);
6908           for (val = translation_table; CONSP (val); val = XCDR (val))
6909             if (SYMBOLP (XCAR (val)))
6910               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6911         }
6912       if (CHAR_TABLE_P (standard))
6913         {
6914           if (CONSP (translation_table))
6915             translation_table = nconc2 (translation_table, list1 (standard));
6916           else
6917             translation_table = list2 (translation_table, standard);
6918         }
6919     }
6920
6921   if (max_lookup)
6922     {
6923       *max_lookup = 1;
6924       if (CHAR_TABLE_P (translation_table)
6925           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6926         {
6927           val = XCHAR_TABLE (translation_table)->extras[1];
6928           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6929             *max_lookup = XFASTINT (val);
6930         }
6931       else if (CONSP (translation_table))
6932         {
6933           Lisp_Object tail;
6934
6935           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6936             if (CHAR_TABLE_P (XCAR (tail))
6937                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6938               {
6939                 Lisp_Object tailval = XCHAR_TABLE (XCAR (tail))->extras[1];
6940                 if (NATNUMP (tailval) && *max_lookup < XFASTINT (tailval))
6941                   *max_lookup = XFASTINT (tailval);
6942               }
6943         }
6944     }
6945   return translation_table;
6946 }
6947
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables; anything else leaves C alone and sets TRANS to Qnil.

   C and TRANS must be lvalues.  Whenever a lookup yields a character,
   C is replaced by it and TRANS is reset to Qnil; with a list, the
   remaining tables are then consulted with the new C.  When a lookup
   yields a non-nil non-character value, it is left in TRANS (and, for
   a list, the search stops there).

   TABLE is evaluated exactly once, after TRANS has been reset.  The
   internal temporaries carry a trailing underscore so that they do
   not capture identifiers used in the arguments.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_table_;                                          \
                                                                        \
    (trans) = Qnil;                                                     \
    lookup_table_ = (table);                                            \
    if (CHAR_TABLE_P (lookup_table_))                                   \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (lookup_table_, (c));                  \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (lookup_table_))                                     \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = lookup_table_; CONSP (lookup_tail_);        \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6972
6973
6974 /* Return a translation of character(s) at BUF according to TRANS.
6975    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6976    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6977    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6978    translation is found, and Qnil if not found..
6979    If BUF is too short to lookup characters in FROM, return Qt.  */
6980
6981 static Lisp_Object
6982 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6983 {
6984
6985   if (INTEGERP (trans))
6986     return trans;
6987   for (; CONSP (trans); trans = XCDR (trans))
6988     {
6989       Lisp_Object val = XCAR (trans);
6990       Lisp_Object from = XCAR (val);
6991       ptrdiff_t len = ASIZE (from);
6992       ptrdiff_t i;
6993
6994       for (i = 0; i < len; i++)
6995         {
6996           if (buf + i == buf_end)
6997             return Qt;
6998           if (XINT (AREF (from, i)) != buf[i])
6999             break;
7000         }
7001       if (i == len)
7002         return val;
7003     }
7004   return Qnil;
7005 }
7006
7007
7008 static int
7009 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
7010                bool last_block)
7011 {
7012   unsigned char *dst = coding->destination + coding->produced;
7013   unsigned char *dst_end = coding->destination + coding->dst_bytes;
7014   ptrdiff_t produced;
7015   ptrdiff_t produced_chars = 0;
7016   int carryover = 0;
7017
7018   if (! coding->chars_at_source)
7019     {
7020       /* Source characters are in coding->charbuf.  */
7021       int *buf = coding->charbuf;
7022       int *buf_end = buf + coding->charbuf_used;
7023
7024       if (EQ (coding->src_object, coding->dst_object))
7025         {
7026           coding_set_source (coding);
7027           dst_end = ((unsigned char *) coding->source) + coding->consumed;
7028         }
7029
7030       while (buf < buf_end)
7031         {
7032           int c = *buf;
7033           ptrdiff_t i;
7034
7035           if (c >= 0)
7036             {
7037               ptrdiff_t from_nchars = 1, to_nchars = 1;
7038               Lisp_Object trans = Qnil;
7039
7040               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7041               if (! NILP (trans))
7042                 {
7043                   trans = get_translation (trans, buf, buf_end);
7044                   if (INTEGERP (trans))
7045                     c = XINT (trans);
7046                   else if (CONSP (trans))
7047                     {
7048                       from_nchars = ASIZE (XCAR (trans));
7049                       trans = XCDR (trans);
7050                       if (INTEGERP (trans))
7051                         c = XINT (trans);
7052                       else
7053                         {
7054                           to_nchars = ASIZE (trans);
7055                           c = XINT (AREF (trans, 0));
7056                         }
7057                     }
7058                   else if (EQ (trans, Qt) && ! last_block)
7059                     break;
7060                 }
7061
7062               if ((dst_end - dst) / MAX_MULTIBYTE_LENGTH < to_nchars)
7063                 {
7064                   if (((min (PTRDIFF_MAX, SIZE_MAX) - (buf_end - buf))
7065                        / MAX_MULTIBYTE_LENGTH)
7066                       < to_nchars)
7067                     memory_full (SIZE_MAX);
7068                   dst = alloc_destination (coding,
7069                                            buf_end - buf
7070                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
7071                                            dst);
7072                   if (EQ (coding->src_object, coding->dst_object))
7073                     {
7074                       coding_set_source (coding);
7075                       dst_end = (((unsigned char *) coding->source)
7076                                  + coding->consumed);
7077                     }
7078                   else
7079                     dst_end = coding->destination + coding->dst_bytes;
7080                 }
7081
7082               for (i = 0; i < to_nchars; i++)
7083                 {
7084                   if (i > 0)
7085                     c = XINT (AREF (trans, i));
7086                   if (coding->dst_multibyte
7087                       || ! CHAR_BYTE8_P (c))
7088                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
7089                   else
7090                     *dst++ = CHAR_TO_BYTE8 (c);
7091                 }
7092               produced_chars += to_nchars;
7093               buf += from_nchars;
7094             }
7095           else
7096             /* This is an annotation datum.  (-C) is the length.  */
7097             buf += -c;
7098         }
7099       carryover = buf_end - buf;
7100     }
7101   else
7102     {
7103       /* Source characters are at coding->source.  */
7104       const unsigned char *src = coding->source;
7105       const unsigned char *src_end = src + coding->consumed;
7106
7107       if (EQ (coding->dst_object, coding->src_object))
7108         dst_end = (unsigned char *) src;
7109       if (coding->src_multibyte != coding->dst_multibyte)
7110         {
7111           if (coding->src_multibyte)
7112             {
7113               bool multibytep = 1;
7114               ptrdiff_t consumed_chars = 0;
7115
7116               while (1)
7117                 {
7118                   const unsigned char *src_base = src;
7119                   int c;
7120
7121                   ONE_MORE_BYTE (c);
7122                   if (dst == dst_end)
7123                     {
7124                       if (EQ (coding->src_object, coding->dst_object))
7125                         dst_end = (unsigned char *) src;
7126                       if (dst == dst_end)
7127                         {
7128                           ptrdiff_t offset = src - coding->source;
7129
7130                           dst = alloc_destination (coding, src_end - src + 1,
7131                                                    dst);
7132                           dst_end = coding->destination + coding->dst_bytes;
7133                           coding_set_source (coding);
7134                           src = coding->source + offset;
7135                           src_end = coding->source + coding->consumed;
7136                           if (EQ (coding->src_object, coding->dst_object))
7137                             dst_end = (unsigned char *) src;
7138                         }
7139                     }
7140                   *dst++ = c;
7141                   produced_chars++;
7142                 }
7143             no_more_source:
7144               ;
7145             }
7146           else
7147             while (src < src_end)
7148               {
7149                 bool multibytep = 1;
7150                 int c = *src++;
7151
7152                 if (dst >= dst_end - 1)
7153                   {
7154                     if (EQ (coding->src_object, coding->dst_object))
7155                       dst_end = (unsigned char *) src;
7156                     if (dst >= dst_end - 1)
7157                       {
7158                         ptrdiff_t offset = src - coding->source;
7159                         ptrdiff_t more_bytes;
7160
7161                         if (EQ (coding->src_object, coding->dst_object))
7162                           more_bytes = ((src_end - src) / 2) + 2;
7163                         else
7164                           more_bytes = src_end - src + 2;
7165                         dst = alloc_destination (coding, more_bytes, dst);
7166                         dst_end = coding->destination + coding->dst_bytes;
7167                         coding_set_source (coding);
7168                         src = coding->source + offset;
7169                         src_end = coding->source + coding->consumed;
7170                         if (EQ (coding->src_object, coding->dst_object))
7171                           dst_end = (unsigned char *) src;
7172                       }
7173                   }
7174                 EMIT_ONE_BYTE (c);
7175               }
7176         }
7177       else
7178         {
7179           if (!EQ (coding->src_object, coding->dst_object))
7180             {
7181               ptrdiff_t require = coding->src_bytes - coding->dst_bytes;
7182
7183               if (require > 0)
7184                 {
7185                   ptrdiff_t offset = src - coding->source;
7186
7187                   dst = alloc_destination (coding, require, dst);
7188                   coding_set_source (coding);
7189                   src = coding->source + offset;
7190                   src_end = coding->source + coding->consumed;
7191                 }
7192             }
7193           produced_chars = coding->consumed_char;
7194           while (src < src_end)
7195             *dst++ = *src++;
7196         }
7197     }
7198
7199   produced = dst - (coding->destination + coding->produced);
7200   if (BUFFERP (coding->dst_object) && produced_chars > 0)
7201     insert_from_gap (produced_chars, produced, 0);
7202   coding->produced += produced;
7203   coding->produced_char += produced_chars;
7204   return carryover;
7205 }
7206
7207 /* Compose text in CODING->object according to the annotation data at
7208    CHARBUF.  CHARBUF is an array:
7209      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
7210  */
7211
7212 static void
7213 produce_composition (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7214 {
7215   int len;
7216   ptrdiff_t to;
7217   enum composition_method method;
7218   Lisp_Object components;
7219
7220   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
7221   to = pos + charbuf[2];
7222   method = (enum composition_method) (charbuf[4]);
7223
7224   if (method == COMPOSITION_RELATIVE)
7225     components = Qnil;
7226   else
7227     {
7228       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
7229       int i, j;
7230
7231       if (method == COMPOSITION_WITH_RULE)
7232         len = charbuf[2] * 3 - 2;
7233       charbuf += MAX_ANNOTATION_LENGTH;
7234       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
7235       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
7236         {
7237           if (charbuf[i] >= 0)
7238             args[j] = make_number (charbuf[i]);
7239           else
7240             {
7241               i++;
7242               args[j] = make_number (charbuf[i] % 0x100);
7243             }
7244         }
7245       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7246     }
7247   compose_text (pos, to, components, Qnil, coding->dst_object);
7248 }
7249
7250
7251 /* Put `charset' property on text in CODING->object according to
7252    the annotation data at CHARBUF.  CHARBUF is an array:
7253      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7254  */
7255
7256 static void
7257 produce_charset (struct coding_system *coding, int *charbuf, ptrdiff_t pos)
7258 {
7259   ptrdiff_t from = pos - charbuf[2];
7260   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7261
7262   Fput_text_property (make_number (from), make_number (pos),
7263                       Qcharset, CHARSET_NAME (charset),
7264                       coding->dst_object);
7265 }
7266
7267
/* Number of `int' elements in the conversion work area
   (CODING->charbuf).  */
#define CHARBUF_SIZE 0x4000

/* Allocate the conversion work area of CODING with SAFE_ALLOCA and
   record its size in CODING->charbuf_size.  The users in this file
   declare USE_SAFE_ALLOCA beforehand and call SAFE_FREE when done.
   CODING is evaluated twice, so it must not have side effects; it is
   parenthesized so that any pointer expression may be passed.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    (coding)->charbuf = SAFE_ALLOCA (CHARBUF_SIZE * sizeof (int));      \
    (coding)->charbuf_size = CHARBUF_SIZE;                              \
  } while (0)
7275
7276
7277 static void
7278 produce_annotation (struct coding_system *coding, ptrdiff_t pos)
7279 {
7280   int *charbuf = coding->charbuf;
7281   int *charbuf_end = charbuf + coding->charbuf_used;
7282
7283   if (NILP (coding->dst_object))
7284     return;
7285
7286   while (charbuf < charbuf_end)
7287     {
7288       if (*charbuf >= 0)
7289         pos++, charbuf++;
7290       else
7291         {
7292           int len = -*charbuf;
7293
7294           if (len > 2)
7295             switch (charbuf[1])
7296               {
7297               case CODING_ANNOTATE_COMPOSITION_MASK:
7298                 produce_composition (coding, charbuf, pos);
7299                 break;
7300               case CODING_ANNOTATE_CHARSET_MASK:
7301                 produce_charset (coding, charbuf, pos);
7302                 break;
7303               }
7304           charbuf += len;
7305         }
7306     }
7307 }
7308
7309 /* Decode the data at CODING->src_object into CODING->dst_object.
7310    CODING->src_object is a buffer, a string, or nil.
7311    CODING->dst_object is a buffer.
7312
7313    If CODING->src_object is a buffer, it must be the current buffer.
7314    In this case, if CODING->src_pos is positive, it is a position of
7315    the source text in the buffer, otherwise, the source text is in the
7316    gap area of the buffer, and CODING->src_pos specifies the offset of
7317    the text from GPT (which must be the same as PT).  If this is the
7318    same buffer as CODING->dst_object, CODING->src_pos must be
7319    negative.
7320
7321    If CODING->src_object is a string, CODING->src_pos is an index to
7322    that string.
7323
7324    If CODING->src_object is nil, CODING->source must already point to
7325    the non-relocatable memory area.  In this case, CODING->src_pos is
7326    an offset from CODING->source.
7327
7328    The decoded data is inserted at the current point of the buffer
7329    CODING->dst_object.
7330 */
7331
7332 static void
7333 decode_coding (struct coding_system *coding)
7334 {
7335   Lisp_Object attrs;
7336   Lisp_Object undo_list;
7337   Lisp_Object translation_table;
7338   struct ccl_spec cclspec;
7339   int carryover;
7340   int i;
7341
7342   USE_SAFE_ALLOCA;
7343
7344   if (BUFFERP (coding->src_object)
7345       && coding->src_pos > 0
7346       && coding->src_pos < GPT
7347       && coding->src_pos + coding->src_chars > GPT)
7348     move_gap_both (coding->src_pos, coding->src_pos_byte);
7349
7350   undo_list = Qt;
7351   if (BUFFERP (coding->dst_object))
7352     {
7353       set_buffer_internal (XBUFFER (coding->dst_object));
7354       if (GPT != PT)
7355         move_gap_both (PT, PT_BYTE);
7356
7357       /* We must disable undo_list in order to record the whole insert
7358          transaction via record_insert at the end.  But doing so also
7359          disables the recording of the first change to the undo_list.
7360          Therefore we check for first change here and record it via
7361          record_first_change if needed.  */
7362       if (MODIFF <= SAVE_MODIFF)
7363         record_first_change ();
7364
7365       undo_list = BVAR (current_buffer, undo_list);
7366       bset_undo_list (current_buffer, Qt);
7367     }
7368
7369   coding->consumed = coding->consumed_char = 0;
7370   coding->produced = coding->produced_char = 0;
7371   coding->chars_at_source = 0;
7372   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7373   coding->errors = 0;
7374
7375   ALLOC_CONVERSION_WORK_AREA (coding);
7376
7377   attrs = CODING_ID_ATTRS (coding->id);
7378   translation_table = get_translation_table (attrs, 0, NULL);
7379
7380   carryover = 0;
7381   if (coding->decoder == decode_coding_ccl)
7382     {
7383       coding->spec.ccl = &cclspec;
7384       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7385     }
7386   do
7387     {
7388       ptrdiff_t pos = coding->dst_pos + coding->produced_char;
7389
7390       coding_set_source (coding);
7391       coding->annotated = 0;
7392       coding->charbuf_used = carryover;
7393       (*(coding->decoder)) (coding);
7394       coding_set_destination (coding);
7395       carryover = produce_chars (coding, translation_table, 0);
7396       if (coding->annotated)
7397         produce_annotation (coding, pos);
7398       for (i = 0; i < carryover; i++)
7399         coding->charbuf[i]
7400           = coding->charbuf[coding->charbuf_used - carryover + i];
7401     }
7402   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7403          || (coding->consumed < coding->src_bytes
7404              && (coding->result == CODING_RESULT_SUCCESS
7405                  || coding->result == CODING_RESULT_INVALID_SRC)));
7406
7407   if (carryover > 0)
7408     {
7409       coding_set_destination (coding);
7410       coding->charbuf_used = carryover;
7411       produce_chars (coding, translation_table, 1);
7412     }
7413
7414   coding->carryover_bytes = 0;
7415   if (coding->consumed < coding->src_bytes)
7416     {
7417       int nbytes = coding->src_bytes - coding->consumed;
7418       const unsigned char *src;
7419
7420       coding_set_source (coding);
7421       coding_set_destination (coding);
7422       src = coding->source + coding->consumed;
7423
7424       if (coding->mode & CODING_MODE_LAST_BLOCK)
7425         {
7426           /* Flush out unprocessed data as binary chars.  We are sure
7427              that the number of data is less than the size of
7428              coding->charbuf.  */
7429           coding->charbuf_used = 0;
7430           coding->chars_at_source = 0;
7431
7432           while (nbytes-- > 0)
7433             {
7434               int c = *src++;
7435
7436               if (c & 0x80)
7437                 c = BYTE8_TO_CHAR (c);
7438               coding->charbuf[coding->charbuf_used++] = c;
7439             }
7440           produce_chars (coding, Qnil, 1);
7441         }
7442       else
7443         {
7444           /* Record unprocessed bytes in coding->carryover.  We are
7445              sure that the number of data is less than the size of
7446              coding->carryover.  */
7447           unsigned char *p = coding->carryover;
7448
7449           if (nbytes > sizeof coding->carryover)
7450             nbytes = sizeof coding->carryover;
7451           coding->carryover_bytes = nbytes;
7452           while (nbytes-- > 0)
7453             *p++ = *src++;
7454         }
7455       coding->consumed = coding->src_bytes;
7456     }
7457
7458   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7459       && !inhibit_eol_conversion)
7460     decode_eol (coding);
7461   if (BUFFERP (coding->dst_object))
7462     {
7463       bset_undo_list (current_buffer, undo_list);
7464       record_insert (coding->dst_pos, coding->produced_char);
7465     }
7466
7467   SAFE_FREE ();
7468 }
7469
7470
7471 /* Extract an annotation datum from a composition starting at POS and
7472    ending before LIMIT of CODING->src_object (buffer or string), store
7473    the data in BUF, set *STOP to a starting position of the next
7474    composition (if any) or to LIMIT, and return the address of the
7475    next element of BUF.
7476
7477    If such an annotation is not found, set *STOP to a starting
7478    position of a composition after POS (if any) or to LIMIT, and
7479    return BUF.  */
7480
7481 static int *
7482 handle_composition_annotation (ptrdiff_t pos, ptrdiff_t limit,
7483                                struct coding_system *coding, int *buf,
7484                                ptrdiff_t *stop)
7485 {
7486   ptrdiff_t start, end;
7487   Lisp_Object prop;
7488
7489   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7490       || end > limit)
7491     *stop = limit;
7492   else if (start > pos)
7493     *stop = start;
7494   else
7495     {
7496       if (start == pos)
7497         {
7498           /* We found a composition.  Store the corresponding
7499              annotation data in BUF.  */
7500           int *head = buf;
7501           enum composition_method method = composition_method (prop);
7502           int nchars = COMPOSITION_LENGTH (prop);
7503
7504           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7505           if (method != COMPOSITION_RELATIVE)
7506             {
7507               Lisp_Object components;
7508               ptrdiff_t i, len, i_byte;
7509
7510               components = COMPOSITION_COMPONENTS (prop);
7511               if (VECTORP (components))
7512                 {
7513                   len = ASIZE (components);
7514                   for (i = 0; i < len; i++)
7515                     *buf++ = XINT (AREF (components, i));
7516                 }
7517               else if (STRINGP (components))
7518                 {
7519                   len = SCHARS (components);
7520                   i = i_byte = 0;
7521                   while (i < len)
7522                     {
7523                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7524                       buf++;
7525                     }
7526                 }
7527               else if (INTEGERP (components))
7528                 {
7529                   len = 1;
7530                   *buf++ = XINT (components);
7531                 }
7532               else if (CONSP (components))
7533                 {
7534                   for (len = 0; CONSP (components);
7535                        len++, components = XCDR (components))
7536                     *buf++ = XINT (XCAR (components));
7537                 }
7538               else
7539                 emacs_abort ();
7540               *head -= len;
7541             }
7542         }
7543
7544       if (find_composition (end, limit, &start, &end, &prop,
7545                             coding->src_object)
7546           && end <= limit)
7547         *stop = start;
7548       else
7549         *stop = limit;
7550     }
7551   return buf;
7552 }
7553
7554
7555 /* Extract an annotation datum from a text property `charset' at POS of
7556    CODING->src_object (buffer of string), store the data in BUF, set
7557    *STOP to the position where the value of `charset' property changes
7558    (limiting by LIMIT), and return the address of the next element of
7559    BUF.
7560
7561    If the property value is nil, set *STOP to the position where the
7562    property value is non-nil (limiting by LIMIT), and return BUF.  */
7563
7564 static int *
7565 handle_charset_annotation (ptrdiff_t pos, ptrdiff_t limit,
7566                            struct coding_system *coding, int *buf,
7567                            ptrdiff_t *stop)
7568 {
7569   Lisp_Object val, next;
7570   int id;
7571
7572   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7573   if (! NILP (val) && CHARSETP (val))
7574     id = XINT (CHARSET_SYMBOL_ID (val));
7575   else
7576     id = -1;
7577   ADD_CHARSET_DATA (buf, 0, id);
7578   next = Fnext_single_property_change (make_number (pos), Qcharset,
7579                                        coding->src_object,
7580                                        make_number (limit));
7581   *stop = XINT (next);
7582   return buf;
7583 }
7584
7585
7586 static void
7587 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7588                int max_lookup)
7589 {
7590   int *buf = coding->charbuf;
7591   int *buf_end = coding->charbuf + coding->charbuf_size;
7592   const unsigned char *src = coding->source + coding->consumed;
7593   const unsigned char *src_end = coding->source + coding->src_bytes;
7594   ptrdiff_t pos = coding->src_pos + coding->consumed_char;
7595   ptrdiff_t end_pos = coding->src_pos + coding->src_chars;
7596   bool multibytep = coding->src_multibyte;
7597   Lisp_Object eol_type;
7598   int c;
7599   ptrdiff_t stop, stop_composition, stop_charset;
7600   int *lookup_buf = NULL;
7601
7602   if (! NILP (translation_table))
7603     lookup_buf = alloca (sizeof (int) * max_lookup);
7604
7605   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7606   if (VECTORP (eol_type))
7607     eol_type = Qunix;
7608
7609   /* Note: composition handling is not yet implemented.  */
7610   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7611
7612   if (NILP (coding->src_object))
7613     stop = stop_composition = stop_charset = end_pos;
7614   else
7615     {
7616       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7617         stop = stop_composition = pos;
7618       else
7619         stop = stop_composition = end_pos;
7620       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7621         stop = stop_charset = pos;
7622       else
7623         stop_charset = end_pos;
7624     }
7625
7626   /* Compensate for CRLF and conversion.  */
7627   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7628   while (buf < buf_end)
7629     {
7630       Lisp_Object trans;
7631
7632       if (pos == stop)
7633         {
7634           if (pos == end_pos)
7635             break;
7636           if (pos == stop_composition)
7637             buf = handle_composition_annotation (pos, end_pos, coding,
7638                                                  buf, &stop_composition);
7639           if (pos == stop_charset)
7640             buf = handle_charset_annotation (pos, end_pos, coding,
7641                                              buf, &stop_charset);
7642           stop = (stop_composition < stop_charset
7643                   ? stop_composition : stop_charset);
7644         }
7645
7646       if (! multibytep)
7647         {
7648           int bytes;
7649
7650           if (coding->encoder == encode_coding_raw_text
7651               || coding->encoder == encode_coding_ccl)
7652             c = *src++, pos++;
7653           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7654             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7655           else
7656             c = BYTE8_TO_CHAR (*src), src++, pos++;
7657         }
7658       else
7659         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7660       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7661         c = '\n';
7662       if (! EQ (eol_type, Qunix))
7663         {
7664           if (c == '\n')
7665             {
7666               if (EQ (eol_type, Qdos))
7667                 *buf++ = '\r';
7668               else
7669                 c = '\r';
7670             }
7671         }
7672
7673       trans = Qnil;
7674       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7675       if (NILP (trans))
7676         *buf++ = c;
7677       else
7678         {
7679           ptrdiff_t from_nchars = 1, to_nchars = 1;
7680           int *lookup_buf_end;
7681           const unsigned char *p = src;
7682           int i;
7683
7684           lookup_buf[0] = c;
7685           for (i = 1; i < max_lookup && p < src_end; i++)
7686             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7687           lookup_buf_end = lookup_buf + i;
7688           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7689           if (INTEGERP (trans))
7690             c = XINT (trans);
7691           else if (CONSP (trans))
7692             {
7693               from_nchars = ASIZE (XCAR (trans));
7694               trans = XCDR (trans);
7695               if (INTEGERP (trans))
7696                 c = XINT (trans);
7697               else
7698                 {
7699                   to_nchars = ASIZE (trans);
7700                   if (buf_end - buf < to_nchars)
7701                     break;
7702                   c = XINT (AREF (trans, 0));
7703                 }
7704             }
7705           else
7706             break;
7707           *buf++ = c;
7708           for (i = 1; i < to_nchars; i++)
7709             *buf++ = XINT (AREF (trans, i));
7710           for (i = 1; i < from_nchars; i++, pos++)
7711             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7712         }
7713     }
7714
7715   coding->consumed = src - coding->source;
7716   coding->consumed_char = pos - coding->src_pos;
7717   coding->charbuf_used = buf - coding->charbuf;
7718   coding->chars_at_source = 0;
7719 }
7720
7721
7722 /* Encode the text at CODING->src_object into CODING->dst_object.
7723    CODING->src_object is a buffer or a string.
7724    CODING->dst_object is a buffer or nil.
7725
7726    If CODING->src_object is a buffer, it must be the current buffer.
7727    In this case, if CODING->src_pos is positive, it is a position of
7728    the source text in the buffer, otherwise. the source text is in the
7729    gap area of the buffer, and coding->src_pos specifies the offset of
7730    the text from GPT (which must be the same as PT).  If this is the
7731    same buffer as CODING->dst_object, CODING->src_pos must be
7732    negative and CODING should not have `pre-write-conversion'.
7733
7734    If CODING->src_object is a string, CODING should not have
7735    `pre-write-conversion'.
7736
7737    If CODING->dst_object is a buffer, the encoded data is inserted at
7738    the current point of that buffer.
7739
7740    If CODING->dst_object is nil, the encoded data is placed at the
7741    memory area specified by CODING->destination.  */
7742
7743 static void
7744 encode_coding (struct coding_system *coding)
7745 {
7746   Lisp_Object attrs;
7747   Lisp_Object translation_table;
7748   int max_lookup;
7749   struct ccl_spec cclspec;
7750
7751   USE_SAFE_ALLOCA;
7752
7753   attrs = CODING_ID_ATTRS (coding->id);
7754   if (coding->encoder == encode_coding_raw_text)
7755     translation_table = Qnil, max_lookup = 0;
7756   else
7757     translation_table = get_translation_table (attrs, 1, &max_lookup);
7758
7759   if (BUFFERP (coding->dst_object))
7760     {
7761       set_buffer_internal (XBUFFER (coding->dst_object));
7762       coding->dst_multibyte
7763         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7764     }
7765
7766   coding->consumed = coding->consumed_char = 0;
7767   coding->produced = coding->produced_char = 0;
7768   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7769   coding->errors = 0;
7770
7771   ALLOC_CONVERSION_WORK_AREA (coding);
7772
7773   if (coding->encoder == encode_coding_ccl)
7774     {
7775       coding->spec.ccl = &cclspec;
7776       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7777     }
7778   do {
7779     coding_set_source (coding);
7780     consume_chars (coding, translation_table, max_lookup);
7781     coding_set_destination (coding);
7782     (*(coding->encoder)) (coding);
7783   } while (coding->consumed_char < coding->src_chars);
7784
7785   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7786     insert_from_gap (coding->produced_char, coding->produced, 0);
7787
7788   SAFE_FREE ();
7789 }
7790
7791
7792 /* Name (or base name) of the work buffers used for code conversion.  */
7793 static Lisp_Object Vcode_conversion_workbuf_name;
7794
7795 /* The work buffer used by the top-level conversion.  It is named
7796    Vcode_conversion_workbuf_name and, once created, is kept and
7797    reused (make_conversion_work_buffer recreates it if it is no longer
7798    live).  Work buffers made while it is in use get names generated
7799    from that name and are killed by code_conversion_restore.  */
7800 static Lisp_Object Vcode_conversion_reused_workbuf;
7801
7802 /* True if and only if Vcode_conversion_reused_workbuf is in use.  */
7803 static bool reused_workbuf_in_use;
7804
7805
7806 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7807    multibyteness of returning buffer.  */
7808
7809 static Lisp_Object
7810 make_conversion_work_buffer (bool multibyte)
7811 {
7812   Lisp_Object name, workbuf;
7813   struct buffer *current;
7814
7815   if (reused_workbuf_in_use)
7816     {
7817       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7818       workbuf = Fget_buffer_create (name);
7819     }
7820   else
7821     {
7822       reused_workbuf_in_use = 1;
7823       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7824         Vcode_conversion_reused_workbuf
7825           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7826       workbuf = Vcode_conversion_reused_workbuf;
7827     }
7828   current = current_buffer;
7829   set_buffer_internal (XBUFFER (workbuf));
7830   /* We can't allow modification hooks to run in the work buffer.  For
7831      instance, directory_files_internal assumes that file decoding
7832      doesn't compile new regexps.  */
7833   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7834   Ferase_buffer ();
7835   bset_undo_list (current_buffer, Qt);
7836   bset_enable_multibyte_characters (current_buffer, multibyte ? Qt : Qnil);
7837   set_buffer_internal (current);
7838   return workbuf;
7839 }
7840
7841
7842 static void
7843 code_conversion_restore (Lisp_Object arg)
7844 {
7845   Lisp_Object current, workbuf;
7846   struct gcpro gcpro1;
7847
7848   GCPRO1 (arg);
7849   current = XCAR (arg);
7850   workbuf = XCDR (arg);
7851   if (! NILP (workbuf))
7852     {
7853       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7854         reused_workbuf_in_use = 0;
7855       else
7856         Fkill_buffer (workbuf);
7857     }
7858   set_buffer_internal (XBUFFER (current));
7859   UNGCPRO;
7860 }
7861
7862 Lisp_Object
7863 code_conversion_save (bool with_work_buf, bool multibyte)
7864 {
7865   Lisp_Object workbuf = Qnil;
7866
7867   if (with_work_buf)
7868     workbuf = make_conversion_work_buffer (multibyte);
7869   record_unwind_protect (code_conversion_restore,
7870                          Fcons (Fcurrent_buffer (), workbuf));
7871   return workbuf;
7872 }
7873
7874 void
7875 decode_coding_gap (struct coding_system *coding,
7876                    ptrdiff_t chars, ptrdiff_t bytes)
7877 {
7878   ptrdiff_t count = SPECPDL_INDEX ();
7879   Lisp_Object attrs;
7880
7881   coding->src_object = Fcurrent_buffer ();
7882   coding->src_chars = chars;
7883   coding->src_bytes = bytes;
7884   coding->src_pos = -chars;
7885   coding->src_pos_byte = -bytes;
7886   coding->src_multibyte = chars < bytes;
7887   coding->dst_object = coding->src_object;
7888   coding->dst_pos = PT;
7889   coding->dst_pos_byte = PT_BYTE;
7890   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7891
7892   coding->head_ascii = -1;
7893   coding->detected_utf8_chars = -1;
7894   coding->eol_seen = EOL_SEEN_NONE;
7895   if (CODING_REQUIRE_DETECTION (coding))
7896     detect_coding (coding);
7897   attrs = CODING_ID_ATTRS (coding->id);
7898   if (! disable_ascii_optimization
7899       && ! coding->src_multibyte
7900       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
7901       && NILP (CODING_ATTR_POST_READ (attrs))
7902       && NILP (get_translation_table (attrs, 0, NULL)))
7903     {
7904       chars = coding->head_ascii;
7905       if (chars < 0)
7906         chars = check_ascii (coding);
7907       if (chars != bytes)
7908         {
7909           /* There exists a non-ASCII byte.  */
7910           if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
7911             {
7912               if (coding->detected_utf8_chars >= 0)
7913                 chars = coding->detected_utf8_chars;
7914               else
7915                 chars = check_utf_8 (coding);
7916               if (CODING_UTF_8_BOM (coding) != utf_without_bom
7917                   && coding->head_ascii == 0
7918                   && coding->source[0] == UTF_8_BOM_1
7919                   && coding->source[1] == UTF_8_BOM_2
7920                   && coding->source[2] == UTF_8_BOM_3)
7921                 {
7922                   chars--;
7923                   bytes -= 3;
7924                   coding->src_bytes -= 3;
7925                 }
7926             }
7927           else
7928             chars = -1;
7929         }
7930       if (chars >= 0)
7931         {
7932           Lisp_Object eol_type;
7933
7934           eol_type = CODING_ID_EOL_TYPE (coding->id);
7935           if (VECTORP (eol_type))
7936             {
7937               if (coding->eol_seen != EOL_SEEN_NONE)
7938                 eol_type = adjust_coding_eol_type (coding, coding->eol_seen);
7939             }
7940           if (EQ (eol_type, Qmac))
7941             {
7942               unsigned char *src_end = GAP_END_ADDR;
7943               unsigned char *src = src_end - coding->src_bytes;
7944
7945               while (src < src_end)
7946                 {
7947                   if (*src++ == '\r')
7948                     src[-1] = '\n';
7949                 }
7950             }
7951           else if (EQ (eol_type, Qdos))
7952             {
7953               unsigned char *src = GAP_END_ADDR;
7954               unsigned char *src_beg = src - coding->src_bytes;
7955               unsigned char *dst = src;
7956               ptrdiff_t diff;
7957
7958               while (src_beg < src)
7959                 {
7960                   *--dst = *--src;
7961                   if (*src == '\n' && src > src_beg && src[-1] == '\r')
7962                     src--;
7963                 }
7964               diff = dst - src;
7965               bytes -= diff;
7966               chars -= diff;
7967             }
7968           coding->produced = bytes;
7969           coding->produced_char = chars;
7970           insert_from_gap (chars, bytes, 1);
7971           return;
7972         }
7973     }
7974   code_conversion_save (0, 0);
7975
7976   coding->mode |= CODING_MODE_LAST_BLOCK;
7977   current_buffer->text->inhibit_shrinking = 1;
7978   decode_coding (coding);
7979   current_buffer->text->inhibit_shrinking = 0;
7980
7981   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7982     {
7983       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7984       Lisp_Object val;
7985
7986       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7987       val = call1 (CODING_ATTR_POST_READ (attrs),
7988                    make_number (coding->produced_char));
7989       CHECK_NATNUM (val);
7990       coding->produced_char += Z - prev_Z;
7991       coding->produced += Z_BYTE - prev_Z_BYTE;
7992     }
7993
7994   unbind_to (count, Qnil);
7995 }
7996
7997
7998 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7999    SRC_OBJECT into DST_OBJECT by coding context CODING.
8000
8001    SRC_OBJECT is a buffer, a string, or Qnil.
8002
8003    If it is a buffer, the text is at point of the buffer.  FROM and TO
8004    are positions in the buffer.
8005
8006    If it is a string, the text is at the beginning of the string.
8007    FROM and TO are indices to the string.
8008
8009    If it is nil, the text is at coding->source.  FROM and TO are
8010    indices to coding->source.
8011
8012    DST_OBJECT is a buffer, Qt, or Qnil.
8013
8014    If it is a buffer, the decoded text is inserted at point of the
8015    buffer.  If the buffer is the same as SRC_OBJECT, the source text
8016    is deleted.
8017
8018    If it is Qt, a string is made from the decoded text, and
8019    set in CODING->dst_object.
8020
8021    If it is Qnil, the decoded text is stored at CODING->destination.
8022    The caller must allocate CODING->dst_bytes bytes at
8023    CODING->destination by xmalloc.  If the decoded text is longer than
8024    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
8025  */
8026
8027 void
8028 decode_coding_object (struct coding_system *coding,
8029                       Lisp_Object src_object,
8030                       ptrdiff_t from, ptrdiff_t from_byte,
8031                       ptrdiff_t to, ptrdiff_t to_byte,
8032                       Lisp_Object dst_object)
8033 {
       /* COUNT is the specpdl index unwound by unbind_to at the end.  */
8034   ptrdiff_t count = SPECPDL_INDEX ();
8035   unsigned char *destination IF_LINT (= NULL);
8036   ptrdiff_t dst_bytes IF_LINT (= 0);
8037   ptrdiff_t chars = to - from;
8038   ptrdiff_t bytes = to_byte - from_byte;
8039   Lisp_Object attrs;
       /* SAVED_PT stays negative unless the source buffer is also the
          destination; the final fix-up of point and markers keys on it.  */
8040   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8041   bool need_marker_adjustment = 0;
8042   Lisp_Object old_deactivate_mark;
8043
       /* Vdeactivate_mark is put back to this value before returning.  */
8044   old_deactivate_mark = Vdeactivate_mark;
8045
       /* With a nil DST_OBJECT, remember the output area supplied by the
          caller; it is consulted again after decoding.  */
8046   if (NILP (dst_object))
8047     {
8048       destination = coding->destination;
8049       dst_bytes = coding->dst_bytes;
8050     }
8051
8052   coding->src_object = src_object;
8053   coding->src_chars = chars;
8054   coding->src_bytes = bytes;
       /* The source counts as multibyte when it has more bytes than
          characters.  */
8055   coding->src_multibyte = chars < bytes;
8056
8057   if (STRINGP (src_object))
8058     {
8059       coding->src_pos = from;
8060       coding->src_pos_byte = from_byte;
8061     }
8062   else if (BUFFERP (src_object))
8063     {
8064       set_buffer_internal (XBUFFER (src_object));
8065       if (from != GPT)
8066         move_gap_both (from, from_byte);
8067       if (EQ (src_object, dst_object))
8068         {
8069           struct Lisp_Marker *tail;
8070
               /* Flag every marker that sits exactly on the edge of the
                  region (FROM for insertion-type markers, TO for the
                  others); those are repositioned after decoding.  */
8071           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8072             {
8073               tail->need_adjustment
8074                 = tail->charpos == (tail->insertion_type ? from : to);
8075               need_marker_adjustment |= tail->need_adjustment;
8076             }
8077           saved_pt = PT, saved_pt_byte = PT_BYTE;
8078           TEMP_SET_PT_BOTH (from, from_byte);
               /* inhibit_shrinking is cleared again in the SAVED_PT block
                  near the end of this function.  */
8079           current_buffer->text->inhibit_shrinking = 1;
8080           del_range_both (from, from_byte, to, to_byte, 1);
               /* NOTE(review): the source text has just been deleted and
                  the source position is set negative; presumably the
                  decoder then reads the old bytes relative to the gap --
                  confirm against the source-setup code.  */
8081           coding->src_pos = -chars;
8082           coding->src_pos_byte = -bytes;
8083         }
8084       else
8085         {
8086           coding->src_pos = from;
8087           coding->src_pos_byte = from_byte;
8088         }
8089     }
8090
8091   if (CODING_REQUIRE_DETECTION (coding))
8092     detect_coding (coding);
       /* ATTRS is fetched only after the optional detection above.  */
8093   attrs = CODING_ID_ATTRS (coding->id);
8094
       /* Decode into the object returned by code_conversion_save (1, ...)
          when a string result is requested, or when a post-read function
          exists and DST_OBJECT is nil.  */
8095   if (EQ (dst_object, Qt)
8096       || (! NILP (CODING_ATTR_POST_READ (attrs))
8097           && NILP (dst_object)))
8098     {
8099       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
8100       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
8101       coding->dst_pos = BEG;
8102       coding->dst_pos_byte = BEG_BYTE;
8103     }
8104   else if (BUFFERP (dst_object))
8105     {
           /* Decode at point of the destination buffer; multibyteness
              follows that buffer's enable_multibyte_characters.  */
8106       code_conversion_save (0, 0);
8107       coding->dst_object = dst_object;
8108       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
8109       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
8110       coding->dst_multibyte
8111         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8112     }
8113   else
8114     {
8115       code_conversion_save (0, 0);
8116       coding->dst_object = Qnil;
8117       /* Most callers presume this will return a multibyte result, and they
8118          won't use `binary' or `raw-text' anyway, so let's not worry about
8119          CODING_FOR_UNIBYTE.  */
8120       coding->dst_multibyte = 1;
8121     }
8122
8123   decode_coding (coding);
8124
8125   if (BUFFERP (coding->dst_object))
8126     set_buffer_internal (XBUFFER (coding->dst_object));
8127
       /* Call the post-read function, if any, with point at the start of
          the decoded text and the number of decoded characters as its
          argument.  Its value must be a natural number; the produced
          counts are then corrected by the change it caused in Z and
          Z_BYTE.  */
8128   if (! NILP (CODING_ATTR_POST_READ (attrs)))
8129     {
8130       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8131       ptrdiff_t prev_Z = Z, prev_Z_BYTE = Z_BYTE;
8132       Lisp_Object val;
8133
8134       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
8135       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8136               old_deactivate_mark);
8137       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
8138                         make_number (coding->produced_char));
8139       UNGCPRO;
8140       CHECK_NATNUM (val);
8141       coding->produced_char += Z - prev_Z;
8142       coding->produced += Z_BYTE - prev_Z_BYTE;
8143     }
8144
8145   if (EQ (dst_object, Qt))
8146     {
           /* String result: take the text of the current buffer.  */
8147       coding->dst_object = Fbuffer_string ();
8148     }
8149   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
8150     {
           /* The text was decoded into a buffer although the caller asked
              for CODING->destination.  If it is longer than the area the
              caller supplied, enlarge that area and copy the text from
              BEGV_ADDR, first moving the gap out of the way if it lies
              inside the text.
              NOTE(review): nothing is copied when the result already
              fits in DST_BYTES -- confirm that this is intended.  */
8151       set_buffer_internal (XBUFFER (coding->dst_object));
8152       if (dst_bytes < coding->produced)
8153         {
8154           eassert (coding->produced > 0);
8155           destination = xrealloc (destination, coding->produced);
8156           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
8157             move_gap_both (BEGV, BEGV_BYTE);
8158           memcpy (destination, BEGV_ADDR, coding->produced);
8159           coding->destination = destination;
8160         }
8161     }
8162
8163   if (saved_pt >= 0)
8164     {
8165       /* This is the case of:
8166          (BUFFERP (src_object) && EQ (src_object, dst_object))
8167          As we have moved PT while replacing the original buffer
8168          contents, we must recover it now.  */
8169       set_buffer_internal (XBUFFER (src_object));
8170       current_buffer->text->inhibit_shrinking = 0;
           /* Point before the region is restored as is; point inside the
              old region goes to FROM; point after it is shifted by the
              change in size (counted in bytes for a unibyte buffer).  */
8171       if (saved_pt < from)
8172         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8173       else if (saved_pt < from + chars)
8174         TEMP_SET_PT_BOTH (from, from_byte);
8175       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8176         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8177                           saved_pt_byte + (coding->produced - bytes));
8178       else
8179         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8180                           saved_pt_byte + (coding->produced - bytes));
8181
8182       if (need_marker_adjustment)
8183         {
8184           struct Lisp_Marker *tail;
8185
               /* Put each flagged marker at the start of the decoded text
                  (insertion-type markers) or at its end (all others).  */
8186           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8187             if (tail->need_adjustment)
8188               {
8189                 tail->need_adjustment = 0;
8190                 if (tail->insertion_type)
8191                   {
8192                     tail->bytepos = from_byte;
8193                     tail->charpos = from;
8194                   }
8195                 else
8196                   {
8197                     tail->bytepos = from_byte + coding->produced;
8198                     tail->charpos
8199                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8200                          ? tail->bytepos : from + coding->produced_char);
8201                   }
8202               }
8203         }
8204     }
8205
8206   Vdeactivate_mark = old_deactivate_mark;
8207   unbind_to (count, coding->dst_object);
8208 }
8209
8210
8211 void
8212 encode_coding_object (struct coding_system *coding,
8213                       Lisp_Object src_object,
8214                       ptrdiff_t from, ptrdiff_t from_byte,
8215                       ptrdiff_t to, ptrdiff_t to_byte,
8216                       Lisp_Object dst_object)
8217 {
       /* Encode the text in the range FROM/FROM_BYTE and TO/TO_BYTE of
          SRC_OBJECT (a string, a buffer, or otherwise the bytes at
          CODING->source) by coding context CODING.

          If DST_OBJECT is a buffer, the result goes into that buffer; when
          it is the same buffer as SRC_OBJECT, the source text is deleted
          and the result takes its place at FROM.  If DST_OBJECT is Qt, the
          result is gathered in CODING->destination and, unless
          CODING->raw_destination is set, turned into a unibyte string
          stored in CODING->dst_object.  Otherwise CODING->dst_object is
          set to nil before encode_coding runs.  */
8218   ptrdiff_t count = SPECPDL_INDEX ();
8219   ptrdiff_t chars = to - from;
8220   ptrdiff_t bytes = to_byte - from_byte;
8221   Lisp_Object attrs;
       /* SAVED_PT stays negative unless source and destination are the
          same buffer.  */
8222   ptrdiff_t saved_pt = -1, saved_pt_byte IF_LINT (= 0);
8223   bool need_marker_adjustment = 0;
8224   bool kill_src_buffer = 0;
8225   Lisp_Object old_deactivate_mark;
8226
       /* Vdeactivate_mark is put back to this value before returning.  */
8227   old_deactivate_mark = Vdeactivate_mark;
8228
8229   coding->src_object = src_object;
8230   coding->src_chars = chars;
8231   coding->src_bytes = bytes;
       /* The source counts as multibyte when it has more bytes than
          characters.  */
8232   coding->src_multibyte = chars < bytes;
8233
8234   attrs = CODING_ID_ATTRS (coding->id);
8235
8236   if (EQ (src_object, dst_object))
8237     {
8238       struct Lisp_Marker *tail;
8239
           /* Flag every marker that sits exactly on the edge of the
              region (FROM for insertion-type markers, TO for the others);
              those are repositioned after encoding.
              NOTE(review): this walks the markers of current_buffer
              before any set_buffer_internal call -- presumably callers
              guarantee SRC_OBJECT is the current buffer here; confirm.  */
8240       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8241         {
8242           tail->need_adjustment
8243             = tail->charpos == (tail->insertion_type ? from : to);
8244           need_marker_adjustment |= tail->need_adjustment;
8245         }
8246     }
8247
8248   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
8249     {
           /* A pre-write function exists: copy the source text into the
              buffer returned by code_conversion_save, run the function
              there, and encode from whatever buffer is current
              afterwards.  */
8250       coding->src_object = code_conversion_save (1, coding->src_multibyte);
8251       set_buffer_internal (XBUFFER (coding->src_object));
8252       if (STRINGP (src_object))
8253         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
8254       else if (BUFFERP (src_object))
8255         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
8256       else
8257         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
8258
8259       if (EQ (src_object, dst_object))
8260         {
               /* The text has been copied, so the original can be deleted
                  from the source buffer now.  */
8261           set_buffer_internal (XBUFFER (src_object));
8262           saved_pt = PT, saved_pt_byte = PT_BYTE;
8263           del_range_both (from, from_byte, to, to_byte, 1);
8264           set_buffer_internal (XBUFFER (coding->src_object));
8265         }
8266
8267       {
8268         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
8269
8270         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
8271                 old_deactivate_mark);
8272         safe_call2 (CODING_ATTR_PRE_WRITE (attrs),
8273                     make_number (BEG), make_number (Z));
8274         UNGCPRO;
8275       }
           /* If the function left a different buffer current, that buffer
              becomes the source and is killed once encoding is done.  */
8276       if (XBUFFER (coding->src_object) != current_buffer)
8277         kill_src_buffer = 1;
8278       coding->src_object = Fcurrent_buffer ();
8279       if (BEG != GPT)
8280         move_gap_both (BEG, BEG_BYTE);
           /* The source is now the whole of the current buffer.  */
8281       coding->src_chars = Z - BEG;
8282       coding->src_bytes = Z_BYTE - BEG_BYTE;
8283       coding->src_pos = BEG;
8284       coding->src_pos_byte = BEG_BYTE;
8285       coding->src_multibyte = Z < Z_BYTE;
8286     }
8287   else if (STRINGP (src_object))
8288     {
8289       code_conversion_save (0, 0);
8290       coding->src_pos = from;
8291       coding->src_pos_byte = from_byte;
8292     }
8293   else if (BUFFERP (src_object))
8294     {
8295       code_conversion_save (0, 0);
8296       set_buffer_internal (XBUFFER (src_object));
8297       if (EQ (src_object, dst_object))
8298         {
               /* In-place encoding: delete the region and encode from the
                  value returned by del_range_1, starting at position 0.  */
8299           saved_pt = PT, saved_pt_byte = PT_BYTE;
8300           coding->src_object = del_range_1 (from, to, 1, 1);
8301           coding->src_pos = 0;
8302           coding->src_pos_byte = 0;
8303         }
8304       else
8305         {
               /* Move the gap to FROM when it lies inside the source
                  range (TO included).  */
8306           if (from < GPT && to >= GPT)
8307             move_gap_both (from, from_byte);
8308           coding->src_pos = from;
8309           coding->src_pos_byte = from_byte;
8310         }
8311     }
8312   else
8313     code_conversion_save (0, 0);
8314
8315   if (BUFFERP (dst_object))
8316     {
8317       coding->dst_object = dst_object;
8318       if (EQ (src_object, dst_object))
8319         {
               /* The result replaces the deleted source text.  */
8320           coding->dst_pos = from;
8321           coding->dst_pos_byte = from_byte;
8322         }
8323       else
8324         {
8325           struct buffer *current = current_buffer;
8326
               /* Output goes to point of the destination buffer; switch to
                  it temporarily to read point and move the gap there.  */
8327           set_buffer_temp (XBUFFER (dst_object));
8328           coding->dst_pos = PT;
8329           coding->dst_pos_byte = PT_BYTE;
8330           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8331           set_buffer_temp (current);
8332         }
8333       coding->dst_multibyte
8334         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
8335     }
8336   else if (EQ (dst_object, Qt))
8337     {
           /* Start with an output area of one byte per source character,
              and at least one byte.  */
8338       ptrdiff_t dst_bytes = max (1, coding->src_chars);
8339       coding->dst_object = Qnil;
8340       coding->destination = xmalloc (dst_bytes);
8341       coding->dst_bytes = dst_bytes;
8342       coding->dst_multibyte = 0;
8343     }
8344   else
8345     {
8346       coding->dst_object = Qnil;
8347       coding->dst_multibyte = 0;
8348     }
8349
8350   encode_coding (coding);
8351
8352   if (EQ (dst_object, Qt))
8353     {
8354       if (BUFFERP (coding->dst_object))
8355         coding->dst_object = Fbuffer_string ();
8356       else if (coding->raw_destination)
8357         /* This is used to avoid creating huge Lisp string.
8358            NOTE: caller who sets `raw_destination' is also
8359            responsible for freeing `destination' buffer.  */
8360         coding->dst_object = Qnil;
8361       else
8362         {
               /* Turn the produced bytes into a unibyte string and release
                  the output area.  */
8363           coding->dst_object
8364             = make_unibyte_string ((char *) coding->destination,
8365                                    coding->produced);
8366           xfree (coding->destination);
8367         }
8368     }
8369
8370   if (saved_pt >= 0)
8371     {
8372       /* This is the case of:
8373          (BUFFERP (src_object) && EQ (src_object, dst_object))
8374          As we have moved PT while replacing the original buffer
8375          contents, we must recover it now.  */
8376       set_buffer_internal (XBUFFER (src_object));
           /* Point before the region is restored as is; point inside the
              old region goes to FROM; point after it is shifted by the
              change in size (counted in bytes for a unibyte buffer).  */
8377       if (saved_pt < from)
8378         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8379       else if (saved_pt < from + chars)
8380         TEMP_SET_PT_BOTH (from, from_byte);
8381       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8382         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8383                           saved_pt_byte + (coding->produced - bytes));
8384       else
8385         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8386                           saved_pt_byte + (coding->produced - bytes));
8387
8388       if (need_marker_adjustment)
8389         {
8390           struct Lisp_Marker *tail;
8391
               /* Put each flagged marker at the start of the encoded text
                  (insertion-type markers) or at its end (all others).  */
8392           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8393             if (tail->need_adjustment)
8394               {
8395                 tail->need_adjustment = 0;
8396                 if (tail->insertion_type)
8397                   {
8398                     tail->bytepos = from_byte;
8399                     tail->charpos = from;
8400                   }
8401                 else
8402                   {
8403                     tail->bytepos = from_byte + coding->produced;
8404                     tail->charpos
8405                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8406                          ? tail->bytepos : from + coding->produced_char);
8407                   }
8408               }
8409         }
8410     }
8411
       /* Only set when the pre-write function switched buffers.  */
8412   if (kill_src_buffer)
8413     Fkill_buffer (coding->src_object);
8414
8415   Vdeactivate_mark = old_deactivate_mark;
8416   unbind_to (count, Qnil);
8417 }
8418
8419
8420 Lisp_Object
8421 preferred_coding_system (void)
8422 {
       /* Return the name of the coding system registered for the category
          that comes first in coding_priorities.  */
8423   const struct coding_system *top = coding_categories + coding_priorities[0];
8424   int top_id = top->id;
8425   return CODING_ID_NAME (top_id);
8426 }
8427
8428 #if defined (WINDOWSNT) || defined (CYGWIN)
8429
8430 Lisp_Object
8431 from_unicode (Lisp_Object str)
8432 {
8433   bool odd_unibyte;
8434
8435   CHECK_STRING (str);
8436   odd_unibyte = ! STRING_MULTIBYTE (str) && (SBYTES (str) & 1) != 0;
8437   if (odd_unibyte)
8438     /* Leave out the last byte of a unibyte string of odd length.  */
8439     str = Fsubstring (str, make_number (0), make_number (-1));
8440   return code_convert_string_norecord (str, Qutf_16le, 0);
8441 }
8442
8443 Lisp_Object
8444 from_unicode_buffer (const wchar_t* wstr)
8445 {
8446   /* Take the bytes of all wide characters of WSTR plus one more byte;
8447      we get one of the two final 0 bytes for free.  */
8448   Lisp_Object raw
8449     = make_unibyte_string ((char*) wstr, 1 + sizeof (wchar_t) * wcslen (wstr));
8450   return from_unicode (raw);
8451 }
8452
8453 wchar_t *
8454 to_unicode (Lisp_Object str, Lisp_Object *buf)
8455 {
8456   Lisp_Object padded;
8457   ptrdiff_t nbytes;
8458   *buf = code_convert_string_norecord (str, Qutf_16le, 1);
8459   nbytes = SBYTES (*buf);
8460   /* A string already ends with a single zero byte, but a utf-16le null
8461      character needs two.  Copy the encoded text into a string that is
8462      one byte longer and zero that byte, so two zero bytes end the text.  */
8463   padded = make_uninit_string (nbytes + 1);
8464   memcpy (SDATA (padded), SDATA (*buf), nbytes);
8465   SDATA (padded) [nbytes] = '\0';
8466   *buf = padded;
8467   return WCSDATA (*buf);
8468 }
8469
8470 #endif /* WINDOWSNT || CYGWIN */
8471
8472 \f
8473 #ifdef emacs
8474 /*** 11. Emacs Lisp library functions ***/
8475
8476 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8477        doc: /* Return t if OBJECT is nil or a coding-system.
8478 See the documentation of `define-coding-system' for information
8479 about coding-system objects.  */)
8480   (Lisp_Object object)
8481 {
8482   /* nil and anything with a non-negative coding system ID qualify.  */
8483   if (NILP (object) || CODING_SYSTEM_ID (object) >= 0)
8484     return Qt;
8485   /* So does a symbol with a non-nil Qcoding_system_define_form property.  */
8486   return ((SYMBOLP (object)
8487            && ! NILP (Fget (object, Qcoding_system_define_form)))
8488           ? Qt : Qnil);
8489 }
8490
8491 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8492        Sread_non_nil_coding_system, 1, 1, 0,
8493        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8494   (Lisp_Object prompt)
8495 {
8496   Lisp_Object input;
8497
8498   /* Prompt again for as long as the answer is empty.  */
8499   do
8500     input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8501                               Qcoding_system_history, Qnil, Qnil);
8502   while (SCHARS (input) == 0);
8503   return Fintern (input, Qnil);
8504 }
8505
8506 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8507        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8508 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8509 Ignores case when completing coding systems (all Emacs coding systems
8510 are lower-case).  */)
8511   (Lisp_Object prompt, Lisp_Object default_coding_system)
8512 {
8513   ptrdiff_t specpdl_mark = SPECPDL_INDEX ();
8514   Lisp_Object def = default_coding_system, input;
8515
8516   if (SYMBOLP (def))
8517     def = SYMBOL_NAME (def);
8518   /* Completion ignores case only for the duration of the read.  */
8519   specbind (Qcompletion_ignore_case, Qt);
8520   input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
8521                             Qcoding_system_history, def, Qnil);
8522   unbind_to (specpdl_mark, Qnil);
8523   return SCHARS (input) == 0 ? Qnil : Fintern (input, Qnil);
8524 }
8525
8526 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8527        1, 1, 0,
8528        doc: /* Check validity of CODING-SYSTEM.
8529 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8530 It is valid if it is nil or a symbol defined as a coding system by the
8531 function `define-coding-system'.  */)
8532   (Lisp_Object coding_system)
8533 {
8534   Lisp_Object pending = Fget (coding_system, Qcoding_system_define_form);
8535
8536   if (! NILP (pending))
8537     {
8538       /* Clear the property first, then evaluate the form it held.  */
8539       Fput (coding_system, Qcoding_system_define_form, Qnil);
8540       safe_eval (pending);
8541     }
8542   if (NILP (Fcoding_system_p (coding_system)))
8543     xsignal1 (Qcoding_system_error, coding_system);
8544   return coding_system;
8545 }
8546
8547 \f
8548 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8549    HIGHEST, return the coding system of the highest
8550    priority among the detected coding systems.  Otherwise return a
8551    list of detected coding systems sorted by their priorities.  If
8552    MULTIBYTEP, it is assumed that the bytes are in correct
8553    multibyte form but contain only ASCII and eight-bit chars.
8554    Otherwise, the bytes are raw bytes.
8555
8556    CODING-SYSTEM controls the detection as below:
8557
8558    If it is nil, detect both text-format and eol-format.  If the
8559    text-format part of CODING-SYSTEM is already specified
8560    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8561    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8562    detect only text-format.  */
8563
8564 Lisp_Object
8565 detect_coding_system (const unsigned char *src,
8566                       ptrdiff_t src_chars, ptrdiff_t src_bytes,
8567                       bool highest, bool multibytep,
8568                       Lisp_Object coding_system)
8569 {
8570   const unsigned char *src_end = src + src_bytes;
8571   Lisp_Object attrs, eol_type;
       /* VAL first collects coding system IDs; the eol pass at the end
          replaces each ID by a coding system name.  */
8572   Lisp_Object val = Qnil;
8573   struct coding_system coding;
8574   ptrdiff_t id;
8575   struct coding_detection_info detect_info;
8576   enum coding_category base_category;
8577   bool null_byte_found = 0, eight_bit_found = 0;
8578
       /* A nil CODING-SYSTEM is treated as `undecided'.  */
8579   if (NILP (coding_system))
8580     coding_system = Qundecided;
8581   setup_coding_system (coding_system, &coding);
8582   attrs = CODING_ID_ATTRS (coding.id);
8583   eol_type = CODING_ID_EOL_TYPE (coding.id);
8584   coding_system = CODING_ATTR_BASE_NAME (attrs);
8585
8586   coding.source = src;
8587   coding.src_chars = src_chars;
8588   coding.src_bytes = src_bytes;
8589   coding.src_multibyte = multibytep;
8590   coding.consumed = 0;
8591   coding.mode |= CODING_MODE_LAST_BLOCK;
8592   coding.head_ascii = 0;
8593
8594   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8595
8596   /* At first, detect text-format if necessary.  */
8597   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8598   if (base_category == coding_category_undecided)
8599     {
8600       enum coding_category category IF_LINT (= 0);
8601       struct coding_system *this IF_LINT (= NULL);
8602       int c, i;
8603       bool inhibit_nbd = inhibit_flag (coding.spec.undecided.inhibit_nbd,
8604                                        inhibit_null_byte_detection);
8605       bool inhibit_ied = inhibit_flag (coding.spec.undecided.inhibit_ied,
8606                                        inhibit_iso_escape_detection);
8607       bool prefer_utf_8 = coding.spec.undecided.prefer_utf_8;
8608
8609       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
           /* coding.head_ascii is advanced for each byte without the high
              bit until the first eight-bit byte is seen.  The scan stops
              early once both a null byte and an eight-bit byte have been
              seen, or when the ISO-2022 detector reports a result.  */
8610       for (; src < src_end; src++)
8611         {
8612           c = *src;
8613           if (c & 0x80)
8614             {
8615               eight_bit_found = 1;
8616               if (null_byte_found)
8617                 break;
8618             }
8619           else if (c < 0x20)
8620             {
8621               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8622                   && ! inhibit_ied
8623                   && ! detect_info.checked)
8624                 {
8625                   if (detect_coding_iso_2022 (&coding, &detect_info))
8626                     {
8627                       /* We have scanned the whole data.  */
8628                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8629                         {
8630                           /* We didn't find an 8-bit code.  We may
8631                              have found a null-byte, but it's very
8632                              rare that a binary file conforms to
8633                              ISO-2022.  */
8634                           src = src_end;
8635                           coding.head_ascii = src - coding.source;
8636                         }
8637                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8638                       break;
8639                     }
8640                 }
8641               else if (! c && !inhibit_nbd)
8642                 {
8643                   null_byte_found = 1;
8644                   if (eight_bit_found)
8645                     break;
8646                 }
8647               if (! eight_bit_found)
8648                 coding.head_ascii++;
8649             }
8650           else if (! eight_bit_found)
8651             coding.head_ascii++;
8652         }
8653
           /* Run the per-category detectors only if the scan above saw
              something other than plain ASCII.  */
8654       if (null_byte_found || eight_bit_found
8655           || coding.head_ascii < coding.src_bytes
8656           || detect_info.found)
8657         {
8658           if (coding.head_ascii == coding.src_bytes)
8659             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8660             for (i = 0; i < coding_category_raw_text; i++)
8661               {
8662                 category = coding_priorities[i];
8663                 this = coding_categories + category;
8664                 if (detect_info.found & (1 << category))
8665                   break;
8666               }
8667           else
8668             {
                   /* A null byte leaves only the UTF-16 categories open.
                      Otherwise, when UTF-8 is preferred and its detector
                      succeeds, only the UTF-8 categories stay open.  */
8669               if (null_byte_found)
8670                 {
8671                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8672                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8673                 }
8674               else if (prefer_utf_8
8675                        && detect_coding_utf_8 (&coding, &detect_info))
8676                 {
8677                   detect_info.checked |= ~CATEGORY_MASK_UTF_8;
8678                   detect_info.rejected |= ~CATEGORY_MASK_UTF_8;
8679                 }
                   /* Try the categories in priority order.  With HIGHEST,
                      stop at the first category that is found; CATEGORY
                      and THIS then describe the winner.  */
8680               for (i = 0; i < coding_category_raw_text; i++)
8681                 {
8682                   category = coding_priorities[i];
8683                   this = coding_categories + category;
8684
8685                   if (this->id < 0)
8686                     {
8687                       /* No coding system of this category is defined.  */
8688                       detect_info.rejected |= (1 << category);
8689                     }
8690                   else if (category >= coding_category_raw_text)
8691                     continue;
8692                   else if (detect_info.checked & (1 << category))
8693                     {
8694                       if (highest
8695                           && (detect_info.found & (1 << category)))
8696                         break;
8697                     }
8698                   else if ((*(this->detector)) (&coding, &detect_info)
8699                            && highest
8700                            && (detect_info.found & (1 << category)))
8701                     {
                           /* Narrow utf-16-auto down to the little- or
                              big-endian category.  Note that THIS is left
                              pointing at the utf-16-auto entry.  */
8702                       if (category == coding_category_utf_16_auto)
8703                         {
8704                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8705                             category = coding_category_utf_16_le;
8706                           else
8707                             category = coding_category_utf_16_be;
8708                         }
8709                       break;
8710                     }
8711                 }
8712             }
8713         }
8714
           /* Build VAL from the detection result.  Everything rejected,
              or a null byte seen: answer `no-conversion'.  */
8715       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8716           || null_byte_found)
8717         {
8718           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8719           id = CODING_SYSTEM_ID (Qno_conversion);
8720           val = list1 (make_number (id));
8721         }
           /* Nothing rejected and nothing found: answer with the coding
              system of the undecided category.  */
8722       else if (! detect_info.rejected && ! detect_info.found)
8723         {
8724           detect_info.found = CATEGORY_MASK_ANY;
8725           id = coding_categories[coding_category_undecided].id;
8726           val = list1 (make_number (id));
8727         }
8728       else if (highest)
8729         {
               /* Use the category the loop above stopped at; failing that,
                  the first category in priority order that was not
                  rejected.  */
8730           if (detect_info.found)
8731             {
8732               detect_info.found = 1 << category;
8733               val = list1 (make_number (this->id));
8734             }
8735           else
8736             for (i = 0; i < coding_category_raw_text; i++)
8737               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8738                 {
8739                   detect_info.found = 1 << coding_priorities[i];
8740                   id = coding_categories[coding_priorities[i]].id;
8741                   val = list1 (make_number (id));
8742                   break;
8743                 }
8744         }
8745       else
8746         {
8747           int mask = detect_info.rejected | detect_info.found;
8748           int found = 0;
8749
               /* First pass, from the lowest priority up: categories that
                  were neither rejected nor found.  VAL is overwritten on
                  every hit, so only the last one visited (the one of
                  highest priority) remains in the list.
                  NOTE(review): confirm that keeping a single element here
                  is intended.  */
8750           for (i = coding_category_raw_text - 1; i >= 0; i--)
8751             {
8752               category = coding_priorities[i];
8753               if (! (mask & (1 << category)))
8754                 {
8755                   found |= 1 << category;
8756                   id = coding_categories[category].id;
8757                   if (id >= 0)
8758                     val = list1 (make_number (id));
8759                 }
8760             }
               /* Second pass: push the categories that were found in front
                  of VAL, so that they end up in priority order.  */
8761           for (i = coding_category_raw_text - 1; i >= 0; i--)
8762             {
8763               category = coding_priorities[i];
8764               if (detect_info.found & (1 << category))
8765                 {
8766                   id = coding_categories[category].id;
8767                   val = Fcons (make_number (id), val);
8768                 }
8769             }
8770           detect_info.found |= found;
8771         }
8772     }
8773   else if (base_category == coding_category_utf_8_auto)
8774     {
           /* Only decide between UTF-8 with and without signature.  VAL
              stays nil if the detector fails.  */
8775       if (detect_coding_utf_8 (&coding, &detect_info))
8776         {
8777           struct coding_system *this;
8778
8779           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8780             this = coding_categories + coding_category_utf_8_sig;
8781           else
8782             this = coding_categories + coding_category_utf_8_nosig;
8783           val = list1 (make_number (this->id));
8784         }
8785     }
8786   else if (base_category == coding_category_utf_16_auto)
8787     {
           /* Only decide among the UTF-16 variants.  VAL stays nil if the
              detector fails.  */
8788       if (detect_coding_utf_16 (&coding, &detect_info))
8789         {
8790           struct coding_system *this;
8791
8792           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8793             this = coding_categories + coding_category_utf_16_le;
8794           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8795             this = coding_categories + coding_category_utf_16_be;
8796           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8797             this = coding_categories + coding_category_utf_16_be_nosig;
8798           else
8799             this = coding_categories + coding_category_utf_16_le_nosig;
8800           val = list1 (make_number (this->id));
8801         }
8802     }
8803   else
8804     {
           /* The text-format is already specified; take it as found.  */
8805       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8806       val = list1 (make_number (coding.id));
8807     }
8808
8809   /* Then, detect eol-format if necessary.  */
8810   {
         /* -1 means that the eol-format was not examined for that group
            of categories.  */
8811     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8812     Lisp_Object tail;
8813
         /* A vector eol type means the eol-format is still to be
            detected; detect it separately for plain bytes, UTF-16BE and
            UTF-16LE, as far as such categories were found.  */
8814     if (VECTORP (eol_type))
8815       {
8816         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8817           {
8818             if (null_byte_found)
8819               normal_eol = EOL_SEEN_LF;
8820             else
8821               normal_eol = detect_eol (coding.source, src_bytes,
8822                                        coding_category_raw_text);
8823           }
8824         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8825                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8826           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8827                                       coding_category_utf_16_be);
8828         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8829                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8830           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8831                                       coding_category_utf_16_le);
8832       }
8833     else
8834       {
             /* The eol-format is fixed by CODING-SYSTEM: unix, dos, or
                else CR.  */
8835         if (EQ (eol_type, Qunix))
8836           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8837         else if (EQ (eol_type, Qdos))
8838           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8839         else
8840           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8841       }
8842
         /* Replace each ID in VAL by a coding system name: the eol
            variant matching the detected eol-format when the coding
            system has variants (a vector eol type), else its own name.  */
8843     for (tail = val; CONSP (tail); tail = XCDR (tail))
8844       {
8845         enum coding_category category;
8846         int this_eol;
8847
8848         id = XINT (XCAR (tail));
8849         attrs = CODING_ID_ATTRS (id);
8850         category = XINT (CODING_ATTR_CATEGORY (attrs));
8851         eol_type = CODING_ID_EOL_TYPE (id);
8852         if (VECTORP (eol_type))
8853           {
8854             if (category == coding_category_utf_16_be
8855                 || category == coding_category_utf_16_be_nosig)
8856               this_eol = utf_16_be_eol;
8857             else if (category == coding_category_utf_16_le
8858                      || category == coding_category_utf_16_le_nosig)
8859               this_eol = utf_16_le_eol;
8860             else
8861               this_eol = normal_eol;
8862
8863             if (this_eol == EOL_SEEN_LF)
8864               XSETCAR (tail, AREF (eol_type, 0));
8865             else if (this_eol == EOL_SEEN_CRLF)
8866               XSETCAR (tail, AREF (eol_type, 1));
8867             else if (this_eol == EOL_SEEN_CR)
8868               XSETCAR (tail, AREF (eol_type, 2));
8869             else
8870               XSETCAR (tail, CODING_ID_NAME (id));
8871           }
8872         else
8873           XSETCAR (tail, CODING_ID_NAME (id));
8874       }
8875   }
8876
       /* With HIGHEST, return just the first element (or nil if VAL is
          empty); otherwise return the whole list.  */
8877   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8878 }
8879
8880
8881 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8882        2, 3, 0,
8883        doc: /* Detect coding system of the text in the region between START and END.
8884 Return a list of possible coding systems ordered by priority.
8885 The coding systems to try and their priorities follows what
8886 the function `coding-system-priority-list' (which see) returns.
8887
8888 If only ASCII characters are found (except for such ISO-2022 control
8889 characters as ESC), it returns a list of single element `undecided'
8890 or its subsidiary coding system according to a detected end-of-line
8891 format.
8892
8893 If optional argument HIGHEST is non-nil, return the coding system of
8894 highest priority.  */)
8895   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8896 {
8897   ptrdiff_t from, to;
8898   ptrdiff_t from_byte, to_byte;
8899
8900   validate_region (&start, &end);
8901   from = XINT (start), to = XINT (end);
8902   from_byte = CHAR_TO_BYTE (from);
8903   to_byte = CHAR_TO_BYTE (to);
8904
8905   if (from < GPT && to >= GPT)
8906     move_gap_both (to, to_byte);
8907
8908   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8909                                to - from, to_byte - from_byte,
8910                                !NILP (highest),
8911                                !NILP (BVAR (current_buffer
8912                                       , enable_multibyte_characters)),
8913                                Qnil);
8914 }
8915
8916 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8917        1, 2, 0,
8918        doc: /* Detect coding system of the text in STRING.
8919 Return a list of possible coding systems ordered by priority.
8920 The coding systems to try and their priorities follows what
8921 the function `coding-system-priority-list' (which see) returns.
8922
8923 If only ASCII characters are found (except for such ISO-2022 control
8924 characters as ESC), it returns a list of single element `undecided'
8925 or its subsidiary coding system according to a detected end-of-line
8926 format.
8927
8928 If optional argument HIGHEST is non-nil, return the coding system of
8929 highest priority.  */)
8930   (Lisp_Object string, Lisp_Object highest)
8931 {
8932   CHECK_STRING (string);
8933
8934   return detect_coding_system (SDATA (string),
8935                                SCHARS (string), SBYTES (string),
8936                                !NILP (highest), STRING_MULTIBYTE (string),
8937                                Qnil);
8938 }
8939
8940
8941 static bool
8942 char_encodable_p (int c, Lisp_Object attrs)
8943 {
8944   Lisp_Object tail;
8945   struct charset *charset;
8946   Lisp_Object translation_table;
8947
8948   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8949   if (! NILP (translation_table))
8950     c = translate_char (translation_table, c);
8951   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8952        CONSP (tail); tail = XCDR (tail))
8953     {
8954       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8955       if (CHAR_CHARSET_P (c, charset))
8956         break;
8957     }
8958   return (! NILP (tail));
8959 }
8960
8961
8962 /* Return a list of coding systems that safely encode the text between
8963    START and END.  If EXCLUDE is non-nil, it is a list of coding
8964    systems not to check.  The returned list doesn't contain any such
8965    coding systems.  In any case, if the text contains only ASCII or is
8966    unibyte, return t.  */
8967
8968 DEFUN ("find-coding-systems-region-internal",
8969        Ffind_coding_systems_region_internal,
8970        Sfind_coding_systems_region_internal, 2, 3, 0,
8971        doc: /* Internal use only.  */)
8972   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8973 {
8974   Lisp_Object coding_attrs_list, safe_codings;
8975   ptrdiff_t start_byte, end_byte;
8976   const unsigned char *p, *pbeg, *pend;
8977   int c;
8978   Lisp_Object tail, elt, work_table;
8979
8980   if (STRINGP (start))
8981     {
8982       if (!STRING_MULTIBYTE (start)
8983           || SCHARS (start) == SBYTES (start))
8984         return Qt;
8985       start_byte = 0;
8986       end_byte = SBYTES (start);
8987     }
8988   else
8989     {
8990       CHECK_NUMBER_COERCE_MARKER (start);
8991       CHECK_NUMBER_COERCE_MARKER (end);
8992       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8993         args_out_of_range (start, end);
8994       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8995         return Qt;
8996       start_byte = CHAR_TO_BYTE (XINT (start));
8997       end_byte = CHAR_TO_BYTE (XINT (end));
8998       if (XINT (end) - XINT (start) == end_byte - start_byte)
8999         return Qt;
9000
9001       if (XINT (start) < GPT && XINT (end) > GPT)
9002         {
9003           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9004             move_gap_both (XINT (start), start_byte);
9005           else
9006             move_gap_both (XINT (end), end_byte);
9007         }
9008     }
9009
9010   coding_attrs_list = Qnil;
9011   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
9012     if (NILP (exclude)
9013         || NILP (Fmemq (XCAR (tail), exclude)))
9014       {
9015         Lisp_Object attrs;
9016
9017         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
9018         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs)))
9019           {
9020             ASET (attrs, coding_attr_trans_tbl,
9021                   get_translation_table (attrs, 1, NULL));
9022             coding_attrs_list = Fcons (attrs, coding_attrs_list);
9023           }
9024       }
9025
9026   if (STRINGP (start))
9027     p = pbeg = SDATA (start);
9028   else
9029     p = pbeg = BYTE_POS_ADDR (start_byte);
9030   pend = p + (end_byte - start_byte);
9031
9032   while (p < pend && ASCII_BYTE_P (*p)) p++;
9033   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9034
9035   work_table = Fmake_char_table (Qnil, Qnil);
9036   while (p < pend)
9037     {
9038       if (ASCII_BYTE_P (*p))
9039         p++;
9040       else
9041         {
9042           c = STRING_CHAR_ADVANCE (p);
9043           if (!NILP (char_table_ref (work_table, c)))
9044             /* This character was already checked.  Ignore it.  */
9045             continue;
9046
9047           charset_map_loaded = 0;
9048           for (tail = coding_attrs_list; CONSP (tail);)
9049             {
9050               elt = XCAR (tail);
9051               if (NILP (elt))
9052                 tail = XCDR (tail);
9053               else if (char_encodable_p (c, elt))
9054                 tail = XCDR (tail);
9055               else if (CONSP (XCDR (tail)))
9056                 {
9057                   XSETCAR (tail, XCAR (XCDR (tail)));
9058                   XSETCDR (tail, XCDR (XCDR (tail)));
9059                 }
9060               else
9061                 {
9062                   XSETCAR (tail, Qnil);
9063                   tail = XCDR (tail);
9064                 }
9065             }
9066           if (charset_map_loaded)
9067             {
9068               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9069
9070               if (STRINGP (start))
9071                 pbeg = SDATA (start);
9072               else
9073                 pbeg = BYTE_POS_ADDR (start_byte);
9074               p = pbeg + p_offset;
9075               pend = pbeg + pend_offset;
9076             }
9077           char_table_set (work_table, c, Qt);
9078         }
9079     }
9080
9081   safe_codings = list2 (Qraw_text, Qno_conversion);
9082   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
9083     if (! NILP (XCAR (tail)))
9084       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
9085
9086   return safe_codings;
9087 }
9088
9089
9090 DEFUN ("unencodable-char-position", Funencodable_char_position,
9091        Sunencodable_char_position, 3, 5, 0,
9092        doc: /*
9093 Return position of first un-encodable character in a region.
9094 START and END specify the region and CODING-SYSTEM specifies the
9095 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
9096
9097 If optional 4th argument COUNT is non-nil, it specifies at most how
9098 many un-encodable characters to search.  In this case, the value is a
9099 list of positions.
9100
9101 If optional 5th argument STRING is non-nil, it is a string to search
9102 for un-encodable characters.  In that case, START and END are indexes
9103 to the string.  */)
9104   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
9105 {
9106   EMACS_INT n;
9107   struct coding_system coding;
9108   Lisp_Object attrs, charset_list, translation_table;
9109   Lisp_Object positions;
9110   ptrdiff_t from, to;
9111   const unsigned char *p, *stop, *pend;
9112   bool ascii_compatible;
9113
9114   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
9115   attrs = CODING_ID_ATTRS (coding.id);
9116   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
9117     return Qnil;
9118   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
9119   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9120   translation_table = get_translation_table (attrs, 1, NULL);
9121
9122   if (NILP (string))
9123     {
9124       validate_region (&start, &end);
9125       from = XINT (start);
9126       to = XINT (end);
9127       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
9128           || (ascii_compatible
9129               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
9130         return Qnil;
9131       p = CHAR_POS_ADDR (from);
9132       pend = CHAR_POS_ADDR (to);
9133       if (from < GPT && to >= GPT)
9134         stop = GPT_ADDR;
9135       else
9136         stop = pend;
9137     }
9138   else
9139     {
9140       CHECK_STRING (string);
9141       CHECK_NATNUM (start);
9142       CHECK_NATNUM (end);
9143       if (! (XINT (start) <= XINT (end) && XINT (end) <= SCHARS (string)))
9144         args_out_of_range_3 (string, start, end);
9145       from = XINT (start);
9146       to = XINT (end);
9147       if (! STRING_MULTIBYTE (string))
9148         return Qnil;
9149       p = SDATA (string) + string_char_to_byte (string, from);
9150       stop = pend = SDATA (string) + string_char_to_byte (string, to);
9151       if (ascii_compatible && (to - from) == (pend - p))
9152         return Qnil;
9153     }
9154
9155   if (NILP (count))
9156     n = 1;
9157   else
9158     {
9159       CHECK_NATNUM (count);
9160       n = XINT (count);
9161     }
9162
9163   positions = Qnil;
9164   charset_map_loaded = 0;
9165   while (1)
9166     {
9167       int c;
9168
9169       if (ascii_compatible)
9170         while (p < stop && ASCII_BYTE_P (*p))
9171           p++, from++;
9172       if (p >= stop)
9173         {
9174           if (p >= pend)
9175             break;
9176           stop = pend;
9177           p = GAP_END_ADDR;
9178         }
9179
9180       c = STRING_CHAR_ADVANCE (p);
9181       if (! (ASCII_CHAR_P (c) && ascii_compatible)
9182           && ! char_charset (translate_char (translation_table, c),
9183                              charset_list, NULL))
9184         {
9185           positions = Fcons (make_number (from), positions);
9186           n--;
9187           if (n == 0)
9188             break;
9189         }
9190
9191       from++;
9192       if (charset_map_loaded && NILP (string))
9193         {
9194           p = CHAR_POS_ADDR (from);
9195           pend = CHAR_POS_ADDR (to);
9196           if (from < GPT && to >= GPT)
9197             stop = GPT_ADDR;
9198           else
9199             stop = pend;
9200           charset_map_loaded = 0;
9201         }
9202     }
9203
9204   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
9205 }
9206
9207
9208 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
9209        Scheck_coding_systems_region, 3, 3, 0,
9210        doc: /* Check if the region is encodable by coding systems.
9211
9212 START and END are buffer positions specifying the region.
9213 CODING-SYSTEM-LIST is a list of coding systems to check.
9214
9215 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
9216 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
9217 whole region, POS0, POS1, ... are buffer positions where non-encodable
9218 characters are found.
9219
9220 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
9221 value is nil.
9222
9223 START may be a string.  In that case, check if the string is
9224 encodable, and the value contains indices to the string instead of
9225 buffer positions.  END is ignored.
9226
9227 If the current buffer (or START if it is a string) is unibyte, the value
9228 is nil.  */)
9229   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
9230 {
9231   Lisp_Object list;
9232   ptrdiff_t start_byte, end_byte;
9233   ptrdiff_t pos;
9234   const unsigned char *p, *pbeg, *pend;
9235   int c;
9236   Lisp_Object tail, elt, attrs;
9237
9238   if (STRINGP (start))
9239     {
9240       if (!STRING_MULTIBYTE (start)
9241           || SCHARS (start) == SBYTES (start))
9242         return Qnil;
9243       start_byte = 0;
9244       end_byte = SBYTES (start);
9245       pos = 0;
9246     }
9247   else
9248     {
9249       CHECK_NUMBER_COERCE_MARKER (start);
9250       CHECK_NUMBER_COERCE_MARKER (end);
9251       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
9252         args_out_of_range (start, end);
9253       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
9254         return Qnil;
9255       start_byte = CHAR_TO_BYTE (XINT (start));
9256       end_byte = CHAR_TO_BYTE (XINT (end));
9257       if (XINT (end) - XINT (start) == end_byte - start_byte)
9258         return Qnil;
9259
9260       if (XINT (start) < GPT && XINT (end) > GPT)
9261         {
9262           if ((GPT - XINT (start)) < (XINT (end) - GPT))
9263             move_gap_both (XINT (start), start_byte);
9264           else
9265             move_gap_both (XINT (end), end_byte);
9266         }
9267       pos = XINT (start);
9268     }
9269
9270   list = Qnil;
9271   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
9272     {
9273       elt = XCAR (tail);
9274       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
9275       ASET (attrs, coding_attr_trans_tbl,
9276             get_translation_table (attrs, 1, NULL));
9277       list = Fcons (list2 (elt, attrs), list);
9278     }
9279
9280   if (STRINGP (start))
9281     p = pbeg = SDATA (start);
9282   else
9283     p = pbeg = BYTE_POS_ADDR (start_byte);
9284   pend = p + (end_byte - start_byte);
9285
9286   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
9287   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
9288
9289   while (p < pend)
9290     {
9291       if (ASCII_BYTE_P (*p))
9292         p++;
9293       else
9294         {
9295           c = STRING_CHAR_ADVANCE (p);
9296
9297           charset_map_loaded = 0;
9298           for (tail = list; CONSP (tail); tail = XCDR (tail))
9299             {
9300               elt = XCDR (XCAR (tail));
9301               if (! char_encodable_p (c, XCAR (elt)))
9302                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
9303             }
9304           if (charset_map_loaded)
9305             {
9306               ptrdiff_t p_offset = p - pbeg, pend_offset = pend - pbeg;
9307
9308               if (STRINGP (start))
9309                 pbeg = SDATA (start);
9310               else
9311                 pbeg = BYTE_POS_ADDR (start_byte);
9312               p = pbeg + p_offset;
9313               pend = pbeg + pend_offset;
9314             }
9315         }
9316       pos++;
9317     }
9318
9319   tail = list;
9320   list = Qnil;
9321   for (; CONSP (tail); tail = XCDR (tail))
9322     {
9323       elt = XCAR (tail);
9324       if (CONSP (XCDR (XCDR (elt))))
9325         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
9326                       list);
9327     }
9328
9329   return list;
9330 }
9331
9332
9333 static Lisp_Object
9334 code_convert_region (Lisp_Object start, Lisp_Object end,
9335                      Lisp_Object coding_system, Lisp_Object dst_object,
9336                      bool encodep, bool norecord)
9337 {
9338   struct coding_system coding;
9339   ptrdiff_t from, from_byte, to, to_byte;
9340   Lisp_Object src_object;
9341
9342   if (NILP (coding_system))
9343     coding_system = Qno_conversion;
9344   else
9345     CHECK_CODING_SYSTEM (coding_system);
9346   src_object = Fcurrent_buffer ();
9347   if (NILP (dst_object))
9348     dst_object = src_object;
9349   else if (! EQ (dst_object, Qt))
9350     CHECK_BUFFER (dst_object);
9351
9352   validate_region (&start, &end);
9353   from = XFASTINT (start);
9354   from_byte = CHAR_TO_BYTE (from);
9355   to = XFASTINT (end);
9356   to_byte = CHAR_TO_BYTE (to);
9357
9358   setup_coding_system (coding_system, &coding);
9359   coding.mode |= CODING_MODE_LAST_BLOCK;
9360
9361   if (BUFFERP (dst_object) && !EQ (dst_object, src_object))
9362     {
9363       struct buffer *buf = XBUFFER (dst_object);
9364       ptrdiff_t buf_pt = BUF_PT (buf);
9365
9366       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9367     }
9368
9369   if (encodep)
9370     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9371                           dst_object);
9372   else
9373     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9374                           dst_object);
9375   if (! norecord)
9376     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9377
9378   return (BUFFERP (dst_object)
9379           ? make_number (coding.produced_char)
9380           : coding.dst_object);
9381 }
9382
9383
9384 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9385        3, 4, "r\nzCoding system: ",
9386        doc: /* Decode the current region from the specified coding system.
9387 When called from a program, takes four arguments:
9388         START, END, CODING-SYSTEM, and DESTINATION.
9389 START and END are buffer positions.
9390
9391 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9392 If nil, the region between START and END is replaced by the decoded text.
9393 If buffer, the decoded text is inserted in that buffer after point (point
9394 does not move).
9395 In those cases, the length of the decoded text is returned.
9396 If DESTINATION is t, the decoded text is returned.
9397
9398 This function sets `last-coding-system-used' to the precise coding system
9399 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9400 not fully specified.)  */)
9401   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9402 {
9403   return code_convert_region (start, end, coding_system, destination, 0, 0);
9404 }
9405
9406 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9407        3, 4, "r\nzCoding system: ",
9408        doc: /* Encode the current region by specified coding system.
9409 When called from a program, takes four arguments:
9410         START, END, CODING-SYSTEM and DESTINATION.
9411 START and END are buffer positions.
9412
9413 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9414 If nil, the region between START and END is replace by the encoded text.
9415 If buffer, the encoded text is inserted in that buffer after point (point
9416 does not move).
9417 In those cases, the length of the encoded text is returned.
9418 If DESTINATION is t, the encoded text is returned.
9419
9420 This function sets `last-coding-system-used' to the precise coding system
9421 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9422 not fully specified.)  */)
9423   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
9424 {
9425   return code_convert_region (start, end, coding_system, destination, 1, 0);
9426 }
9427
9428 Lisp_Object
9429 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
9430                      Lisp_Object dst_object, bool encodep, bool nocopy,
9431                      bool norecord)
9432 {
9433   struct coding_system coding;
9434   ptrdiff_t chars, bytes;
9435
9436   CHECK_STRING (string);
9437   if (NILP (coding_system))
9438     {
9439       if (! norecord)
9440         Vlast_coding_system_used = Qno_conversion;
9441       if (NILP (dst_object))
9442         return (nocopy ? Fcopy_sequence (string) : string);
9443     }
9444
9445   if (NILP (coding_system))
9446     coding_system = Qno_conversion;
9447   else
9448     CHECK_CODING_SYSTEM (coding_system);
9449   if (NILP (dst_object))
9450     dst_object = Qt;
9451   else if (! EQ (dst_object, Qt))
9452     CHECK_BUFFER (dst_object);
9453
9454   setup_coding_system (coding_system, &coding);
9455   coding.mode |= CODING_MODE_LAST_BLOCK;
9456   chars = SCHARS (string);
9457   bytes = SBYTES (string);
9458
9459   if (BUFFERP (dst_object))
9460     {
9461       struct buffer *buf = XBUFFER (dst_object);
9462       ptrdiff_t buf_pt = BUF_PT (buf);
9463
9464       invalidate_buffer_caches (buf, buf_pt, buf_pt);
9465     }
9466
9467   if (encodep)
9468     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9469   else
9470     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9471   if (! norecord)
9472     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9473
9474   return (BUFFERP (dst_object)
9475           ? make_number (coding.produced_char)
9476           : coding.dst_object);
9477 }
9478
9479
9480 /* Encode or decode STRING according to CODING_SYSTEM.
9481    Do not set Vlast_coding_system_used.
9482
9483    This function is called only from macros DECODE_FILE and
9484    ENCODE_FILE, thus we ignore character composition.  */
9485
9486 Lisp_Object
9487 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9488                               bool encodep)
9489 {
9490   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9491 }
9492
9493 /* Encode or decode a file name, to or from a unibyte string suitable
9494    for passing to C library functions.  */
9495 Lisp_Object
9496 decode_file_name (Lisp_Object fname)
9497 {
9498 #ifdef WINDOWSNT
9499   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9500      converts the file names either to UTF-16LE or to the system ANSI
9501      codepage internally, depending on the underlying OS; see w32.c.  */
9502   if (! NILP (Fcoding_system_p (Qutf_8)))
9503     return code_convert_string_norecord (fname, Qutf_8, 0);
9504   return fname;
9505 #else  /* !WINDOWSNT */
9506   if (! NILP (Vfile_name_coding_system))
9507     return code_convert_string_norecord (fname, Vfile_name_coding_system, 0);
9508   else if (! NILP (Vdefault_file_name_coding_system))
9509     return code_convert_string_norecord (fname,
9510                                          Vdefault_file_name_coding_system, 0);
9511   else
9512     return fname;
9513 #endif
9514 }
9515
9516 Lisp_Object
9517 encode_file_name (Lisp_Object fname)
9518 {
9519   /* This is especially important during bootstrap and dumping, when
9520      file-name encoding is not yet known, and therefore any non-ASCII
9521      file names are unibyte strings, and could only be thrashed if we
9522      try to encode them.  */
9523   if (!STRING_MULTIBYTE (fname))
9524     return fname;
9525 #ifdef WINDOWSNT
9526   /* The w32 build pretends to use UTF-8 for file-name encoding, and
9527      converts the file names either to UTF-16LE or to the system ANSI
9528      codepage internally, depending on the underlying OS; see w32.c.  */
9529   if (! NILP (Fcoding_system_p (Qutf_8)))
9530     return code_convert_string_norecord (fname, Qutf_8, 1);
9531   return fname;
9532 #else  /* !WINDOWSNT */
9533   if (! NILP (Vfile_name_coding_system))
9534     return code_convert_string_norecord (fname, Vfile_name_coding_system, 1);
9535   else if (! NILP (Vdefault_file_name_coding_system))
9536     return code_convert_string_norecord (fname,
9537                                          Vdefault_file_name_coding_system, 1);
9538   else
9539     return fname;
9540 #endif
9541 }
9542
9543 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9544        2, 4, 0,
9545        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9546
9547 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9548 if the decoding operation is trivial.
9549
9550 Optional fourth arg BUFFER non-nil means that the decoded text is
9551 inserted in that buffer after point (point does not move).  In this
9552 case, the return value is the length of the decoded text.
9553
9554 This function sets `last-coding-system-used' to the precise coding system
9555 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9556 not fully specified.)  */)
9557   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9558 {
9559   return code_convert_string (string, coding_system, buffer,
9560                               0, ! NILP (nocopy), 0);
9561 }
9562
9563 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9564        2, 4, 0,
9565        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9566
9567 Optional third arg NOCOPY non-nil means it is OK to return STRING
9568 itself if the encoding operation is trivial.
9569
9570 Optional fourth arg BUFFER non-nil means that the encoded text is
9571 inserted in that buffer after point (point does not move).  In this
9572 case, the return value is the length of the encoded text.
9573
9574 This function sets `last-coding-system-used' to the precise coding system
9575 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9576 not fully specified.)  */)
9577   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9578 {
9579   return code_convert_string (string, coding_system, buffer,
9580                               1, ! NILP (nocopy), 0);
9581 }
9582
9583 \f
9584 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9585        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9586 Return the corresponding character.  */)
9587   (Lisp_Object code)
9588 {
9589   Lisp_Object spec, attrs, val;
9590   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9591   EMACS_INT ch;
9592   int c;
9593
9594   CHECK_NATNUM (code);
9595   ch = XFASTINT (code);
9596   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9597   attrs = AREF (spec, 0);
9598
9599   if (ASCII_BYTE_P (ch)
9600       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9601     return code;
9602
9603   val = CODING_ATTR_CHARSET_LIST (attrs);
9604   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9605   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9606   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9607
9608   if (ch <= 0x7F)
9609     {
9610       c = ch;
9611       charset = charset_roman;
9612     }
9613   else if (ch >= 0xA0 && ch < 0xDF)
9614     {
9615       c = ch - 0x80;
9616       charset = charset_kana;
9617     }
9618   else
9619     {
9620       EMACS_INT c1 = ch >> 8;
9621       int c2 = ch & 0xFF;
9622
9623       if (c1 < 0x81 || (c1 > 0x9F && c1 < 0xE0) || c1 > 0xEF
9624           || c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
9625         error ("Invalid code: %"pI"d", ch);
9626       c = ch;
9627       SJIS_TO_JIS (c);
9628       charset = charset_kanji;
9629     }
9630   c = DECODE_CHAR (charset, c);
9631   if (c < 0)
9632     error ("Invalid code: %"pI"d", ch);
9633   return make_number (c);
9634 }
9635
9636
9637 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9638        doc: /* Encode a Japanese character CH to shift_jis encoding.
9639 Return the corresponding code in SJIS.  */)
9640   (Lisp_Object ch)
9641 {
9642   Lisp_Object spec, attrs, charset_list;
9643   int c;
9644   struct charset *charset;
9645   unsigned code;
9646
9647   CHECK_CHARACTER (ch);
9648   c = XFASTINT (ch);
9649   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9650   attrs = AREF (spec, 0);
9651
9652   if (ASCII_CHAR_P (c)
9653       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9654     return ch;
9655
9656   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9657   charset = char_charset (c, charset_list, &code);
9658   if (code == CHARSET_INVALID_CODE (charset))
9659     error ("Can't encode by shift_jis encoding: %c", c);
9660   JIS_TO_SJIS (code);
9661
9662   return make_number (code);
9663 }
9664
9665 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9666        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9667 Return the corresponding character.  */)
9668   (Lisp_Object code)
9669 {
9670   Lisp_Object spec, attrs, val;
9671   struct charset *charset_roman, *charset_big5, *charset;
9672   EMACS_INT ch;
9673   int c;
9674
9675   CHECK_NATNUM (code);
9676   ch = XFASTINT (code);
9677   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9678   attrs = AREF (spec, 0);
9679
9680   if (ASCII_BYTE_P (ch)
9681       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9682     return code;
9683
9684   val = CODING_ATTR_CHARSET_LIST (attrs);
9685   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9686   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9687
9688   if (ch <= 0x7F)
9689     {
9690       c = ch;
9691       charset = charset_roman;
9692     }
9693   else
9694     {
9695       EMACS_INT b1 = ch >> 8;
9696       int b2 = ch & 0x7F;
9697       if (b1 < 0xA1 || b1 > 0xFE
9698           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9699         error ("Invalid code: %"pI"d", ch);
9700       c = ch;
9701       charset = charset_big5;
9702     }
9703   c = DECODE_CHAR (charset, c);
9704   if (c < 0)
9705     error ("Invalid code: %"pI"d", ch);
9706   return make_number (c);
9707 }
9708
9709 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9710        doc: /* Encode the Big5 character CH to BIG5 coding system.
9711 Return the corresponding character code in Big5.  */)
9712   (Lisp_Object ch)
9713 {
9714   Lisp_Object spec, attrs, charset_list;
9715   struct charset *charset;
9716   int c;
9717   unsigned code;
9718
9719   CHECK_CHARACTER (ch);
9720   c = XFASTINT (ch);
9721   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9722   attrs = AREF (spec, 0);
9723   if (ASCII_CHAR_P (c)
9724       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9725     return ch;
9726
9727   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9728   charset = char_charset (c, charset_list, &code);
9729   if (code == CHARSET_INVALID_CODE (charset))
9730     error ("Can't encode by Big5 encoding: %c", c);
9731
9732   return make_number (code);
9733 }
9734
9735 \f
9736 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9737        Sset_terminal_coding_system_internal, 1, 2, 0,
9738        doc: /* Internal use only.  */)
9739   (Lisp_Object coding_system, Lisp_Object terminal)
9740 {
9741   struct terminal *term = get_terminal (terminal, 1);
9742   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9743   CHECK_SYMBOL (coding_system);
9744   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9745   /* We had better not send unsafe characters to terminal.  */
9746   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9747   /* Character composition should be disabled.  */
9748   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9749   terminal_coding->src_multibyte = 1;
9750   terminal_coding->dst_multibyte = 0;
9751   tset_charset_list
9752     (term, (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK
9753             ? coding_charset_list (terminal_coding)
9754             : list1 (make_number (charset_ascii))));
9755   return Qnil;
9756 }
9757
9758 DEFUN ("set-safe-terminal-coding-system-internal",
9759        Fset_safe_terminal_coding_system_internal,
9760        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9761        doc: /* Internal use only.  */)
9762   (Lisp_Object coding_system)
9763 {
9764   CHECK_SYMBOL (coding_system);
9765   setup_coding_system (Fcheck_coding_system (coding_system),
9766                        &safe_terminal_coding);
9767   /* Character composition should be disabled.  */
9768   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9769   safe_terminal_coding.src_multibyte = 1;
9770   safe_terminal_coding.dst_multibyte = 0;
9771   return Qnil;
9772 }
9773
9774 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9775        Sterminal_coding_system, 0, 1, 0,
9776        doc: /* Return coding system specified for terminal output on the given terminal.
9777 TERMINAL may be a terminal object, a frame, or nil for the selected
9778 frame's terminal device.  */)
9779   (Lisp_Object terminal)
9780 {
9781   struct coding_system *terminal_coding
9782     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9783   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9784
9785   /* For backward compatibility, return nil if it is `undecided'.  */
9786   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9787 }
9788
9789 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9790        Sset_keyboard_coding_system_internal, 1, 2, 0,
9791        doc: /* Internal use only.  */)
9792   (Lisp_Object coding_system, Lisp_Object terminal)
9793 {
9794   struct terminal *t = get_terminal (terminal, 1);
9795   CHECK_SYMBOL (coding_system);
9796   if (NILP (coding_system))
9797     coding_system = Qno_conversion;
9798   else
9799     Fcheck_coding_system (coding_system);
9800   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9801   /* Character composition should be disabled.  */
9802   TERMINAL_KEYBOARD_CODING (t)->common_flags
9803     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9804   return Qnil;
9805 }
9806
9807 DEFUN ("keyboard-coding-system",
9808        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9809        doc: /* Return coding system specified for decoding keyboard input.  */)
9810   (Lisp_Object terminal)
9811 {
9812   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9813                          (get_terminal (terminal, 1))->id);
9814 }
9815
9816 \f
9817 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9818        Sfind_operation_coding_system,  1, MANY, 0,
9819        doc: /* Choose a coding system for an operation based on the target name.
9820 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9821 DECODING-SYSTEM is the coding system to use for decoding
9822 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9823 for encoding (in case OPERATION does encoding).
9824
9825 The first argument OPERATION specifies an I/O primitive:
9826   For file I/O, `insert-file-contents' or `write-region'.
9827   For process I/O, `call-process', `call-process-region', or `start-process'.
9828   For network I/O, `open-network-stream'.
9829
9830 The remaining arguments should be the same arguments that were passed
9831 to the primitive.  Depending on which primitive, one of those arguments
9832 is selected as the TARGET.  For example, if OPERATION does file I/O,
9833 whichever argument specifies the file name is TARGET.
9834
9835 TARGET has a meaning which depends on OPERATION:
9836   For file I/O, TARGET is a file name (except for the special case below).
9837   For process I/O, TARGET is a process name.
9838   For network I/O, TARGET is a service name or a port number.
9839
9840 This function looks up what is specified for TARGET in
9841 `file-coding-system-alist', `process-coding-system-alist',
9842 or `network-coding-system-alist' depending on OPERATION.
9843 They may specify a coding system, a cons of coding systems,
9844 or a function symbol to call.
9845 In the last case, we call the function with one argument,
9846 which is a list of all the arguments given to this function.
9847 If the function can't decide a coding system, it can return
9848 `undecided' so that the normal code-detection is performed.
9849
9850 If OPERATION is `insert-file-contents', the argument corresponding to
9851 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9852 file name to look up, and BUFFER is a buffer that contains the file's
9853 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9854 function to call for FILENAME, that function should examine the
9855 contents of BUFFER instead of reading the file.
9856
9857 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9858   (ptrdiff_t nargs, Lisp_Object *args)
9859 {
9860   Lisp_Object operation, target_idx, target, val;
9861   register Lisp_Object chain;
9862
9863   if (nargs < 2)
9864     error ("Too few arguments");
9865   operation = args[0];
9866   if (!SYMBOLP (operation)
9867       || (target_idx = Fget (operation, Qtarget_idx), !NATNUMP (target_idx)))
9868     error ("Invalid first argument");
9869   if (nargs <= 1 + XFASTINT (target_idx))
9870     error ("Too few arguments for operation `%s'",
9871            SDATA (SYMBOL_NAME (operation)));
9872   target = args[XFASTINT (target_idx) + 1];
9873   if (!(STRINGP (target)
9874         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9875             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9876         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9877     error ("Invalid argument %"pI"d of operation `%s'",
9878            XFASTINT (target_idx) + 1, SDATA (SYMBOL_NAME (operation)));
9879   if (CONSP (target))
9880     target = XCAR (target);
9881
9882   chain = ((EQ (operation, Qinsert_file_contents)
9883             || EQ (operation, Qwrite_region))
9884            ? Vfile_coding_system_alist
9885            : (EQ (operation, Qopen_network_stream)
9886               ? Vnetwork_coding_system_alist
9887               : Vprocess_coding_system_alist));
9888   if (NILP (chain))
9889     return Qnil;
9890
9891   for (; CONSP (chain); chain = XCDR (chain))
9892     {
9893       Lisp_Object elt;
9894
9895       elt = XCAR (chain);
9896       if (CONSP (elt)
9897           && ((STRINGP (target)
9898                && STRINGP (XCAR (elt))
9899                && fast_string_match (XCAR (elt), target) >= 0)
9900               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9901         {
9902           val = XCDR (elt);
9903           /* Here, if VAL is both a valid coding system and a valid
9904              function symbol, we return VAL as a coding system.  */
9905           if (CONSP (val))
9906             return val;
9907           if (! SYMBOLP (val))
9908             return Qnil;
9909           if (! NILP (Fcoding_system_p (val)))
9910             return Fcons (val, val);
9911           if (! NILP (Ffboundp (val)))
9912             {
9913               /* We use call1 rather than safe_call1
9914                  so as to get bug reports about functions called here
9915                  which don't handle the current interface.  */
9916               val = call1 (val, Flist (nargs, args));
9917               if (CONSP (val))
9918                 return val;
9919               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9920                 return Fcons (val, val);
9921             }
9922           return Qnil;
9923         }
9924     }
9925   return Qnil;
9926 }
9927
9928 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9929        Sset_coding_system_priority, 0, MANY, 0,
9930        doc: /* Assign higher priority to the coding systems given as arguments.
9931 If multiple coding systems belong to the same category,
9932 all but the first one are ignored.
9933
9934 usage: (set-coding-system-priority &rest coding-systems)  */)
9935   (ptrdiff_t nargs, Lisp_Object *args)
9936 {
9937   ptrdiff_t i, j;
9938   bool changed[coding_category_max];
9939   enum coding_category priorities[coding_category_max];
9940
9941   memset (changed, 0, sizeof changed);
9942
9943   for (i = j = 0; i < nargs; i++)
9944     {
9945       enum coding_category category;
9946       Lisp_Object spec, attrs;
9947
9948       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9949       attrs = AREF (spec, 0);
9950       category = XINT (CODING_ATTR_CATEGORY (attrs));
9951       if (changed[category])
9952         /* Ignore this coding system because a coding system of the
9953            same category already had a higher priority.  */
9954         continue;
9955       changed[category] = 1;
9956       priorities[j++] = category;
9957       if (coding_categories[category].id >= 0
9958           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9959         setup_coding_system (args[i], &coding_categories[category]);
9960       Fset (AREF (Vcoding_category_table, category), args[i]);
9961     }
9962
9963   /* Now we have decided top J priorities.  Reflect the order of the
9964      original priorities to the remaining priorities.  */
9965
9966   for (i = j, j = 0; i < coding_category_max; i++, j++)
9967     {
9968       while (j < coding_category_max
9969              && changed[coding_priorities[j]])
9970         j++;
9971       if (j == coding_category_max)
9972         emacs_abort ();
9973       priorities[i] = coding_priorities[j];
9974     }
9975
9976   memcpy (coding_priorities, priorities, sizeof priorities);
9977
9978   /* Update `coding-category-list'.  */
9979   Vcoding_category_list = Qnil;
9980   for (i = coding_category_max; i-- > 0; )
9981     Vcoding_category_list
9982       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9983                Vcoding_category_list);
9984
9985   return Qnil;
9986 }
9987
9988 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9989        Scoding_system_priority_list, 0, 1, 0,
9990        doc: /* Return a list of coding systems ordered by their priorities.
9991 The list contains a subset of coding systems; i.e. coding systems
9992 assigned to each coding category (see `coding-category-list').
9993
9994 HIGHESTP non-nil means just return the highest priority one.  */)
9995   (Lisp_Object highestp)
9996 {
9997   int i;
9998   Lisp_Object val;
9999
10000   for (i = 0, val = Qnil; i < coding_category_max; i++)
10001     {
10002       enum coding_category category = coding_priorities[i];
10003       int id = coding_categories[category].id;
10004       Lisp_Object attrs;
10005
10006       if (id < 0)
10007         continue;
10008       attrs = CODING_ID_ATTRS (id);
10009       if (! NILP (highestp))
10010         return CODING_ATTR_BASE_NAME (attrs);
10011       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
10012     }
10013   return Fnreverse (val);
10014 }
10015
10016 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
10017
10018 static Lisp_Object
10019 make_subsidiaries (Lisp_Object base)
10020 {
10021   Lisp_Object subsidiaries;
10022   ptrdiff_t base_name_len = SBYTES (SYMBOL_NAME (base));
10023   char *buf = alloca (base_name_len + 6);
10024   int i;
10025
10026   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
10027   subsidiaries = make_uninit_vector (3);
10028   for (i = 0; i < 3; i++)
10029     {
10030       strcpy (buf + base_name_len, suffixes[i]);
10031       ASET (subsidiaries, i, intern (buf));
10032     }
10033   return subsidiaries;
10034 }
10035
10036
10037 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
10038        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
10039        doc: /* For internal use only.
10040 usage: (define-coding-system-internal ...)  */)
10041   (ptrdiff_t nargs, Lisp_Object *args)
10042 {
10043   Lisp_Object name;
10044   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
10045   Lisp_Object attrs;            /* Vector of attributes.  */
10046   Lisp_Object eol_type;
10047   Lisp_Object aliases;
10048   Lisp_Object coding_type, charset_list, safe_charsets;
10049   enum coding_category category;
10050   Lisp_Object tail, val;
10051   int max_charset_id = 0;
10052   int i;
10053
10054   if (nargs < coding_arg_max)
10055     goto short_args;
10056
10057   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
10058
10059   name = args[coding_arg_name];
10060   CHECK_SYMBOL (name);
10061   ASET (attrs, coding_attr_base_name, name);
10062
10063   val = args[coding_arg_mnemonic];
10064   if (! STRINGP (val))
10065     CHECK_CHARACTER (val);
10066   ASET (attrs, coding_attr_mnemonic, val);
10067
10068   coding_type = args[coding_arg_coding_type];
10069   CHECK_SYMBOL (coding_type);
10070   ASET (attrs, coding_attr_type, coding_type);
10071
10072   charset_list = args[coding_arg_charset_list];
10073   if (SYMBOLP (charset_list))
10074     {
10075       if (EQ (charset_list, Qiso_2022))
10076         {
10077           if (! EQ (coding_type, Qiso_2022))
10078             error ("Invalid charset-list");
10079           charset_list = Viso_2022_charset_list;
10080         }
10081       else if (EQ (charset_list, Qemacs_mule))
10082         {
10083           if (! EQ (coding_type, Qemacs_mule))
10084             error ("Invalid charset-list");
10085           charset_list = Vemacs_mule_charset_list;
10086         }
10087       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10088         {
10089           if (! RANGED_INTEGERP (0, XCAR (tail), INT_MAX - 1))
10090             error ("Invalid charset-list");
10091           if (max_charset_id < XFASTINT (XCAR (tail)))
10092             max_charset_id = XFASTINT (XCAR (tail));
10093         }
10094     }
10095   else
10096     {
10097       charset_list = Fcopy_sequence (charset_list);
10098       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10099         {
10100           struct charset *charset;
10101
10102           val = XCAR (tail);
10103           CHECK_CHARSET_GET_CHARSET (val, charset);
10104           if (EQ (coding_type, Qiso_2022)
10105               ? CHARSET_ISO_FINAL (charset) < 0
10106               : EQ (coding_type, Qemacs_mule)
10107               ? CHARSET_EMACS_MULE_ID (charset) < 0
10108               : 0)
10109             error ("Can't handle charset `%s'",
10110                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10111
10112           XSETCAR (tail, make_number (charset->id));
10113           if (max_charset_id < charset->id)
10114             max_charset_id = charset->id;
10115         }
10116     }
10117   ASET (attrs, coding_attr_charset_list, charset_list);
10118
10119   safe_charsets = make_uninit_string (max_charset_id + 1);
10120   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
10121   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10122     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
10123   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
10124
10125   ASET (attrs, coding_attr_ascii_compat, args[coding_arg_ascii_compatible_p]);
10126
10127   val = args[coding_arg_decode_translation_table];
10128   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10129     CHECK_SYMBOL (val);
10130   ASET (attrs, coding_attr_decode_tbl, val);
10131
10132   val = args[coding_arg_encode_translation_table];
10133   if (! CHAR_TABLE_P (val) && ! CONSP (val))
10134     CHECK_SYMBOL (val);
10135   ASET (attrs, coding_attr_encode_tbl, val);
10136
10137   val = args[coding_arg_post_read_conversion];
10138   CHECK_SYMBOL (val);
10139   ASET (attrs, coding_attr_post_read, val);
10140
10141   val = args[coding_arg_pre_write_conversion];
10142   CHECK_SYMBOL (val);
10143   ASET (attrs, coding_attr_pre_write, val);
10144
10145   val = args[coding_arg_default_char];
10146   if (NILP (val))
10147     ASET (attrs, coding_attr_default_char, make_number (' '));
10148   else
10149     {
10150       CHECK_CHARACTER (val);
10151       ASET (attrs, coding_attr_default_char, val);
10152     }
10153
10154   val = args[coding_arg_for_unibyte];
10155   ASET (attrs, coding_attr_for_unibyte, NILP (val) ? Qnil : Qt);
10156
10157   val = args[coding_arg_plist];
10158   CHECK_LIST (val);
10159   ASET (attrs, coding_attr_plist, val);
10160
10161   if (EQ (coding_type, Qcharset))
10162     {
10163       /* Generate a lisp vector of 256 elements.  Each element is nil,
10164          integer, or a list of charset IDs.
10165
10166          If Nth element is nil, the byte code N is invalid in this
10167          coding system.
10168
10169          If Nth element is a number NUM, N is the first byte of a
10170          charset whose ID is NUM.
10171
10172          If Nth element is a list of charset IDs, N is the first byte
10173          of one of them.  The list is sorted by dimensions of the
10174          charsets.  A charset of smaller dimension comes first. */
10175       val = Fmake_vector (make_number (256), Qnil);
10176
10177       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
10178         {
10179           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
10180           int dim = CHARSET_DIMENSION (charset);
10181           int idx = (dim - 1) * 4;
10182
10183           if (CHARSET_ASCII_COMPATIBLE_P (charset))
10184             ASET (attrs, coding_attr_ascii_compat, Qt);
10185
10186           for (i = charset->code_space[idx];
10187                i <= charset->code_space[idx + 1]; i++)
10188             {
10189               Lisp_Object tmp, tmp2;
10190               int dim2;
10191
10192               tmp = AREF (val, i);
10193               if (NILP (tmp))
10194                 tmp = XCAR (tail);
10195               else if (NUMBERP (tmp))
10196                 {
10197                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
10198                   if (dim < dim2)
10199                     tmp = list2 (XCAR (tail), tmp);
10200                   else
10201                     tmp = list2 (tmp, XCAR (tail));
10202                 }
10203               else
10204                 {
10205                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
10206                     {
10207                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
10208                       if (dim < dim2)
10209                         break;
10210                     }
10211                   if (NILP (tmp2))
10212                     tmp = nconc2 (tmp, list1 (XCAR (tail)));
10213                   else
10214                     {
10215                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
10216                       XSETCAR (tmp2, XCAR (tail));
10217                     }
10218                 }
10219               ASET (val, i, tmp);
10220             }
10221         }
10222       ASET (attrs, coding_attr_charset_valids, val);
10223       category = coding_category_charset;
10224     }
10225   else if (EQ (coding_type, Qccl))
10226     {
10227       Lisp_Object valids;
10228
10229       if (nargs < coding_arg_ccl_max)
10230         goto short_args;
10231
10232       val = args[coding_arg_ccl_decoder];
10233       CHECK_CCL_PROGRAM (val);
10234       if (VECTORP (val))
10235         val = Fcopy_sequence (val);
10236       ASET (attrs, coding_attr_ccl_decoder, val);
10237
10238       val = args[coding_arg_ccl_encoder];
10239       CHECK_CCL_PROGRAM (val);
10240       if (VECTORP (val))
10241         val = Fcopy_sequence (val);
10242       ASET (attrs, coding_attr_ccl_encoder, val);
10243
10244       val = args[coding_arg_ccl_valids];
10245       valids = Fmake_string (make_number (256), make_number (0));
10246       for (tail = val; CONSP (tail); tail = XCDR (tail))
10247         {
10248           int from, to;
10249
10250           val = XCAR (tail);
10251           if (INTEGERP (val))
10252             {
10253               if (! (0 <= XINT (val) && XINT (val) <= 255))
10254                 args_out_of_range_3 (val, make_number (0), make_number (255));
10255               from = to = XINT (val);
10256             }
10257           else
10258             {
10259               CHECK_CONS (val);
10260               CHECK_NATNUM_CAR (val);
10261               CHECK_NUMBER_CDR (val);
10262               if (XINT (XCAR (val)) > 255)
10263                 args_out_of_range_3 (XCAR (val),
10264                                      make_number (0), make_number (255));
10265               from = XINT (XCAR (val));
10266               if (! (from <= XINT (XCDR (val)) && XINT (XCDR (val)) <= 255))
10267                 args_out_of_range_3 (XCDR (val),
10268                                      XCAR (val), make_number (255));
10269               to = XINT (XCDR (val));
10270             }
10271           for (i = from; i <= to; i++)
10272             SSET (valids, i, 1);
10273         }
10274       ASET (attrs, coding_attr_ccl_valids, valids);
10275
10276       category = coding_category_ccl;
10277     }
10278   else if (EQ (coding_type, Qutf_16))
10279     {
10280       Lisp_Object bom, endian;
10281
10282       ASET (attrs, coding_attr_ascii_compat, Qnil);
10283
10284       if (nargs < coding_arg_utf16_max)
10285         goto short_args;
10286
10287       bom = args[coding_arg_utf16_bom];
10288       if (! NILP (bom) && ! EQ (bom, Qt))
10289         {
10290           CHECK_CONS (bom);
10291           val = XCAR (bom);
10292           CHECK_CODING_SYSTEM (val);
10293           val = XCDR (bom);
10294           CHECK_CODING_SYSTEM (val);
10295         }
10296       ASET (attrs, coding_attr_utf_bom, bom);
10297
10298       endian = args[coding_arg_utf16_endian];
10299       CHECK_SYMBOL (endian);
10300       if (NILP (endian))
10301         endian = Qbig;
10302       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
10303         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
10304       ASET (attrs, coding_attr_utf_16_endian, endian);
10305
10306       category = (CONSP (bom)
10307                   ? coding_category_utf_16_auto
10308                   : NILP (bom)
10309                   ? (EQ (endian, Qbig)
10310                      ? coding_category_utf_16_be_nosig
10311                      : coding_category_utf_16_le_nosig)
10312                   : (EQ (endian, Qbig)
10313                      ? coding_category_utf_16_be
10314                      : coding_category_utf_16_le));
10315     }
10316   else if (EQ (coding_type, Qiso_2022))
10317     {
10318       Lisp_Object initial, reg_usage, request, flags;
10319
10320       if (nargs < coding_arg_iso2022_max)
10321         goto short_args;
10322
10323       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
10324       CHECK_VECTOR (initial);
10325       for (i = 0; i < 4; i++)
10326         {
10327           val = AREF (initial, i);
10328           if (! NILP (val))
10329             {
10330               struct charset *charset;
10331
10332               CHECK_CHARSET_GET_CHARSET (val, charset);
10333               ASET (initial, i, make_number (CHARSET_ID (charset)));
10334               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
10335                 ASET (attrs, coding_attr_ascii_compat, Qt);
10336             }
10337           else
10338             ASET (initial, i, make_number (-1));
10339         }
10340
10341       reg_usage = args[coding_arg_iso2022_reg_usage];
10342       CHECK_CONS (reg_usage);
10343       CHECK_NUMBER_CAR (reg_usage);
10344       CHECK_NUMBER_CDR (reg_usage);
10345
10346       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
10347       for (tail = request; CONSP (tail); tail = XCDR (tail))
10348         {
10349           int id;
10350           Lisp_Object tmp1;
10351
10352           val = XCAR (tail);
10353           CHECK_CONS (val);
10354           tmp1 = XCAR (val);
10355           CHECK_CHARSET_GET_ID (tmp1, id);
10356           CHECK_NATNUM_CDR (val);
10357           if (XINT (XCDR (val)) >= 4)
10358             error ("Invalid graphic register number: %"pI"d", XINT (XCDR (val)));
10359           XSETCAR (val, make_number (id));
10360         }
10361
10362       flags = args[coding_arg_iso2022_flags];
10363       CHECK_NATNUM (flags);
10364       i = XINT (flags) & INT_MAX;
10365       if (EQ (args[coding_arg_charset_list], Qiso_2022))
10366         i |= CODING_ISO_FLAG_FULL_SUPPORT;
10367       flags = make_number (i);
10368
10369       ASET (attrs, coding_attr_iso_initial, initial);
10370       ASET (attrs, coding_attr_iso_usage, reg_usage);
10371       ASET (attrs, coding_attr_iso_request, request);
10372       ASET (attrs, coding_attr_iso_flags, flags);
10373       setup_iso_safe_charsets (attrs);
10374
10375       if (i & CODING_ISO_FLAG_SEVEN_BITS)
10376         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
10377                           | CODING_ISO_FLAG_SINGLE_SHIFT))
10378                     ? coding_category_iso_7_else
10379                     : EQ (args[coding_arg_charset_list], Qiso_2022)
10380                     ? coding_category_iso_7
10381                     : coding_category_iso_7_tight);
10382       else
10383         {
10384           int id = XINT (AREF (initial, 1));
10385
10386           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
10387                        || EQ (args[coding_arg_charset_list], Qiso_2022)
10388                        || id < 0)
10389                       ? coding_category_iso_8_else
10390                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10391                       ? coding_category_iso_8_1
10392                       : coding_category_iso_8_2);
10393         }
10394       if (category != coding_category_iso_8_1
10395           && category != coding_category_iso_8_2)
10396         ASET (attrs, coding_attr_ascii_compat, Qnil);
10397     }
10398   else if (EQ (coding_type, Qemacs_mule))
10399     {
10400       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10401         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10402       ASET (attrs, coding_attr_ascii_compat, Qt);
10403       category = coding_category_emacs_mule;
10404     }
10405   else if (EQ (coding_type, Qshift_jis))
10406     {
10407
10408       struct charset *charset;
10409
10410       if (XINT (Flength (charset_list)) != 3
10411           && XINT (Flength (charset_list)) != 4)
10412         error ("There should be three or four charsets");
10413
10414       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10415       if (CHARSET_DIMENSION (charset) != 1)
10416         error ("Dimension of charset %s is not one",
10417                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10418       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10419         ASET (attrs, coding_attr_ascii_compat, Qt);
10420
10421       charset_list = XCDR (charset_list);
10422       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10423       if (CHARSET_DIMENSION (charset) != 1)
10424         error ("Dimension of charset %s is not one",
10425                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10426
10427       charset_list = XCDR (charset_list);
10428       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10429       if (CHARSET_DIMENSION (charset) != 2)
10430         error ("Dimension of charset %s is not two",
10431                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10432
10433       charset_list = XCDR (charset_list);
10434       if (! NILP (charset_list))
10435         {
10436           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10437           if (CHARSET_DIMENSION (charset) != 2)
10438             error ("Dimension of charset %s is not two",
10439                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10440         }
10441
10442       category = coding_category_sjis;
10443       Vsjis_coding_system = name;
10444     }
10445   else if (EQ (coding_type, Qbig5))
10446     {
10447       struct charset *charset;
10448
10449       if (XINT (Flength (charset_list)) != 2)
10450         error ("There should be just two charsets");
10451
10452       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10453       if (CHARSET_DIMENSION (charset) != 1)
10454         error ("Dimension of charset %s is not one",
10455                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10456       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10457         ASET (attrs, coding_attr_ascii_compat, Qt);
10458
10459       charset_list = XCDR (charset_list);
10460       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10461       if (CHARSET_DIMENSION (charset) != 2)
10462         error ("Dimension of charset %s is not two",
10463                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10464
10465       category = coding_category_big5;
10466       Vbig5_coding_system = name;
10467     }
10468   else if (EQ (coding_type, Qraw_text))
10469     {
10470       category = coding_category_raw_text;
10471       ASET (attrs, coding_attr_ascii_compat, Qt);
10472     }
10473   else if (EQ (coding_type, Qutf_8))
10474     {
10475       Lisp_Object bom;
10476
10477       if (nargs < coding_arg_utf8_max)
10478         goto short_args;
10479
10480       bom = args[coding_arg_utf8_bom];
10481       if (! NILP (bom) && ! EQ (bom, Qt))
10482         {
10483           CHECK_CONS (bom);
10484           val = XCAR (bom);
10485           CHECK_CODING_SYSTEM (val);
10486           val = XCDR (bom);
10487           CHECK_CODING_SYSTEM (val);
10488         }
10489       ASET (attrs, coding_attr_utf_bom, bom);
10490       if (NILP (bom))
10491         ASET (attrs, coding_attr_ascii_compat, Qt);
10492
10493       category = (CONSP (bom) ? coding_category_utf_8_auto
10494                   : NILP (bom) ? coding_category_utf_8_nosig
10495                   : coding_category_utf_8_sig);
10496     }
10497   else if (EQ (coding_type, Qundecided))
10498     {
10499       if (nargs < coding_arg_undecided_max)
10500         goto short_args;
10501       ASET (attrs, coding_attr_undecided_inhibit_null_byte_detection,
10502             args[coding_arg_undecided_inhibit_null_byte_detection]);
10503       ASET (attrs, coding_attr_undecided_inhibit_iso_escape_detection,
10504             args[coding_arg_undecided_inhibit_iso_escape_detection]);
10505       ASET (attrs, coding_attr_undecided_prefer_utf_8,
10506             args[coding_arg_undecided_prefer_utf_8]);
10507       category = coding_category_undecided;
10508     }
10509   else
10510     error ("Invalid coding system type: %s",
10511            SDATA (SYMBOL_NAME (coding_type)));
10512
10513   ASET (attrs, coding_attr_category, make_number (category));
10514   ASET (attrs, coding_attr_plist,
10515         Fcons (QCcategory,
10516                Fcons (AREF (Vcoding_category_table, category),
10517                       CODING_ATTR_PLIST (attrs))));
10518   ASET (attrs, coding_attr_plist,
10519         Fcons (QCascii_compatible_p,
10520                Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10521                       CODING_ATTR_PLIST (attrs))));
10522
10523   eol_type = args[coding_arg_eol_type];
10524   if (! NILP (eol_type)
10525       && ! EQ (eol_type, Qunix)
10526       && ! EQ (eol_type, Qdos)
10527       && ! EQ (eol_type, Qmac))
10528     error ("Invalid eol-type");
10529
10530   aliases = list1 (name);
10531
10532   if (NILP (eol_type))
10533     {
10534       eol_type = make_subsidiaries (name);
10535       for (i = 0; i < 3; i++)
10536         {
10537           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10538
10539           this_name = AREF (eol_type, i);
10540           this_aliases = list1 (this_name);
10541           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10542           this_spec = make_uninit_vector (3);
10543           ASET (this_spec, 0, attrs);
10544           ASET (this_spec, 1, this_aliases);
10545           ASET (this_spec, 2, this_eol_type);
10546           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10547           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10548           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10549           if (NILP (val))
10550             Vcoding_system_alist
10551               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10552                        Vcoding_system_alist);
10553         }
10554     }
10555
10556   spec_vec = make_uninit_vector (3);
10557   ASET (spec_vec, 0, attrs);
10558   ASET (spec_vec, 1, aliases);
10559   ASET (spec_vec, 2, eol_type);
10560
10561   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10562   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10563   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10564   if (NILP (val))
10565     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10566                                   Vcoding_system_alist);
10567
10568   {
10569     int id = coding_categories[category].id;
10570
10571     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10572       setup_coding_system (name, &coding_categories[category]);
10573   }
10574
10575   return Qnil;
10576
10577  short_args:
10578   return Fsignal (Qwrong_number_of_arguments,
10579                   Fcons (intern ("define-coding-system-internal"),
10580                          make_number (nargs)));
10581 }
10582
10583
10584 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10585        3, 3, 0,
10586        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10587   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10588 {
10589   Lisp_Object spec, attrs;
10590
10591   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10592   attrs = AREF (spec, 0);
10593   if (EQ (prop, QCmnemonic))
10594     {
10595       if (! STRINGP (val))
10596         CHECK_CHARACTER (val);
10597       ASET (attrs, coding_attr_mnemonic, val);
10598     }
10599   else if (EQ (prop, QCdefault_char))
10600     {
10601       if (NILP (val))
10602         val = make_number (' ');
10603       else
10604         CHECK_CHARACTER (val);
10605       ASET (attrs, coding_attr_default_char, val);
10606     }
10607   else if (EQ (prop, QCdecode_translation_table))
10608     {
10609       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10610         CHECK_SYMBOL (val);
10611       ASET (attrs, coding_attr_decode_tbl, val);
10612     }
10613   else if (EQ (prop, QCencode_translation_table))
10614     {
10615       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10616         CHECK_SYMBOL (val);
10617       ASET (attrs, coding_attr_encode_tbl, val);
10618     }
10619   else if (EQ (prop, QCpost_read_conversion))
10620     {
10621       CHECK_SYMBOL (val);
10622       ASET (attrs, coding_attr_post_read, val);
10623     }
10624   else if (EQ (prop, QCpre_write_conversion))
10625     {
10626       CHECK_SYMBOL (val);
10627       ASET (attrs, coding_attr_pre_write, val);
10628     }
10629   else if (EQ (prop, QCascii_compatible_p))
10630     {
10631       ASET (attrs, coding_attr_ascii_compat, val);
10632     }
10633
10634   ASET (attrs, coding_attr_plist,
10635         Fplist_put (CODING_ATTR_PLIST (attrs), prop, val));
10636   return val;
10637 }
10638
10639
10640 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10641        Sdefine_coding_system_alias, 2, 2, 0,
10642        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10643   (Lisp_Object alias, Lisp_Object coding_system)
10644 {
10645   Lisp_Object spec, aliases, eol_type, val;
10646
10647   CHECK_SYMBOL (alias);
10648   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10649   aliases = AREF (spec, 1);
10650   /* ALIASES should be a list of length more than zero, and the first
10651      element is a base coding system.  Append ALIAS at the tail of the
10652      list.  */
10653   while (!NILP (XCDR (aliases)))
10654     aliases = XCDR (aliases);
10655   XSETCDR (aliases, list1 (alias));
10656
10657   eol_type = AREF (spec, 2);
10658   if (VECTORP (eol_type))
10659     {
10660       Lisp_Object subsidiaries;
10661       int i;
10662
10663       subsidiaries = make_subsidiaries (alias);
10664       for (i = 0; i < 3; i++)
10665         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10666                                      AREF (eol_type, i));
10667     }
10668
10669   Fputhash (alias, spec, Vcoding_system_hash_table);
10670   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10671   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10672   if (NILP (val))
10673     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10674                                   Vcoding_system_alist);
10675
10676   return Qnil;
10677 }
10678
10679 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10680        1, 1, 0,
10681        doc: /* Return the base of CODING-SYSTEM.
10682 Any alias or subsidiary coding system is not a base coding system.  */)
10683   (Lisp_Object coding_system)
10684 {
10685   Lisp_Object spec, attrs;
10686
10687   if (NILP (coding_system))
10688     return (Qno_conversion);
10689   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10690   attrs = AREF (spec, 0);
10691   return CODING_ATTR_BASE_NAME (attrs);
10692 }
10693
10694 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10695        1, 1, 0,
10696        doc: "Return the property list of CODING-SYSTEM.")
10697   (Lisp_Object coding_system)
10698 {
10699   Lisp_Object spec, attrs;
10700
10701   if (NILP (coding_system))
10702     coding_system = Qno_conversion;
10703   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10704   attrs = AREF (spec, 0);
10705   return CODING_ATTR_PLIST (attrs);
10706 }
10707
10708
10709 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10710        1, 1, 0,
10711        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10712   (Lisp_Object coding_system)
10713 {
10714   Lisp_Object spec;
10715
10716   if (NILP (coding_system))
10717     coding_system = Qno_conversion;
10718   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10719   return AREF (spec, 1);
10720 }
10721
10722 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10723        Scoding_system_eol_type, 1, 1, 0,
10724        doc: /* Return eol-type of CODING-SYSTEM.
10725 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10726
10727 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10728 and CR respectively.
10729
10730 A vector value indicates that a format of end-of-line should be
10731 detected automatically.  Nth element of the vector is the subsidiary
10732 coding system whose eol-type is N.  */)
10733   (Lisp_Object coding_system)
10734 {
10735   Lisp_Object spec, eol_type;
10736   int n;
10737
10738   if (NILP (coding_system))
10739     coding_system = Qno_conversion;
10740   if (! CODING_SYSTEM_P (coding_system))
10741     return Qnil;
10742   spec = CODING_SYSTEM_SPEC (coding_system);
10743   eol_type = AREF (spec, 2);
10744   if (VECTORP (eol_type))
10745     return Fcopy_sequence (eol_type);
10746   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10747   return make_number (n);
10748 }
10749
10750 #endif /* emacs */
10751
10752 \f
10753 /*** 12. Postamble ***/
10754
10755 void
10756 init_coding_once (void)
10757 {
10758   int i;
10759
10760   for (i = 0; i < coding_category_max; i++)
10761     {
10762       coding_categories[i].id = -1;
10763       coding_priorities[i] = i;
10764     }
10765
10766   /* ISO2022 specific initialize routine.  */
10767   for (i = 0; i < 0x20; i++)
10768     iso_code_class[i] = ISO_control_0;
10769   for (i = 0x21; i < 0x7F; i++)
10770     iso_code_class[i] = ISO_graphic_plane_0;
10771   for (i = 0x80; i < 0xA0; i++)
10772     iso_code_class[i] = ISO_control_1;
10773   for (i = 0xA1; i < 0xFF; i++)
10774     iso_code_class[i] = ISO_graphic_plane_1;
10775   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10776   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10777   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10778   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10779   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10780   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10781   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10782   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10783   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10784
10785   for (i = 0; i < 256; i++)
10786     {
10787       emacs_mule_bytes[i] = 1;
10788     }
10789   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10790   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10791   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10792   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10793 }
10794
10795 #ifdef emacs
10796
10797 void
10798 syms_of_coding (void)
10799 {
10800   staticpro (&Vcoding_system_hash_table);
10801   {
10802     Lisp_Object args[2];
10803     args[0] = QCtest;
10804     args[1] = Qeq;
10805     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10806   }
10807
10808   staticpro (&Vsjis_coding_system);
10809   Vsjis_coding_system = Qnil;
10810
10811   staticpro (&Vbig5_coding_system);
10812   Vbig5_coding_system = Qnil;
10813
10814   staticpro (&Vcode_conversion_reused_workbuf);
10815   Vcode_conversion_reused_workbuf = Qnil;
10816
10817   staticpro (&Vcode_conversion_workbuf_name);
10818   Vcode_conversion_workbuf_name = build_pure_c_string (" *code-conversion-work*");
10819
10820   reused_workbuf_in_use = 0;
10821
10822   DEFSYM (Qcharset, "charset");
10823   DEFSYM (Qtarget_idx, "target-idx");
10824   DEFSYM (Qcoding_system_history, "coding-system-history");
10825   Fset (Qcoding_system_history, Qnil);
10826
10827   /* Target FILENAME is the first argument.  */
10828   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10829   /* Target FILENAME is the third argument.  */
10830   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10831
10832   DEFSYM (Qcall_process, "call-process");
10833   /* Target PROGRAM is the first argument.  */
10834   Fput (Qcall_process, Qtarget_idx, make_number (0));
10835
10836   DEFSYM (Qcall_process_region, "call-process-region");
10837   /* Target PROGRAM is the third argument.  */
10838   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10839
10840   DEFSYM (Qstart_process, "start-process");
10841   /* Target PROGRAM is the third argument.  */
10842   Fput (Qstart_process, Qtarget_idx, make_number (2));
10843
10844   DEFSYM (Qopen_network_stream, "open-network-stream");
10845   /* Target SERVICE is the fourth argument.  */
10846   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10847
10848   DEFSYM (Qcoding_system, "coding-system");
10849   DEFSYM (Qcoding_aliases, "coding-aliases");
10850
10851   DEFSYM (Qeol_type, "eol-type");
10852   DEFSYM (Qunix, "unix");
10853   DEFSYM (Qdos, "dos");
10854   DEFSYM (Qmac, "mac");
10855
10856   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10857   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10858   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10859   DEFSYM (Qdefault_char, "default-char");
10860   DEFSYM (Qundecided, "undecided");
10861   DEFSYM (Qno_conversion, "no-conversion");
10862   DEFSYM (Qraw_text, "raw-text");
10863
10864   DEFSYM (Qiso_2022, "iso-2022");
10865
10866   DEFSYM (Qutf_8, "utf-8");
10867   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10868
10869 #if defined (WINDOWSNT) || defined (CYGWIN)
10870   /* No, not utf-16-le: that one has a BOM.  */
10871   DEFSYM (Qutf_16le, "utf-16le");
10872 #endif
10873
10874   DEFSYM (Qutf_16, "utf-16");
10875   DEFSYM (Qbig, "big");
10876   DEFSYM (Qlittle, "little");
10877
10878   DEFSYM (Qshift_jis, "shift-jis");
10879   DEFSYM (Qbig5, "big5");
10880
10881   DEFSYM (Qcoding_system_p, "coding-system-p");
10882
10883   DEFSYM (Qcoding_system_error, "coding-system-error");
10884   Fput (Qcoding_system_error, Qerror_conditions,
10885         listn (CONSTYPE_PURE, 2, Qcoding_system_error, Qerror));
10886   Fput (Qcoding_system_error, Qerror_message,
10887         build_pure_c_string ("Invalid coding system"));
10888
10889   DEFSYM (Qtranslation_table, "translation-table");
10890   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10891   DEFSYM (Qtranslation_table_id, "translation-table-id");
10892   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10893   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10894
10895   DEFSYM (Qvalid_codes, "valid-codes");
10896
10897   DEFSYM (Qemacs_mule, "emacs-mule");
10898
10899   DEFSYM (QCcategory, ":category");
10900   DEFSYM (QCmnemonic, ":mnemonic");
10901   DEFSYM (QCdefault_char, ":default-char");
10902   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10903   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10904   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10905   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10906   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10907
10908   Vcoding_category_table
10909     = Fmake_vector (make_number (coding_category_max), Qnil);
10910   staticpro (&Vcoding_category_table);
10911   /* Followings are target of code detection.  */
10912   ASET (Vcoding_category_table, coding_category_iso_7,
10913         intern_c_string ("coding-category-iso-7"));
10914   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10915         intern_c_string ("coding-category-iso-7-tight"));
10916   ASET (Vcoding_category_table, coding_category_iso_8_1,
10917         intern_c_string ("coding-category-iso-8-1"));
10918   ASET (Vcoding_category_table, coding_category_iso_8_2,
10919         intern_c_string ("coding-category-iso-8-2"));
10920   ASET (Vcoding_category_table, coding_category_iso_7_else,
10921         intern_c_string ("coding-category-iso-7-else"));
10922   ASET (Vcoding_category_table, coding_category_iso_8_else,
10923         intern_c_string ("coding-category-iso-8-else"));
10924   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10925         intern_c_string ("coding-category-utf-8-auto"));
10926   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10927         intern_c_string ("coding-category-utf-8"));
10928   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10929         intern_c_string ("coding-category-utf-8-sig"));
10930   ASET (Vcoding_category_table, coding_category_utf_16_be,
10931         intern_c_string ("coding-category-utf-16-be"));
10932   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10933         intern_c_string ("coding-category-utf-16-auto"));
10934   ASET (Vcoding_category_table, coding_category_utf_16_le,
10935         intern_c_string ("coding-category-utf-16-le"));
10936   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10937         intern_c_string ("coding-category-utf-16-be-nosig"));
10938   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10939         intern_c_string ("coding-category-utf-16-le-nosig"));
10940   ASET (Vcoding_category_table, coding_category_charset,
10941         intern_c_string ("coding-category-charset"));
10942   ASET (Vcoding_category_table, coding_category_sjis,
10943         intern_c_string ("coding-category-sjis"));
10944   ASET (Vcoding_category_table, coding_category_big5,
10945         intern_c_string ("coding-category-big5"));
10946   ASET (Vcoding_category_table, coding_category_ccl,
10947         intern_c_string ("coding-category-ccl"));
10948   ASET (Vcoding_category_table, coding_category_emacs_mule,
10949         intern_c_string ("coding-category-emacs-mule"));
10950   /* Followings are NOT target of code detection.  */
10951   ASET (Vcoding_category_table, coding_category_raw_text,
10952         intern_c_string ("coding-category-raw-text"));
10953   ASET (Vcoding_category_table, coding_category_undecided,
10954         intern_c_string ("coding-category-undecided"));
10955
10956   DEFSYM (Qinsufficient_source, "insufficient-source");
10957   DEFSYM (Qinvalid_source, "invalid-source");
10958   DEFSYM (Qinterrupted, "interrupted");
10959   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10960
10961   defsubr (&Scoding_system_p);
10962   defsubr (&Sread_coding_system);
10963   defsubr (&Sread_non_nil_coding_system);
10964   defsubr (&Scheck_coding_system);
10965   defsubr (&Sdetect_coding_region);
10966   defsubr (&Sdetect_coding_string);
10967   defsubr (&Sfind_coding_systems_region_internal);
10968   defsubr (&Sunencodable_char_position);
10969   defsubr (&Scheck_coding_systems_region);
10970   defsubr (&Sdecode_coding_region);
10971   defsubr (&Sencode_coding_region);
10972   defsubr (&Sdecode_coding_string);
10973   defsubr (&Sencode_coding_string);
10974   defsubr (&Sdecode_sjis_char);
10975   defsubr (&Sencode_sjis_char);
10976   defsubr (&Sdecode_big5_char);
10977   defsubr (&Sencode_big5_char);
10978   defsubr (&Sset_terminal_coding_system_internal);
10979   defsubr (&Sset_safe_terminal_coding_system_internal);
10980   defsubr (&Sterminal_coding_system);
10981   defsubr (&Sset_keyboard_coding_system_internal);
10982   defsubr (&Skeyboard_coding_system);
10983   defsubr (&Sfind_operation_coding_system);
10984   defsubr (&Sset_coding_system_priority);
10985   defsubr (&Sdefine_coding_system_internal);
10986   defsubr (&Sdefine_coding_system_alias);
10987   defsubr (&Scoding_system_put);
10988   defsubr (&Scoding_system_base);
10989   defsubr (&Scoding_system_plist);
10990   defsubr (&Scoding_system_aliases);
10991   defsubr (&Scoding_system_eol_type);
10992   defsubr (&Scoding_system_priority_list);
10993
10994   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10995                doc: /* List of coding systems.
10996
10997 Do not alter the value of this variable manually.  This variable should be
10998 updated by the functions `define-coding-system' and
10999 `define-coding-system-alias'.  */);
11000   Vcoding_system_list = Qnil;
11001
11002   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
11003                doc: /* Alist of coding system names.
11004 Each element is one element list of coding system name.
11005 This variable is given to `completing-read' as COLLECTION argument.
11006
11007 Do not alter the value of this variable manually.  This variable should be
11008 updated by the functions `make-coding-system' and
11009 `define-coding-system-alias'.  */);
11010   Vcoding_system_alist = Qnil;
11011
11012   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
11013                doc: /* List of coding-categories (symbols) ordered by priority.
11014
11015 On detecting a coding system, Emacs tries code detection algorithms
11016 associated with each coding-category one by one in this order.  When
11017 one algorithm agrees with a byte sequence of source text, the coding
11018 system bound to the corresponding coding-category is selected.
11019
11020 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
11021   {
11022     int i;
11023
11024     Vcoding_category_list = Qnil;
11025     for (i = coding_category_max - 1; i >= 0; i--)
11026       Vcoding_category_list
11027         = Fcons (AREF (Vcoding_category_table, i),
11028                  Vcoding_category_list);
11029   }
11030
11031   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
11032                doc: /* Specify the coding system for read operations.
11033 It is useful to bind this variable with `let', but do not set it globally.
11034 If the value is a coding system, it is used for decoding on read operation.
11035 If not, an appropriate element is used from one of the coding system alists.
11036 There are three such tables: `file-coding-system-alist',
11037 `process-coding-system-alist', and `network-coding-system-alist'.  */);
11038   Vcoding_system_for_read = Qnil;
11039
11040   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
11041                doc: /* Specify the coding system for write operations.
11042 Programs bind this variable with `let', but you should not set it globally.
11043 If the value is a coding system, it is used for encoding of output,
11044 when writing it to a file and when sending it to a file or subprocess.
11045
11046 If this does not specify a coding system, an appropriate element
11047 is used from one of the coding system alists.
11048 There are three such tables: `file-coding-system-alist',
11049 `process-coding-system-alist', and `network-coding-system-alist'.
11050 For output to files, if the above procedure does not specify a coding system,
11051 the value of `buffer-file-coding-system' is used.  */);
11052   Vcoding_system_for_write = Qnil;
11053
11054   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
11055                doc: /*
11056 Coding system used in the latest file or process I/O.  */);
11057   Vlast_coding_system_used = Qnil;
11058
11059   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
11060                doc: /*
11061 Error status of the last code conversion.
11062
11063 When an error was detected in the last code conversion, this variable
11064 is set to one of the following symbols.
11065   `insufficient-source'
11066   `inconsistent-eol'
11067   `invalid-source'
11068   `interrupted'
11069   `insufficient-memory'
11070 When no error was detected, the value doesn't change.  So, to check
11071 the error status of a code conversion by this variable, you must
11072 explicitly set this variable to nil before performing code
11073 conversion.  */);
11074   Vlast_code_conversion_error = Qnil;
11075
11076   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
11077                doc: /*
11078 *Non-nil means always inhibit code conversion of end-of-line format.
11079 See info node `Coding Systems' and info node `Text and Binary' concerning
11080 such conversion.  */);
11081   inhibit_eol_conversion = 0;
11082
11083   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
11084                doc: /*
11085 Non-nil means process buffer inherits coding system of process output.
11086 Bind it to t if the process output is to be treated as if it were a file
11087 read from some filesystem.  */);
11088   inherit_process_coding_system = 0;
11089
11090   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
11091                doc: /*
11092 Alist to decide a coding system to use for a file I/O operation.
11093 The format is ((PATTERN . VAL) ...),
11094 where PATTERN is a regular expression matching a file name,
11095 VAL is a coding system, a cons of coding systems, or a function symbol.
11096 If VAL is a coding system, it is used for both decoding and encoding
11097 the file contents.
11098 If VAL is a cons of coding systems, the car part is used for decoding,
11099 and the cdr part is used for encoding.
11100 If VAL is a function symbol, the function must return a coding system
11101 or a cons of coding systems which are used as above.  The function is
11102 called with an argument that is a list of the arguments with which
11103 `find-operation-coding-system' was called.  If the function can't decide
11104 a coding system, it can return `undecided' so that the normal
11105 code-detection is performed.
11106
11107 See also the function `find-operation-coding-system'
11108 and the variable `auto-coding-alist'.  */);
11109   Vfile_coding_system_alist = Qnil;
11110
11111   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
11112                doc: /*
11113 Alist to decide a coding system to use for a process I/O operation.
11114 The format is ((PATTERN . VAL) ...),
11115 where PATTERN is a regular expression matching a program name,
11116 VAL is a coding system, a cons of coding systems, or a function symbol.
11117 If VAL is a coding system, it is used for both decoding what received
11118 from the program and encoding what sent to the program.
11119 If VAL is a cons of coding systems, the car part is used for decoding,
11120 and the cdr part is used for encoding.
11121 If VAL is a function symbol, the function must return a coding system
11122 or a cons of coding systems which are used as above.
11123
11124 See also the function `find-operation-coding-system'.  */);
11125   Vprocess_coding_system_alist = Qnil;
11126
11127   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
11128                doc: /*
11129 Alist to decide a coding system to use for a network I/O operation.
11130 The format is ((PATTERN . VAL) ...),
11131 where PATTERN is a regular expression matching a network service name
11132 or is a port number to connect to,
11133 VAL is a coding system, a cons of coding systems, or a function symbol.
11134 If VAL is a coding system, it is used for both decoding what received
11135 from the network stream and encoding what sent to the network stream.
11136 If VAL is a cons of coding systems, the car part is used for decoding,
11137 and the cdr part is used for encoding.
11138 If VAL is a function symbol, the function must return a coding system
11139 or a cons of coding systems which are used as above.
11140
11141 See also the function `find-operation-coding-system'.  */);
11142   Vnetwork_coding_system_alist = Qnil;
11143
11144   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
11145                doc: /* Coding system to use with system messages.
11146 Also used for decoding keyboard input on X Window system.  */);
11147   Vlocale_coding_system = Qnil;
11148
11149   /* The eol mnemonics are reset in startup.el system-dependently.  */
11150   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
11151                doc: /*
11152 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
11153   eol_mnemonic_unix = build_pure_c_string (":");
11154
11155   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
11156                doc: /*
11157 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
11158   eol_mnemonic_dos = build_pure_c_string ("\\");
11159
11160   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
11161                doc: /*
11162 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
11163   eol_mnemonic_mac = build_pure_c_string ("/");
11164
11165   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
11166                doc: /*
11167 *String displayed in mode line when end-of-line format is not yet determined.  */);
11168   eol_mnemonic_undecided = build_pure_c_string (":");
11169
11170   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
11171                doc: /*
11172 *Non-nil enables character translation while encoding and decoding.  */);
11173   Venable_character_translation = Qt;
11174
11175   DEFVAR_LISP ("standard-translation-table-for-decode",
11176                Vstandard_translation_table_for_decode,
11177                doc: /* Table for translating characters while decoding.  */);
11178   Vstandard_translation_table_for_decode = Qnil;
11179
11180   DEFVAR_LISP ("standard-translation-table-for-encode",
11181                Vstandard_translation_table_for_encode,
11182                doc: /* Table for translating characters while encoding.  */);
11183   Vstandard_translation_table_for_encode = Qnil;
11184
11185   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
11186                doc: /* Alist of charsets vs revision numbers.
11187 While encoding, if a charset (car part of an element) is found,
11188 designate it with the escape sequence identifying revision (cdr part
11189 of the element).  */);
11190   Vcharset_revision_table = Qnil;
11191
11192   DEFVAR_LISP ("default-process-coding-system",
11193                Vdefault_process_coding_system,
11194                doc: /* Cons of coding systems used for process I/O by default.
11195 The car part is used for decoding a process output,
11196 the cdr part is used for encoding a text to be sent to a process.  */);
11197   Vdefault_process_coding_system = Qnil;
11198
11199   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
11200                doc: /*
11201 Table of extra Latin codes in the range 128..159 (inclusive).
11202 This is a vector of length 256.
11203 If Nth element is non-nil, the existence of code N in a file
11204 \(or output of subprocess) doesn't prevent it to be detected as
11205 a coding system of ISO 2022 variant which has a flag
11206 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
11207 or reading output of a subprocess.
11208 Only 128th through 159th elements have a meaning.  */);
11209   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
11210
11211   DEFVAR_LISP ("select-safe-coding-system-function",
11212                Vselect_safe_coding_system_function,
11213                doc: /*
11214 Function to call to select safe coding system for encoding a text.
11215
11216 If set, this function is called to force a user to select a proper
11217 coding system which can encode the text in the case that a default
11218 coding system used in each operation can't encode the text.  The
11219 function should take care that the buffer is not modified while
11220 the coding system is being selected.
11221
11222 The default value is `select-safe-coding-system' (which see).  */);
11223   Vselect_safe_coding_system_function = Qnil;
11224
11225   DEFVAR_BOOL ("coding-system-require-warning",
11226                coding_system_require_warning,
11227                doc: /* Internal use only.
11228 If non-nil, on writing a file, `select-safe-coding-system-function' is
11229 called even if `coding-system-for-write' is non-nil.  The command
11230 `universal-coding-system-argument' binds this variable to t temporarily.  */);
11231   coding_system_require_warning = 0;
11232
11233
11234   DEFVAR_BOOL ("inhibit-iso-escape-detection",
11235                inhibit_iso_escape_detection,
11236                doc: /*
11237 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
11238
11239 When Emacs reads text, it tries to detect how the text is encoded.
11240 This code detection is sensitive to escape sequences.  If Emacs sees
11241 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
11242 of the ISO2022 encodings, and decodes text by the corresponding coding
11243 system (e.g. `iso-2022-7bit').
11244
11245 However, there may be a case that you want to read escape sequences in
11246 a file as is.  In such a case, you can set this variable to non-nil.
11247 Then the code detection will ignore any escape sequences, and no text is
11248 detected as encoded in some ISO-2022 encoding.  The result is that all
11249 escape sequences become visible in a buffer.
11250
11251 The default value is nil, and it is strongly recommended not to change
11252 it.  That is because many Emacs Lisp source files that contain
11253 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
11254 in Emacs's distribution, and they won't be decoded correctly on
11255 reading if you suppress escape sequence detection.
11256
11257 The other way to read escape sequences in a file without decoding is
11258 to explicitly specify some coding system that doesn't use ISO-2022
11259 escape sequence (e.g., `latin-1') on reading by \\[universal-coding-system-argument].  */);
11260   inhibit_iso_escape_detection = 0;
11261
11262   DEFVAR_BOOL ("inhibit-null-byte-detection",
11263                inhibit_null_byte_detection,
11264                doc: /* If non-nil, Emacs ignores null bytes on code detection.
11265 By default, Emacs treats it as binary data, and does not attempt to
11266 decode it.  The effect is as if you specified `no-conversion' for
11267 reading that text.
11268
11269 Set this to non-nil when a regular text happens to include null bytes.
11270 Examples are Index nodes of Info files and null-byte delimited output
11271 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
11272 decode text as usual.  */);
11273   inhibit_null_byte_detection = 0;
11274
11275   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
11276                doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
11277 Internal use only.  Removed after the experimental optimizer gets stable. */);
11278   disable_ascii_optimization = 0;
11279
11280   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
11281                doc: /* Char table for translating self-inserting characters.
11282 This is applied to the result of input methods, not their input.
11283 See also `keyboard-translate-table'.
11284
11285 Use of this variable for character code unification was rendered
11286 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
11287 internal character representation.  */);
11288     Vtranslation_table_for_input = Qnil;
11289
11290   {
11291     Lisp_Object args[coding_arg_undecided_max];
11292     Lisp_Object plist[16];
11293     int i;
11294
11295     for (i = 0; i < coding_arg_undecided_max; i++)
11296       args[i] = Qnil;
11297
11298     plist[0] = intern_c_string (":name");
11299     plist[1] = args[coding_arg_name] = Qno_conversion;
11300     plist[2] = intern_c_string (":mnemonic");
11301     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
11302     plist[4] = intern_c_string (":coding-type");
11303     plist[5] = args[coding_arg_coding_type] = Qraw_text;
11304     plist[6] = intern_c_string (":ascii-compatible-p");
11305     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
11306     plist[8] = intern_c_string (":default-char");
11307     plist[9] = args[coding_arg_default_char] = make_number (0);
11308     plist[10] = intern_c_string (":for-unibyte");
11309     plist[11] = args[coding_arg_for_unibyte] = Qt;
11310     plist[12] = intern_c_string (":docstring");
11311     plist[13] = build_pure_c_string ("Do no conversion.\n\
11312 \n\
11313 When you visit a file with this coding, the file is read into a\n\
11314 unibyte buffer as is, thus each byte of a file is treated as a\n\
11315 character.");
11316     plist[14] = intern_c_string (":eol-type");
11317     plist[15] = args[coding_arg_eol_type] = Qunix;
11318     args[coding_arg_plist] = Flist (16, plist);
11319     Fdefine_coding_system_internal (coding_arg_max, args);
11320
11321     plist[1] = args[coding_arg_name] = Qundecided;
11322     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
11323     plist[5] = args[coding_arg_coding_type] = Qundecided;
11324     /* This is already set.
11325        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
11326     plist[8] = intern_c_string (":charset-list");
11327     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
11328     plist[11] = args[coding_arg_for_unibyte] = Qnil;
11329     plist[13] = build_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
11330     plist[15] = args[coding_arg_eol_type] = Qnil;
11331     args[coding_arg_plist] = Flist (16, plist);
11332     args[coding_arg_undecided_inhibit_null_byte_detection] = make_number (0);
11333     args[coding_arg_undecided_inhibit_iso_escape_detection] = make_number (0);
11334     Fdefine_coding_system_internal (coding_arg_undecided_max, args);
11335   }
11336
11337   setup_coding_system (Qno_conversion, &safe_terminal_coding);
11338
11339   {
11340     int i;
11341
11342     for (i = 0; i < coding_category_max; i++)
11343       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
11344   }
11345 #if defined (DOS_NT)
11346   system_eol_type = Qdos;
11347 #else
11348   system_eol_type = Qunix;
11349 #endif
11350   staticpro (&system_eol_type);
11351 }
11352
11353 char *
11354 emacs_strerror (int error_number)
11355 {
11356   char *str;
11357
11358   synchronize_system_messages_locale ();
11359   str = strerror (error_number);
11360
11361   if (! NILP (Vlocale_coding_system))
11362     {
11363       Lisp_Object dec = code_convert_string_norecord (build_string (str),
11364                                                       Vlocale_coding_system,
11365                                                       0);
11366       str = SSDATA (dec);
11367     }
11368
11369   return str;
11370 }
11371
11372 #endif /* emacs */